From 9c31a901b7828ea1930ee1ec708d42f7b8d33b42 Mon Sep 17 00:00:00 2001
From: Lacerum
Date: Sat, 23 Apr 2022 14:44:47 -0700
Subject: [PATCH] another attempt at robots, merged regex as well

---
 spacetime-crawler4py-master/robotsokay.py | 30 ++++++++++++++++++++++-
 spacetime-crawler4py-master/scraper.py    | 26 ++++++--------------
 2 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/spacetime-crawler4py-master/robotsokay.py b/spacetime-crawler4py-master/robotsokay.py
index 7ead0f4..80db778 100644
--- a/spacetime-crawler4py-master/robotsokay.py
+++ b/spacetime-crawler4py-master/robotsokay.py
@@ -1,5 +1,6 @@
 import re
 from urllib import robotparser
+from urllib.parse import urlparse
 from bs4 import BeautifulSoup
 from collections import defaultdict
 import requests
@@ -9,6 +10,7 @@ import requests
 # https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
 # http://pymotw.com/2/robotparser/
 # https://stackoverflow.com/questions/43085744/parsing-robots-txt-in-python
+'''This is ver 1.0
 robots_seen = dict() # all robots go here (global so we can store over all site)
 def robots_ok(parsed)->bool:
     global robots_seen # global dict for files
@@ -32,4 +34,30 @@ def robots_are_ok(parsed):
         return robots_ok(parsed)
     else:
         return robots_seen[parsed.netloc] # if it has been read return its value
-    
\ No newline at end of file
+'''
+# Ver 1.1 maybe if I am understanding this correctly
+robots_seen = dict() # cache of RobotFileParser objects, keyed by robots.txt url
+def robots_ok(url)->bool:
+    try:
+        parsed = urlparse(url) # parse url
+    except:
+        print("Error in parse for: " + url)
+
+    robotstxt = "" # string for location of file
+    try:
+        robotstxt = parsed.scheme + "://" + parsed.hostname + "/robots.txt" # location of file
+    except:
+        print("Error in parse for robots.txt: " + url)
+
+    if robotstxt not in robots_seen: # if robots.txt url not in dict, fetch and cache it
+        robots_seen[robotstxt] = robotparser.RobotFileParser(robotstxt)
+        try:
+            robots_seen[robotstxt].read() # download and parse the robots.txt file
+        except:
+            del robots_seen[robotstxt]
+            return True
+    try:
+        return robots_seen[robotstxt].can_fetch('*', url)
+    except:
+        print("There was an error with: " + url)
+        return True
diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py
index e81319b..fd18e4c 100644
--- a/spacetime-crawler4py-master/scraper.py
+++ b/spacetime-crawler4py-master/scraper.py
@@ -54,12 +54,6 @@ def extract_next_links(url, resp):
             if "do=" in href_link:
                 continue
 
-            # don't know if this is too expensive, otherwise idk
-            # takes parsed url and if not ok on robots goes next, else we can write file
-            parsed = urlparse(href_link)
-            if not robots_are_ok(parsed):
-                continue
-
             tempFile.write(href_link + "\n")
             #Adding to the boi wonder pages
             pages.append(href_link)
@@ -94,7 +88,7 @@ def is_valid(url):
             + r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
             + r"|epub|dll|cnf|tgz|sha1"
             + r"|thmx|mso|arff|rtf|jar|csv"
-            + r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",parsed.path.lower()):
+            + r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",url_parsed_path):
             return False
         elif not re.match(
             r".*ics.uci.edu/.*"
@@ -110,26 +104,20 @@ def is_valid(url):
         # we can adjust it based on what the cralwer does as well
         if len(url) > 169:
             return False
-        # this fixes any search box that keeps going page to page, currenty allow a depth of 2 filters
-        if re.match(r".*(&filter%.*){3,}",url_parsed_path):
+        if not robots_ok(url): # if robots.txt disallows this url then no go
+            return False
+        if re.match(r".*(&filter%.*){3,}" # this fixes any search box that keeps going page to page, currently allows a depth of 2 filters
+            + r"|^.*?(/.+?/).*?\2.*$|^.*?/(.+?/)\3.*$" # looping directory check (backreferences renumbered for the merged pattern)
+            + r"|^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$" # extra directories check (we can add as we find)
+            + r"|^.*calendar.*$",url_parsed_path): # calendar checks plus adding or downloading calendar (ical)
             return False
         # this is for urls which when opened, download a file (do we want to download these files and tokenize them)
         # elif re.match(r"^.*\&format=(\D{3,4})\Z$",url_parsed_path):
         #     return False
-        # another looping directory check but more advanced than the one contained in is_a_trap
-        if re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",url_parsed_path):
-            return False
-        # extra directories check (we can add as we find)
-        if re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", url_parsed_path):
-            return False
-        # calendar checks plus adding or downloading calendar (ical)
-        if re.match(r"^.*calendar.*$",url_parsed_path):
-            return False
         if parsed.query.find('ical') != -1:
             return False
         else:
             return True
-
     except TypeError:
         print ("TypeError for ", parsed)
         raise
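
Note, not part of the patch: a minimal standalone sketch of the robots.txt caching
pattern that the new robots_ok() follows, included for reference while reviewing.
The names can_crawl and robots_cache and the example call are illustrative
assumptions, not code from this repository.

    from urllib import robotparser
    from urllib.parse import urlparse

    robots_cache = {}  # robots.txt url -> RobotFileParser, one fetch per host

    def can_crawl(url, user_agent="*"):
        # build the robots.txt location from the page url
        parsed = urlparse(url)
        robots_url = parsed.scheme + "://" + parsed.netloc + "/robots.txt"
        if robots_url not in robots_cache:
            parser = robotparser.RobotFileParser(robots_url)
            try:
                parser.read()  # download and parse robots.txt once
            except Exception:
                return True    # unreachable robots.txt: default to allowing the url
            robots_cache[robots_url] = parser
        return robots_cache[robots_url].can_fetch(user_agent, url)

    print(can_crawl("https://www.ics.uci.edu/"))  # domain taken from the crawler's allow list

Caching one parser per host keeps the crawler from re-downloading robots.txt for
every candidate link, which is the point of the robots_seen dict in this commit.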