Compare commits

1 commit
main...traps

Author   SHA1        Message                                          Date
Lacerum  9c31a901b7  another attempt at robots, merged regex as well  2022-04-23 14:44:47 -07:00
2 changed files with 36 additions and 20 deletions

File 1 of 2

@@ -1,5 +1,6 @@
import re
from urllib import robotparser
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from collections import defaultdict
import requests
@@ -9,6 +10,7 @@ import requests
# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
# http://pymotw.com/2/robotparser/
# https://stackoverflow.com/questions/43085744/parsing-robots-txt-in-python
'''This is ver 1.0
robots_seen = dict() # all robots go here (global so we can store over all site)
def robots_ok(parsed)->bool:
    global robots_seen # global dict for files
@@ -32,4 +34,30 @@ def robots_are_ok(parsed):
        return robots_ok(parsed)
    else:
        return robots_seen[parsed.netloc] # if it has been read return its value
'''
# Ver 1.1 maybe if I am understanding this correctly
robots_seen = dict() # dict of all seen robots.txt parsers, keyed by the file's URL
def robots_ok(url)->bool:
    try:
        parsed = urlparse(url) # parse url
    except ValueError:
        print("Error in parse for: " + url)
        return True # cannot parse the url, so let it through
    try:
        robotstxt = parsed.scheme + "://" + parsed.hostname + "/robots.txt" # location of file
    except TypeError:
        print("Error building robots.txt location for: " + url)
        return True
    if robotstxt not in robots_seen: # if file not in dict, fetch it and cache the parser
        robots_seen[robotstxt] = robotparser.RobotFileParser(robotstxt)
        try:
            robots_seen[robotstxt].read() # read() fetches and parses the file in place
        except:
            del robots_seen[robotstxt] # drop it so we can retry this host later
            return True
    try:
        return robots_seen[robotstxt].can_fetch('*', url)
    except:
        print("There was an error with: " + url)
        return True
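
For reference, ver 1.1 wraps the standard urllib.robotparser flow; a minimal sketch of that stdlib pattern, assuming a hypothetical host URL and omitting error handling:

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("https://example.edu/robots.txt") # hypothetical host, for illustration only
rp.read() # fetches and parses robots.txt in place
print(rp.can_fetch('*', "https://example.edu/some/page")) # True if any agent may fetch it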

File 2 of 2

@@ -54,12 +54,6 @@ def extract_next_links(url, resp):
        if "do=" in href_link:
            continue
        # don't know if this is too expensive, otherwise idk
        # takes parsed url and if not ok on robots goes next, else we can write file
        parsed = urlparse(href_link)
        if not robots_are_ok(parsed):
            continue
        tempFile.write(href_link + "\n")
        # Adding to the boi wonder pages
        pages.append(href_link)
@@ -94,7 +88,7 @@ def is_valid(url):
            + r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
            + r"|epub|dll|cnf|tgz|sha1"
            + r"|thmx|mso|arff|rtf|jar|csv"
            + r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",parsed.path.lower()):
            + r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",url_parsed_path):
            return False
        elif not re.match(
            r".*ics.uci.edu/.*"
@@ -110,26 +104,20 @@ def is_valid(url):
        # we can adjust it based on what the crawler does as well
        if len(url) > 169:
            return False
        # this fixes any search box that keeps going page to page, currently allow a depth of 2 filters
        if re.match(r".*(&filter%.*){3,}",url_parsed_path):
        if not robots_ok(url): # if robots returns False then the url is a no-go
            return False
        if re.match(r".*(&filter%.*){3,}" # this fixes any search box that keeps going page to page, currently allow a depth of 2 filters
            + r"|^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$" # looping directory check
            + r"|^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$" # extra directories check (we can add as we find)
            + r"|^.*calendar.*$",url_parsed_path): # calendar checks plus adding or downloading calendar (ical)
            return False
        # this is for urls which when opened, download a file (do we want to download these files and tokenize them)
        # elif re.match(r"^.*\&format=(\D{3,4})\Z$",url_parsed_path):
        #     return False
        # another looping directory check but more advanced than the one contained in is_a_trap
        if re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",url_parsed_path):
            return False
        # extra directories check (we can add as we find)
        if re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", url_parsed_path):
            return False
        # calendar checks plus adding or downloading calendar (ical)
        if re.match(r"^.*calendar.*$",url_parsed_path):
            return False
        if parsed.query.find('ical') != -1:
            return False
        else:
            return True
    except TypeError:
        print ("TypeError for ", parsed)
        raise
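
The looping-directory alternation in the merged check can be exercised on its own; a small sketch with made-up paths, for illustration only:

import re

# looping-directory pattern lifted from the merged trap check above
trap = re.compile(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$")

print(bool(trap.match("/a/b/a/b/index.html"))) # True: the segment /a/ repeats
print(bool(trap.match("/about/people/index"))) # False: no repeated segment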