another attempt at robots, merged regex as well

Lacerum 2022-04-23 14:44:47 -07:00
parent 809b3dc820
commit 9c31a901b7
2 changed files with 36 additions and 20 deletions

View File

@@ -1,5 +1,6 @@
 import re
 from urllib import robotparser
+from urllib.parse import urlparse
 from bs4 import BeautifulSoup
 from collections import defaultdict
 import requests
@@ -9,6 +10,7 @@ import requests
 # https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
 # http://pymotw.com/2/robotparser/
 # https://stackoverflow.com/questions/43085744/parsing-robots-txt-in-python
+'''This is ver 1.0
 robots_seen = dict() # all robots go here (global so we can store over all site)
 def robots_ok(parsed)->bool:
     global robots_seen # global dict for files
@@ -32,4 +34,30 @@ def robots_are_ok(parsed):
         return robots_ok(parsed)
     else:
         return robots_seen[parsed.netloc] # if it has been read return its value
+'''
+# Ver 1.1 maybe if I am understanding this correctly
+robots_seen = dict() # dict of all seen robots.txt files and their parsers
+def robots_ok(url)->bool:
+    try:
+        parsed = urlparse(url) # parse url
+        robotstxt = parsed.scheme + "://" + parsed.hostname + "/robots.txt" # location of robots.txt file
+    except:
+        print("Error in parse for: " + url)
+        return True # cannot build the robots.txt url, so do not block the fetch
+    if robotstxt not in robots_seen: # if robots.txt not in dict, add it and read it once
+        robots_seen[robotstxt] = robotparser.RobotFileParser(robotstxt)
+        try:
+            robots_seen[robotstxt].read()
+        except:
+            del robots_seen[robotstxt]
+            return True
+    try:
+        return robots_seen[robotstxt].can_fetch('*', url)
+    except:
+        print("There was an error with: " + url)
+        return True
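For reference, here is a minimal standalone sketch of the caching pattern the new robots_ok follows: one RobotFileParser per robots.txt URL, read once, then can_fetch('*', url) decides whether a URL may be crawled. The function name can_fetch_url and the example URL are hypothetical and not part of the repo:

from urllib import robotparser
from urllib.parse import urlparse

_parsers = dict()  # cache: robots.txt URL -> RobotFileParser, read once per host

def can_fetch_url(url, agent="*"):
    parsed = urlparse(url)
    robots_url = parsed.scheme + "://" + parsed.netloc + "/robots.txt"
    if robots_url not in _parsers:
        rp = robotparser.RobotFileParser(robots_url)
        try:
            rp.read()       # fetch and parse robots.txt over the network
        except Exception:
            return True     # robots.txt unreachable: default to allowing the fetch
        _parsers[robots_url] = rp
    return _parsers[robots_url].can_fetch(agent, url)

# hypothetical call, only to show the shape of the API
print(can_fetch_url("https://www.ics.uci.edu/about/index.php"))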

View File

@@ -54,12 +54,6 @@ def extract_next_links(url, resp):
             if "do=" in href_link:
                 continue
-            # don't know if this is too expensive, otherwise idk
-            # takes parsed url and if not ok on robots goes next, else we can write file
-            parsed = urlparse(href_link)
-            if not robots_are_ok(parsed):
-                continue
             tempFile.write(href_link + "\n")
             #Adding to the boi wonder pages
             pages.append(href_link)
@@ -94,7 +88,7 @@ def is_valid(url):
            + r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
            + r"|epub|dll|cnf|tgz|sha1"
            + r"|thmx|mso|arff|rtf|jar|csv"
-           + r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",parsed.path.lower()):
+           + r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",url_parsed_path):
             return False
         elif not re.match(
             r".*ics.uci.edu/.*"
@@ -110,26 +104,20 @@ def is_valid(url):
         # we can adjust it based on what the crawler does as well
         if len(url) > 169:
             return False
-        # this fixes any search box that keeps going page to page, currenty allow a depth of 2 filters
-        if re.match(r".*(&filter%.*){3,}",url_parsed_path):
+        if not robots_ok(url): # if robots returns false then no go
+            return False
+        if re.match(r".*(?:&filter%.*){3,}" # search box filter trap, currently allows a depth of 2 filters; group is non-capturing so \1/\2 below stay correct
+                    + r"|^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$" # looping directory check
+                    + r"|^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$" # extra directories check (we can add as we find)
+                    + r"|^.*calendar.*$",url_parsed_path): # calendar checks plus adding or downloading calendar (ical)
             return False
         # this is for urls which when opened, download a file (do we want to download these files and tokenize them)
         # elif re.match(r"^.*\&format=(\D{3,4})\Z$",url_parsed_path):
         #     return False
-        # another looping directory check but more advanced than the one contained in is_a_trap
-        if re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",url_parsed_path):
-            return False
-        # extra directories check (we can add as we find)
-        if re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", url_parsed_path):
-            return False
-        # calendar checks plus adding or downloading calendar (ical)
-        if re.match(r"^.*calendar.*$",url_parsed_path):
-            return False
         if parsed.query.find('ical') != -1:
             return False
         else:
             return True
     except TypeError:
         print ("TypeError for ", parsed)
         raise
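As a quick sanity check on the merged trap regex, here is a small sketch that compiles the same alternation on its own (with the filter group non-capturing so the \1/\2 backreferences still point at the looping-directory groups) and runs it over a few made-up strings. The sample inputs are purely illustrative and not taken from the crawler's data:

import re

# same alternation as the merged filter in is_valid, compiled standalone;
# the filter group is non-capturing so \1 and \2 refer to the directory groups
TRAP_RE = re.compile(
    r".*(?:&filter%.*){3,}"                                    # stacked search-box filters
    r"|^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$"                   # looping / repeated directories
    r"|^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$"
    r"|^.*calendar.*$"                                         # calendar and ical pages
)

# made-up sample strings, only to show which branch fires
print(bool(TRAP_RE.match("/doku.php?id=start&filter%5B=a&filter%5B=b&filter%5B=c")))  # True (filters)
print(bool(TRAP_RE.match("/a/b/a/b/index.html")))                                     # True (repeated directory)
print(bool(TRAP_RE.match("/community/events/calendar/2022-04")))                      # True (calendar)
print(bool(TRAP_RE.match("/~user/research/papers.html")))                             # False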