From 9c31a901b7828ea1930ee1ec708d42f7b8d33b42 Mon Sep 17 00:00:00 2001
From: Lacerum
Date: Sat, 23 Apr 2022 14:44:47 -0700
Subject: [PATCH] another attempt at robots, merged regex as well

---
 spacetime-crawler4py-master/robotsokay.py | 30 ++++++++++++++++++++++-
 spacetime-crawler4py-master/scraper.py    | 26 ++++++--------------
 2 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/spacetime-crawler4py-master/robotsokay.py b/spacetime-crawler4py-master/robotsokay.py
index 7ead0f4..80db778 100644
--- a/spacetime-crawler4py-master/robotsokay.py
+++ b/spacetime-crawler4py-master/robotsokay.py
@@ -1,5 +1,6 @@
 import re
 from urllib import robotparser
+from urllib.parse import urlparse
 from bs4 import BeautifulSoup
 from collections import defaultdict
 import requests
@@ -9,6 +10,7 @@ import requests
 # https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
 # http://pymotw.com/2/robotparser/
 # https://stackoverflow.com/questions/43085744/parsing-robots-txt-in-python
+'''This is ver 1.0
 robots_seen = dict() # all robots go here (global so we can store over all site)
 def robots_ok(parsed)->bool:
     global robots_seen # global dict for files
@@ -32,4 +34,30 @@ def robots_are_ok(parsed):
         return robots_ok(parsed)
     else:
         return robots_seen[parsed.netloc] # if it has been read return its value
-    
\ No newline at end of file
+'''
+# Ver 1.1 maybe if I am understanding this correctly
+robots_seen = dict() # cache of RobotFileParser objects, keyed by robots.txt url
+def robots_ok(url)->bool:
+    try:
+        parsed = urlparse(url) # parse url
+    except:
+        print("Error in parse for: " + url)
+
+    robotstxt = "" # string for location of file
+    try:
+        robotstxt = parsed.scheme + "://" + parsed.hostname + "/robots.txt" # location of file
+    except:
+        print("Error in parse for robots.txt: " + url)
+
+    if robotstxt not in robots_seen: # if robots.txt url not in dict, fetch and cache it
+        robots_seen[robotstxt] = robotparser.RobotFileParser(robotstxt)
+        try:
+            robots_seen[robotstxt].read() # download and parse the robots.txt file
+        except:
+            del robots_seen[robotstxt]
+            return True
+    try:
+        return robots_seen[robotstxt].can_fetch('*', url)
+    except:
+        print("There was an error with: " + url)
+        return True
diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py
index e81319b..fd18e4c 100644
--- a/spacetime-crawler4py-master/scraper.py
+++ b/spacetime-crawler4py-master/scraper.py
@@ -54,12 +54,6 @@ def extract_next_links(url, resp):
             if "do=" in href_link:
                 continue
 
-            # don't know if this is too expensive, otherwise idk
-            # takes parsed url and if not ok on robots goes next, else we can write file
-            parsed = urlparse(href_link)
-            if not robots_are_ok(parsed):
-                continue
-
             tempFile.write(href_link + "\n")
             #Adding to the boi wonder pages
             pages.append(href_link)
@@ -94,7 +88,7 @@ def is_valid(url):
             + r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
             + r"|epub|dll|cnf|tgz|sha1"
             + r"|thmx|mso|arff|rtf|jar|csv"
-            + r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",parsed.path.lower()):
+            + r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",url_parsed_path):
             return False
         elif not re.match(
             r".*ics.uci.edu/.*"
@@ -110,26 +104,20 @@ def is_valid(url):
         # we can adjust it based on what the cralwer does as well
         if len(url) > 169:
             return False
-        # this fixes any search box that keeps going page to page, currenty allow a depth of 2 filters
-        if re.match(r".*(&filter%.*){3,}",url_parsed_path):
+        if not robots_ok(url): # if robots.txt disallows this url then no go
+            return False
+        if re.match(r".*(&filter%.*){3,}" # this fixes any search box that keeps going page to page, currently allows a depth of 2 filters
+            + r"|^.*?(/.+?/).*?\2.*$|^.*?/(.+?/)\3.*$" # looping directory check (backreferences renumbered for the merged pattern)
+            + r"|^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$" # extra directories check (we can add as we find)
+            + r"|^.*calendar.*$",url_parsed_path): # calendar checks plus adding or downloading calendar (ical)
             return False
         # this is for urls which when opened, download a file (do we want to download these files and tokenize them)
         # elif re.match(r"^.*\&format=(\D{3,4})\Z$",url_parsed_path):
         #     return False
-        # another looping directory check but more advanced than the one contained in is_a_trap
-        if re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",url_parsed_path):
-            return False
-        # extra directories check (we can add as we find)
-        if re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", url_parsed_path):
-            return False
-        # calendar checks plus adding or downloading calendar (ical)
-        if re.match(r"^.*calendar.*$",url_parsed_path):
-            return False
         if parsed.query.find('ical') != -1:
             return False
         else:
             return True
-
     except TypeError:
         print ("TypeError for ", parsed)
         raise
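
Note, not part of the patch: a minimal standalone sketch of the robots.txt caching
pattern that the new robots_ok() follows, included for reference while reviewing.
The names can_crawl and robots_cache and the example call are illustrative
assumptions, not code from this repository.

    from urllib import robotparser
    from urllib.parse import urlparse

    robots_cache = {}  # robots.txt url -> RobotFileParser, one fetch per host

    def can_crawl(url, user_agent="*"):
        # build the robots.txt location from the page url
        parsed = urlparse(url)
        robots_url = parsed.scheme + "://" + parsed.netloc + "/robots.txt"
        if robots_url not in robots_cache:
            parser = robotparser.RobotFileParser(robots_url)
            try:
                parser.read()  # download and parse robots.txt once
            except Exception:
                return True    # unreachable robots.txt: default to allowing the url
            robots_cache[robots_url] = parser
        return robots_cache[robots_url].can_fetch(user_agent, url)

    print(can_crawl("https://www.ics.uci.edu/"))  # domain taken from the crawler's allow list

Caching one parser per host keeps the crawler from re-downloading robots.txt for
every candidate link, which is the point of the robots_seen dict in this commit.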