import re
from urllib import robotparser
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from collections import defaultdict
import requests

# Tests whether a url is ok to crawl by checking it against the site's robots.txt file.
# Returns True if the page is allowed to be crawled, True if the robots.txt file cannot
# be read, and False otherwise.
# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
# http://pymotw.com/2/robotparser/
# https://stackoverflow.com/questions/43085744/parsing-robots-txt-in-python

'''This is ver 1.0
robots_seen = dict() # all robots go here (global so we can store over all sites)

def robots_ok(parsed) -> bool:
    global robots_seen # global dict for files
    robots_seen[parsed.netloc] = False # default seen
    try:
        url = 'http://' + parsed.netloc + '/robots.txt' # build robots.txt url
        sitemap = requests.get(url) # fetch robots.txt
        if sitemap.status_code != 200: # no file so let her rip
            return True
        eva = robotparser.RobotFileParser(url)
        eva.read()
        if eva.can_fetch('*', url): # if eva can see url add to dict
            robots_seen[parsed.netloc] = True
        return robots_seen[parsed.netloc] # the dict
    except:
        return False # default

# check if the site is in the dict; if not, run it into the dict
def robots_are_ok(parsed):
    global robots_seen
    if parsed.netloc not in robots_seen: # if not in dict, check the site
        return robots_ok(parsed)
    else:
        return robots_seen[parsed.netloc] # if it has been read, return its value
'''

# Ver 1.1: cache one RobotFileParser per robots.txt location so each file is
# only fetched and parsed once.
robots_seen = dict()  # parsed robots.txt files, keyed by robots.txt url

def robots_ok(url) -> bool:
    try:
        parsed = urlparse(url)  # split url into scheme/host/path parts
    except ValueError:
        print("Error in parse for: " + url)
        return True
    try:
        robotstxt = parsed.scheme + "://" + parsed.hostname + "/robots.txt"  # location of the robots.txt file
    except TypeError:  # hostname is None for relative or malformed urls
        print("Error in parse for robots.txt: " + url)
        return True
    if robotstxt not in robots_seen:  # first visit to this host: fetch and parse its robots.txt
        parser = robotparser.RobotFileParser(robotstxt)
        try:
            parser.read()
            robots_seen[robotstxt] = parser
        except Exception:
            return True  # robots.txt could not be read, so allow the crawl
    try:
        return robots_seen[robotstxt].can_fetch('*', url)
    except Exception:
        print("There was an error with: " + url)
        return True
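
# A minimal usage sketch, assuming this module is normally driven by a crawler loop
# elsewhere; the URL below is a hypothetical placeholder, not taken from the original code.
if __name__ == "__main__":
    test_url = "http://example.com/some/page.html"  # hypothetical example url
    if robots_ok(test_url):  # fetches (and caches) example.com's robots.txt on first call
        print("Allowed to crawl: " + test_url)
    else:
        print("Disallowed by robots.txt: " + test_url)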