64 lines
2.8 KiB
Python
64 lines
2.8 KiB
Python
import re
|
|
from urllib import robotparser
|
|
from urllib.parse import urlparse
|
|
from bs4 import BeautifulSoup
|
|
from collections import defaultdict
|
|
import requests
|
|
|
|
# Tests to see if the url is ok to be crawled by checking against the robots.txt
|
|
# file. Returns True if the page is allowed to be crawled or if there is no robots.txt file, and False otherwise.
|
|
# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
|
|
# http://pymotw.com/2/robotparser/
|
|
# https://stackoverflow.com/questions/43085744/parsing-robots-txt-in-python
|
|
'''This is ver 1.0
|
|
robots_seen = dict() # all robots go here (global so we can store over all site)
|
|
def robots_ok(parsed)->bool:
|
|
global robots_seen # global dict for files
|
|
robots_seen[parsed.netloc] = False # default seen
|
|
try:
|
|
url = 'http://' + parsed.netloc + '/robots.txt' # filter url and set
|
|
sitemap = requests.get(url) # sitmap get
|
|
if sitemap.status_code != 200: # no file so let her rip
|
|
return True
|
|
eva = robotparser.RobotFileParser(url)
|
|
eva.read()
|
|
if eva.can_fetch('*', url): # if eva can see url add to dict
|
|
robots_seen[parsed.netloc] = True
|
|
return robots_seen[parsed.netloc] # the dict
|
|
except:
|
|
return False # default
|
|
# check if the site is in the dict if not run it into the dict
|
|
def robots_are_ok(parsed):
|
|
global robots_seen
|
|
if parsed.netloc not in robots_seen: # if not in dict run check site
|
|
return robots_ok(parsed)
|
|
else:
|
|
return robots_seen[parsed.netloc] # if it has been read return its value
|
|
'''
|
|
# Ver 1.1 maybe if I am understanding this correctly
|
|
# Cache of parsed robots.txt files, keyed by the robots.txt URL of each site
# ("<scheme>://<hostname>/robots.txt") so every site is fetched at most once.
robots_seen = dict()


def robots_ok(url: str) -> bool:
    """Return whether ``url`` may be crawled according to its site's robots.txt.

    The robots.txt file of the URL's host is downloaded and parsed on first
    use and the resulting parser is cached in the module-level ``robots_seen``
    dict; later calls for the same site reuse the cached parser.

    The check is fail-open: if the URL cannot be parsed, has no scheme or
    host, or the robots.txt file cannot be retrieved, the function returns
    True. A failed download is not cached, so the next call retries it.

    Args:
        url: Absolute URL of the page about to be crawled.

    Returns:
        True if user agent ``*`` is allowed to fetch ``url`` (or if no
        decision could be made), False if robots.txt disallows it.
    """
    try:
        parsed = urlparse(url)
        hostname = parsed.hostname
    except (ValueError, TypeError, AttributeError):
        # urlparse rejects e.g. malformed IPv6 literals and non-string input.
        print("Error in parse for: " + str(url))
        return True

    if not parsed.scheme or not hostname:
        # Without a scheme and a host there is no robots.txt location to ask.
        print("Error in parse for robots.txt: " + str(url))
        return True

    robotstxt = parsed.scheme + "://" + hostname + "/robots.txt"  # location of file

    parser = robots_seen.get(robotstxt)
    if parser is None:
        parser = robotparser.RobotFileParser(robotstxt)
        try:
            # read() fetches and parses the file in place and returns None,
            # so the parser object itself is what must be kept in the cache.
            parser.read()
        except Exception:
            # Best-effort network boundary (DNS failure, timeout, bad
            # encoding, ...): allow the crawl and retry the download later.
            return True
        robots_seen[robotstxt] = parser

    try:
        return parser.can_fetch('*', url)
    except Exception:
        print("There was an error with: " + str(url))
        return True
|