another attempt at robots, merged regex as well
This commit is contained in:
parent 809b3dc820
commit 9c31a901b7
@@ -1,5 +1,6 @@
 import re
 from urllib import robotparser
+from urllib.parse import urlparse
 from bs4 import BeautifulSoup
 from collections import defaultdict
 import requests
@@ -9,6 +10,7 @@ import requests
 # https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
 # http://pymotw.com/2/robotparser/
 # https://stackoverflow.com/questions/43085744/parsing-robots-txt-in-python
+'''This is ver 1.0
 robots_seen = dict() # all robots go here (global so we can store over all site)
 def robots_ok(parsed)->bool:
     global robots_seen # global dict for files
@@ -32,4 +34,30 @@ def robots_are_ok(parsed):
         return robots_ok(parsed)
     else:
         return robots_seen[parsed.netloc] # if it has been read return its value
+'''
+# Ver 1.1 maybe if I am understanding this correctly
+robots_seen = dict() # dict of all seen robot files and store not allowed
+def robots_ok(url)->bool:
+    try:
+        parsed = urlparse(url) # parse url
+    except:
+        print("Error in parse for: " + url)
+
+    robotstxt = "" # string for location of file
+    try:
+        robotstxt = parsed.scheme + "://" + parsed.hostname + "/robots.txt" # location of file
+    except:
+        print("Error in parse for robots.txt: " + str(parsed))
+
+    if robotstxt not in robots_seen: # if url not in dict add to dict
+        robots_seen[robotstxt] = robotparser.RobotFileParser(robotstxt)
+        try:
+            robots_seen[robotstxt].read() # read() is an instance method and takes no URL argument
+        except:
+            del robots_seen[robotstxt] # unreadable robots.txt: forget it and allow the crawl
+            return True
+    try:
+        return robots_seen[robotstxt].can_fetch('*', url)
+    except:
+        print("There was an error with: " + url)
+        return True
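Note on the hunk above: a minimal sketch of the documented urllib.robotparser flow it re-implements. The cache name _parsers and the function name check_robots are illustrative, not from this commit; set_url(), read(), and can_fetch() are the library's actual API.

    from urllib import robotparser
    from urllib.parse import urlparse

    _parsers = {}  # cache: robots.txt URL -> RobotFileParser, one fetch per host

    def check_robots(url, agent="*"):
        parsed = urlparse(url)
        robots_url = parsed.scheme + "://" + parsed.netloc + "/robots.txt"
        if robots_url not in _parsers:
            rp = robotparser.RobotFileParser()
            rp.set_url(robots_url)  # point the parser at the file
            try:
                rp.read()           # fetch and parse robots.txt
            except OSError:
                return True         # unreachable robots.txt: crawl optimistically
            _parsers[robots_url] = rp
        return _parsers[robots_url].can_fetch(agent, url)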
@@ -54,12 +54,6 @@ def extract_next_links(url, resp):
         if "do=" in href_link:
             continue
-
-        # don't know if this is too expensive, otherwise idk
-        # takes parsed url and if not ok on robots goes next, else we can write file
-        parsed = urlparse(href_link)
-        if not robots_are_ok(parsed):
-            continue
 
         tempFile.write(href_link + "\n")
         #Adding to the boi wonder pages
         pages.append(href_link)
@@ -94,7 +88,7 @@ def is_valid(url):
             + r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
             + r"|epub|dll|cnf|tgz|sha1"
             + r"|thmx|mso|arff|rtf|jar|csv"
-            + r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",parsed.path.lower()):
+            + r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",url_parsed_path):
             return False
         elif not re.match(
             r".*ics.uci.edu/.*"
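Note: url_parsed_path is presumably the path component lowered once up front, replacing the inlined parsed.path.lower(). A sketch of that assumed definition, with an abbreviated blacklist:

    parsed = urlparse(url)
    url_parsed_path = parsed.path.lower()  # assumed: computed once, reused by every filter below
    if re.match(r".*\.(css|js|pdf|zip|rar|gz)$", url_parsed_path):  # abbreviated extension blacklist
        return False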
@@ -110,26 +104,20 @@ def is_valid(url):
         # we can adjust it based on what the crawler does as well
         if len(url) > 169:
             return False
-        # this fixes any search box that keeps going page to page, currently allows a depth of 2 filters
-        if re.match(r".*(&filter%.*){3,}",url_parsed_path):
+        if not robots_ok(url): # if robots returns false then no go
+            return False
+        if re.match(r".*(&filter%.*){3,}" # this fixes any search box that keeps going page to page, currently allows a depth of 2 filters
+            + r"|^.*?(/.+?/).*?\2.*$|^.*?/(.+?/)\3.*$" # looping directory check (backreferences renumbered: the filter group is now group 1)
+            + r"|^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$" # extra directories check (we can add as we find)
+            + r"|^.*calendar.*$",url_parsed_path): # calendar checks plus adding or downloading calendar (ical)
             return False
         # this is for urls which when opened, download a file (do we want to download these files and tokenize them)
         # elif re.match(r"^.*\&format=(\D{3,4})\Z$",url_parsed_path):
         # return False
-        # another looping directory check but more advanced than the one contained in is_a_trap
-        if re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",url_parsed_path):
-            return False
-        # extra directories check (we can add as we find)
-        if re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", url_parsed_path):
-            return False
-        # calendar checks plus adding or downloading calendar (ical)
-        if re.match(r"^.*calendar.*$",url_parsed_path):
-            return False
         if parsed.query.find('ical') != -1:
             return False
         else:
             return True
-
     except TypeError:
         print ("TypeError for ", parsed)
         raise
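Note on the merged pattern: folding three re.match calls into one alternation renumbers the capture groups, so the looping-directory backreferences shift from \1/\2 to \2/\3 once (&filter%.*) becomes group 1. A quick standalone sanity check of the merged regex (the sample paths are illustrative):

    import re

    TRAPS = re.compile(
        r".*(&filter%.*){3,}"                      # filter depth (group 1)
        r"|^.*?(/.+?/).*?\2.*$|^.*?/(.+?/)\3.*$"   # looping directories (groups 2 and 3)
        r"|^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$"  # repeated extra directories
        r"|^.*calendar.*$"                         # calendar traps
    )

    assert TRAPS.match("/a/b/a/b/page")           # looping directory -> trap
    assert TRAPS.match("/events/calendar/2020")   # calendar -> trap
    assert not TRAPS.match("/ics.uci.edu/about")  # ordinary path -> allowed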