another attempt at robots, merged regex as well

Lacerum 2022-04-23 14:44:47 -07:00
parent 809b3dc820
commit 9c31a901b7
2 changed files with 36 additions and 20 deletions

View File

@@ -1,5 +1,6 @@
 import re
 from urllib import robotparser
+from urllib.parse import urlparse
 from bs4 import BeautifulSoup
 from collections import defaultdict
 import requests
@@ -9,6 +10,7 @@ import requests
 # https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
 # http://pymotw.com/2/robotparser/
 # https://stackoverflow.com/questions/43085744/parsing-robots-txt-in-python
+'''This is ver 1.0
 robots_seen = dict() # all robots go here (global so we can store over all site)
 def robots_ok(parsed)->bool:
     global robots_seen # global dict for files
@@ -32,4 +34,30 @@ def robots_are_ok(parsed):
         return robots_ok(parsed)
     else:
         return robots_seen[parsed.netloc] # if it has been read return its value
+'''
+# Ver 1.1 maybe if I am understanding this correctly
+robots_seen = dict() # dict of all seen robots.txt files and their parsers
+def robots_ok(url)->bool:
+    try:
+        parsed = urlparse(url) # parse url
+        robotstxt = parsed.scheme + "://" + parsed.hostname + "/robots.txt" # location of robots.txt file
+    except:
+        print("Error in parse for: " + url)
+        return True # cannot build the robots.txt url, so do not block the fetch
+    if robotstxt not in robots_seen: # if robots.txt not in dict, add it and read it once
+        robots_seen[robotstxt] = robotparser.RobotFileParser(robotstxt)
+        try:
+            robots_seen[robotstxt].read()
+        except:
+            del robots_seen[robotstxt]
+            return True
+    try:
+        return robots_seen[robotstxt].can_fetch('*', url)
+    except:
+        print("There was an error with: " + url)
+        return True
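For reference, here is a minimal standalone sketch of the caching pattern the new robots_ok follows: one RobotFileParser per robots.txt URL, read once, then can_fetch('*', url) decides whether a URL may be crawled. The function name can_fetch_url and the example URL are hypothetical and not part of the repo:

from urllib import robotparser
from urllib.parse import urlparse

_parsers = dict()  # cache: robots.txt URL -> RobotFileParser, read once per host

def can_fetch_url(url, agent="*"):
    parsed = urlparse(url)
    robots_url = parsed.scheme + "://" + parsed.netloc + "/robots.txt"
    if robots_url not in _parsers:
        rp = robotparser.RobotFileParser(robots_url)
        try:
            rp.read()       # fetch and parse robots.txt over the network
        except Exception:
            return True     # robots.txt unreachable: default to allowing the fetch
        _parsers[robots_url] = rp
    return _parsers[robots_url].can_fetch(agent, url)

# hypothetical call, only to show the shape of the API
print(can_fetch_url("https://www.ics.uci.edu/about/index.php"))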

View File

@@ -54,12 +54,6 @@ def extract_next_links(url, resp):
             if "do=" in href_link:
                 continue
-            # don't know if this is too expensive, otherwise idk
-            # takes parsed url and if not ok on robots goes next, else we can write file
-            parsed = urlparse(href_link)
-            if not robots_are_ok(parsed):
-                continue
             tempFile.write(href_link + "\n")
             #Adding to the boi wonder pages
             pages.append(href_link)
@@ -94,7 +88,7 @@ def is_valid(url):
            + r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
            + r"|epub|dll|cnf|tgz|sha1"
            + r"|thmx|mso|arff|rtf|jar|csv"
-           + r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",parsed.path.lower()):
+           + r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",url_parsed_path):
             return False
         elif not re.match(
             r".*ics.uci.edu/.*"
@@ -110,26 +104,20 @@ def is_valid(url):
         # we can adjust it based on what the crawler does as well
         if len(url) > 169:
             return False
-        # this fixes any search box that keeps going page to page, currenty allow a depth of 2 filters
-        if re.match(r".*(&filter%.*){3,}",url_parsed_path):
+        if not robots_ok(url): # if robots returns false then no go
+            return False
+        if re.match(r".*(?:&filter%.*){3,}" # search box filter trap, currently allows a depth of 2 filters; group is non-capturing so \1/\2 below stay correct
+                    + r"|^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$" # looping directory check
+                    + r"|^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$" # extra directories check (we can add as we find)
+                    + r"|^.*calendar.*$",url_parsed_path): # calendar checks plus adding or downloading calendar (ical)
             return False
         # this is for urls which when opened, download a file (do we want to download these files and tokenize them)
         # elif re.match(r"^.*\&format=(\D{3,4})\Z$",url_parsed_path):
         #     return False
-        # another looping directory check but more advanced than the one contained in is_a_trap
-        if re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",url_parsed_path):
-            return False
-        # extra directories check (we can add as we find)
-        if re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", url_parsed_path):
-            return False
-        # calendar checks plus adding or downloading calendar (ical)
-        if re.match(r"^.*calendar.*$",url_parsed_path):
-            return False
         if parsed.query.find('ical') != -1:
             return False
         else:
             return True
     except TypeError:
         print ("TypeError for ", parsed)
         raise
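As a quick sanity check on the merged trap regex, here is a small sketch that compiles the same alternation on its own (with the filter group non-capturing so the \1/\2 backreferences still point at the looping-directory groups) and runs it over a few made-up strings. The sample inputs are purely illustrative and not taken from the crawler's data:

import re

# same alternation as the merged filter in is_valid, compiled standalone;
# the filter group is non-capturing so \1 and \2 refer to the directory groups
TRAP_RE = re.compile(
    r".*(?:&filter%.*){3,}"                                    # stacked search-box filters
    r"|^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$"                   # looping / repeated directories
    r"|^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$"
    r"|^.*calendar.*$"                                         # calendar and ical pages
)

# made-up sample strings, only to show which branch fires
print(bool(TRAP_RE.match("/doku.php?id=start&filter%5B=a&filter%5B=b&filter%5B=c")))  # True (filters)
print(bool(TRAP_RE.match("/a/b/a/b/index.html")))                                     # True (repeated directory)
print(bool(TRAP_RE.match("/community/events/calendar/2022-04")))                      # True (calendar)
print(bool(TRAP_RE.match("/~user/research/papers.html")))                             # False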