trap fixes based on internet and what I found
This commit is contained in:
parent
4ace2164f2
commit
8f260cb110
@ -1,3 +1,4 @@
|
|||||||
|
from distutils.filelist import findall
|
||||||
from operator import truediv
|
from operator import truediv
|
||||||
import re
|
import re
|
||||||
from urllib import robotparser
|
from urllib import robotparser
|
||||||
@ -44,8 +45,8 @@ def is_a_loop_trap(url):
|
|||||||
# http://pymotw.com/2/robotparser/
|
# http://pymotw.com/2/robotparser/
|
||||||
def robots_ok(baseurl):
|
def robots_ok(baseurl):
|
||||||
eva = robotparser.RobotFileParser()
|
eva = robotparser.RobotFileParser()
|
||||||
rooturl = str(urljoin(baseurl, '/')[:-1]) # get each subdomain by itself
|
rooturl = str(urljoin(baseurl, '/')[:-1]) # get each path by itself
|
||||||
eva.set_url(rooturl + "/robots.txt") # set location of robots.txt
|
eva.set_url(rooturl + "/robots.txt") # set location of robots.txt
|
||||||
eva.read() # read and feed to parser
|
eva.read() # read and feed to parser
|
||||||
return eva.can_fetch('*', baseurl) # returns true if useragent is allowed to crawl
|
return eva.can_fetch('*', baseurl) # returns true if useragent is allowed to crawl
|
||||||
|
|
||||||
@ -139,10 +140,21 @@ def is_valid(url):
|
|||||||
elif not robots_ok(url):
|
elif not robots_ok(url):
|
||||||
return False
|
return False
|
||||||
# https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression
|
# https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression
|
||||||
# add lem check
|
# length check for looping filters and queries (could add hash check for similarity or regex, but don't know if we want to as this works well enough)
|
||||||
# add another dir check
|
# we can adjust it based on what the crawler does as well
|
||||||
# add extra dir check (we can add as we find)
|
elif len(url) > 150:
|
||||||
# add cal check
|
return False
|
||||||
|
# another looping directory check but more advanced than the one contained in is_a_trap
|
||||||
|
elif re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",parsed.path.lower()):
|
||||||
|
return False
|
||||||
|
# extra directories check (we can add as we find)
|
||||||
|
elif re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", parsed.path.lower()):
|
||||||
|
return False
|
||||||
|
# calendar checks plus adding or downloading calendar (ical)
|
||||||
|
elif re.match(r"^.*calendar.*$",parsed.path.lower()):
|
||||||
|
return False
|
||||||
|
elif parsed.query.find('ical') != -1:
|
||||||
|
return False
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user