From 8f260cb1104a68f2f688c3aff2b7b15a22241168 Mon Sep 17 00:00:00 2001
From: Lacerum
Date: Tue, 19 Apr 2022 03:02:14 -0700
Subject: [PATCH] trap fixes based on internet and what I found

---
 spacetime-crawler4py-master/scraper.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py
index cba8c3b..9eb88ba 100644
--- a/spacetime-crawler4py-master/scraper.py
+++ b/spacetime-crawler4py-master/scraper.py
@@ -1,3 +1,4 @@
+from distutils.filelist import findall
 from operator import truediv
 import re
 from urllib import robotparser
@@ -44,8 +45,8 @@ def is_a_loop_trap(url):
 # http://pymotw.com/2/robotparser/
 def robots_ok(baseurl):
     eva = robotparser.RobotFileParser()
-    rooturl = str(urljoin(baseurl, '/')[:-1])  # get each subdomain by itself
-    eva.set_url(rooturl + "/robots.txt")       # set location of robots.txt
+    rooturl = str(urljoin(baseurl, '/')[:-1])  # get each path by itself
+    eva.set_url(rooturl + "/robots.txt")       # set location of robots.txt
     eva.read()                                 # read and fead to parser
     return eva.can_fetch('*', baseurl)         # returns true if useragent is allowed to crawl
 
@@ -139,10 +140,21 @@ def is_valid(url):
         elif not robots_ok(url):
             return False
         # https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression
-        # add lem check
-        # add another dir check
-        # add extra dir check (we can add as we find)
-        # add cal check
+        # length check for looping filters and queries (could add a hash or regex similarity check,
+        # but this works well enough for now); adjust the limit based on what the crawler encounters
+        elif len(url) > 150:
+            return False
+        # another looping-directory check, more advanced than the one in is_a_loop_trap
+        elif re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$", parsed.path.lower()):
+            return False
+        # extra directories check (we can add more as we find them)
+        elif re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", parsed.path.lower()):
+            return False
+        # calendar checks, plus adding or downloading a calendar (ical)
+        elif re.match(r"^.*calendar.*$", parsed.path.lower()):
+            return False
+        elif parsed.query.find('ical') != -1:
+            return False
         else:
             return True
 
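Note (not part of the patch): a minimal sketch of how the repeated-directory pattern added above behaves, using hypothetical URLs and a hypothetical REPEAT_DIR name; useful when tuning the regex against crawler logs.

    import re
    from urllib.parse import urlparse

    # Same pattern as the new is_valid() check: a /segment/ that repeats,
    # either anywhere later in the path (first alternative) or immediately (second).
    REPEAT_DIR = re.compile(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$")

    for url in ("https://example.edu/a/b/a/b/page",   # /a/b/ repeats -> treated as a trap
                "https://example.edu/about/people"):  # no repetition -> allowed
        path = urlparse(url).path.lower()
        print(url, "->", bool(REPEAT_DIR.match(path)))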