Trap fixes based on internet resources and what I found
@@ -1,3 +1,4 @@
+from distutils.filelist import findall
 from operator import truediv
 import re
 from urllib import robotparser
@@ -44,8 +45,8 @@ def is_a_loop_trap(url):
 # http://pymotw.com/2/robotparser/
 def robots_ok(baseurl):
     eva = robotparser.RobotFileParser()
-    rooturl = str(urljoin(baseurl, '/')[:-1])   # get each subdomain by itself
-    eva.set_url(rooturl + "/robots.txt")         # set location of robots.txt
+    rooturl = str(urljoin(baseurl, '/')[:-1])   # get each path by itself
+    eva.set_url(rooturl + "/robots.txt")        # set location of robots.txt
     eva.read()                                  # read and fead to parser
     return eva.can_fetch('*', baseurl)          # returns true if useragent is allowed to crawl
 
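For reference, the robots.txt check above is the standard urllib.robotparser flow. A minimal standalone sketch of the same check (assuming urljoin is imported from urllib.parse elsewhere in the file, which this hunk does not show, and using a made-up example URL):

```python
# Minimal sketch of the robots_ok check, runnable on its own.
from urllib import robotparser
from urllib.parse import urljoin   # assumption: the crawler imports urljoin elsewhere

def robots_ok(baseurl):
    eva = robotparser.RobotFileParser()
    rooturl = str(urljoin(baseurl, '/')[:-1])   # scheme://host without the trailing slash
    eva.set_url(rooturl + "/robots.txt")        # robots.txt lives at the site root
    eva.read()                                  # fetch the file and feed it to the parser
    return eva.can_fetch('*', baseurl)          # True if the '*' user agent may crawl baseurl

if __name__ == "__main__":
    print(robots_ok("https://www.python.org/downloads/"))   # example URL, not from the crawler
```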
@@ -139,10 +140,21 @@ def is_valid(url):
         elif not robots_ok(url):
             return False
         # https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression
-        # add lem check
-        # add another dir check
-        # add extra dir check (we can add as we find)
-        # add cal check
+        # length check for looping filters and queries (could add hash check for similarity or regex, but don't know if we want to as this works well enought)
+        # we can adjust it based on what the cralwer does as well
+        elif len(url) > 150:
+            return False
+        # another looping directory check but more advanced than the one contained in is_a_trap
+        elif re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",parsed.path.lower()):
+            return False
+        # extra directories check (we can add as we find)
+        elif re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", parsed.path.lower()):
+            return False
+        # calendar checks plus adding or downloading calendar (ical)
+        elif re.match(r"^.*calendar.*$",parsed.path.lower()):
+            return False
+        elif parsed.query.find('ical') != -1:
+            return False
         else:
             return True
 
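The new filters in is_valid are easier to see on concrete inputs. Below is a rough standalone sketch of the same checks; the helper name looks_like_trap and the sample URLs are made up for illustration, while the regexes are copied verbatim from the diff (the real code runs them as elif branches inside is_valid):

```python
# Rough sketch of the new trap filters applied to sample URLs.
import re
from urllib.parse import urlparse

LOOP_DIR_RE   = re.compile(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$")
BOILER_DIR_RE = re.compile(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$")
CALENDAR_RE   = re.compile(r"^.*calendar.*$")

def looks_like_trap(url):
    parsed = urlparse(url)
    path = parsed.path.lower()
    if len(url) > 150:                      # very long URLs usually come from filters/queries
        return True
    if LOOP_DIR_RE.match(path):             # a directory sequence repeats itself in the path
        return True
    if BOILER_DIR_RE.match(path):           # three boilerplate dirs back to back, e.g. /sites/all/themes
        return True
    if CALENDAR_RE.match(path):             # calendar pages expand into endless date links
        return True
    if parsed.query.find('ical') != -1:     # ical export links download calendar files
        return True
    return False

print(looks_like_trap("http://example.edu/events/calendar/2015/06"))   # True (calendar)
print(looks_like_trap("http://example.edu/a/b/a/b/page.html"))         # True (repeated /a/b/)
print(looks_like_trap("http://example.edu/people/faculty.html"))       # False
```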