diff --git a/spacetime-crawler4py-master/.gitignore b/spacetime-crawler4py-master/.gitignore
new file mode 100644
index 0000000..416ebbb
--- /dev/null
+++ b/spacetime-crawler4py-master/.gitignore
@@ -0,0 +1,4 @@
+__pycache__/*
+logs/*
+utils/*
+crawler/__pycache__/*
diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py
index b997474..1dbabd0 100644
--- a/spacetime-crawler4py-master/scraper.py
+++ b/spacetime-crawler4py-master/scraper.py
@@ -1,7 +1,12 @@
+from distutils.filelist import findall
+from operator import truediv
 import re
+
 import urllib.request
+from urllib import robotparser
 from urllib.parse import urlparse
 from urllib.parse import urljoin
+from urllib.robotparser import RobotFileParser
 from bs4 import BeautifulSoup
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
@@ -63,6 +68,35 @@ def scraper(url, resp):

     return links_valid

+# hopefully fixes some loop traps and repeating (looping) directories
+# the number of repeated subdirectories allowed can be changed
+# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website
+# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/
+def is_a_loop_trap(url):
+    word_dict = {}
+    parsed = urlparse(url)
+    url_path = str(parsed.path)
+    word_list = url_path.split('/')
+    for word in word_list:
+        if word in word_dict:
+            word_dict[word] += 1
+            if word_dict[word] == 3:
+                return True
+        else:
+            word_dict[word] = 1
+    return False
+
+# Tests whether the url is OK to crawl by checking it against the site's robots.txt
+# file. It checks the URL or URL prefixes and returns True if the page is allowed to be crawled.
+# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
+# http://pymotw.com/2/robotparser/
+def robots_ok(baseurl):
+    eva = robotparser.RobotFileParser()
+    rooturl = str(urljoin(baseurl, '/')[:-1])  # get the root of the site by itself
+    eva.set_url(rooturl + "/robots.txt")       # set location of robots.txt
+    eva.read()                                 # read and feed to the parser
+    return eva.can_fetch('*', baseurl)         # returns True if the user agent is allowed to crawl
+
 def extract_next_links(url, resp):
     # Implementation required.
     # url: the URL that was used to get the page
@@ -96,7 +130,13 @@ def extract_next_links(url, resp):
                 #skipping query with specific actions which mutate the websites and cause a trap
                 if "do=" in href_link:
                     continue
-
+                '''
+                # this is currently in is_valid but implemented in a different way; not sure which approach makes more sense
+                # skip as not allowed
+                if not robots_ok(href_link):
+                    continue
+                '''
+
                 tempFile.write(href_link + "\n")
                 #Adding to the boi wonder pages
                 pages.append(href_link)
@@ -141,6 +181,33 @@ def is_valid(url):
             return False
         elif parsed.fragment:
             return False
+        elif is_a_loop_trap(url):
+            return False
+        # maybe this should go in extract_next_links instead?
+        elif not robots_ok(url):
+            return False
+        # https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression
+        # length check for looping filters and queries (could add a hash or regex similarity check, but this works well enough)
+        # we can adjust it based on what the crawler does as well
+        elif len(url) > 169:
+            return False
+        # this catches any search box that keeps going page to page; currently allows a depth of 2 filters
+        elif re.match(r".*(&filter%.*){3,}",parsed.path.lower()):
+            return False
+        # this is for urls which, when opened, download a file (do we want to download these files and tokenize them?)
+        # elif re.match(r"^.*\&format=(\D{3,4})\Z$",parsed.path.lower()):
+        #     return False
+        # another looping-directory check, more advanced than the one in is_a_loop_trap
+        elif re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",parsed.path.lower()):
+            return False
+        # extra directories check (we can add more as we find them)
+        elif re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", parsed.path.lower()):
+            return False
+        # calendar checks, plus adding or downloading calendars (ical)
+        elif re.match(r"^.*calendar.*$",parsed.path.lower()):
+            return False
+        elif parsed.query.find('ical') != -1:
+            return False
         else:
             return True
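
A quick sanity check of the repeated-directory pattern added to is_valid above. The sample URLs are made up for illustration only; the snippet exercises just the regex, not the rest of the crawler.

# Sanity check for the repeated-directory regex used in is_valid above.
# The example.edu URLs are hypothetical.
import re
from urllib.parse import urlparse

repeat_dir = re.compile(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$")

# "/a/b/a/b/..." repeats the "/a/" segment, so the pattern matches (prints True)
print(bool(repeat_dir.match(urlparse("https://www.example.edu/a/b/a/b/page.html").path.lower())))
# "/a/b/c/..." has no repeated segment, so the pattern does not match (prints False)
print(bool(repeat_dir.match(urlparse("https://www.example.edu/a/b/c/page.html").path.lower())))

Note that is_a_loop_trap would not flag the first URL, since each segment appears only twice, which is why the comment above calls the regex the more advanced check.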
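One possible answer to the open question about where the robots check belongs: as written, robots_ok downloads robots.txt again for every URL it tests, and eva.read() raises if the fetch fails. Below is a minimal sketch, not part of the diff, that caches one parser per host; the names _robot_parsers and robots_ok_cached are assumptions for illustration.

# Minimal sketch (hypothetical helper): fetch robots.txt once per scheme://netloc
# and reuse the parsed result, instead of re-reading it for every candidate URL.
from urllib import robotparser
from urllib.parse import urlparse

_robot_parsers = {}  # hypothetical cache: "scheme://netloc" -> RobotFileParser (or None if unreadable)

def robots_ok_cached(url):
    parsed = urlparse(url)
    root = f"{parsed.scheme}://{parsed.netloc}"
    if root not in _robot_parsers:
        rp = robotparser.RobotFileParser()
        rp.set_url(root + "/robots.txt")
        try:
            rp.read()                    # single fetch per host, reused afterwards
            _robot_parsers[root] = rp
        except Exception:
            _robot_parsers[root] = None  # robots.txt unreadable or host unreachable
    rp = _robot_parsers[root]
    return True if rp is None else rp.can_fetch('*', url)

Whether an unreadable robots.txt should count as allowed or disallowed is a policy choice; this sketch allows it.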