diff --git a/spacetime-crawler4py-master/robotsokay.py b/spacetime-crawler4py-master/robotsokay.py
new file mode 100644
index 0000000..7ead0f4
--- /dev/null
+++ b/spacetime-crawler4py-master/robotsokay.py
@@ -0,0 +1,35 @@
+import re
+from urllib import robotparser
+from bs4 import BeautifulSoup
+from collections import defaultdict
+import requests
+
+# Tests whether a url is ok to crawl by checking it against the site's robots.txt file.
+# Returns True if the page is allowed to be crawled or if there is no robots file, False otherwise.
+# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
+# http://pymotw.com/2/robotparser/
+# https://stackoverflow.com/questions/43085744/parsing-robots-txt-in-python
+robots_seen = dict() # robots.txt verdicts keyed by netloc (global so results persist across all sites)
+def robots_ok(parsed)->bool:
+    global robots_seen # global cache of verdicts
+    robots_seen[parsed.netloc] = False # default to disallowed until proven otherwise
+    try:
+        url = 'http://' + parsed.netloc + '/robots.txt' # location of the site's robots.txt
+        sitemap = requests.get(url) # fetch robots.txt
+        if sitemap.status_code != 200: # no robots file so let her rip
+            robots_seen[parsed.netloc] = True; return True # cache the verdict so later urls from this host pass too
+        eva = robotparser.RobotFileParser(url)
+        eva.read()
+        if eva.can_fetch('*', parsed.geturl()): # check the page url itself, not the robots.txt url
+            robots_seen[parsed.netloc] = True
+        return robots_seen[parsed.netloc] # cached verdict for this netloc
+    except Exception:
+        return False # default to disallowed on any error
+# check the cache first and only parse robots.txt for sites we have not seen yet
+def robots_are_ok(parsed):
+    global robots_seen
+    if parsed.netloc not in robots_seen: # unseen site, run the check
+        return robots_ok(parsed)
+    else:
+        return robots_seen[parsed.netloc] # already checked, return the cached value
+    
\ No newline at end of file
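A quick usage sketch of the helpers added above (not part of the patch): robots_ok fetches and parses a host's robots.txt, and robots_are_ok consults the robots_seen cache so each netloc is only checked once. The example URL is made up for illustration.

from urllib.parse import urlparse
from robotsokay import robots_are_ok, robots_seen

parsed = urlparse("http://www.example.edu/some/page.html")  # hypothetical page url
print(robots_are_ok(parsed))  # first call for this host: fetches and parses robots.txt
print(robots_are_ok(parsed))  # second call: answered straight from the robots_seen cache
print(robots_seen)            # e.g. {'www.example.edu': True}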
diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py
index 1dbabd0..48ac1d0 100644
--- a/spacetime-crawler4py-master/scraper.py
+++ b/spacetime-crawler4py-master/scraper.py
@@ -1,25 +1,10 @@
 from distutils.filelist import findall
 from operator import truediv
 import re
-
-import urllib.request
-from urllib import robotparser
 from urllib.parse import urlparse
 from urllib.parse import urljoin
-from urllib.robotparser import RobotFileParser
 from bs4 import BeautifulSoup
-from nltk.tokenize import word_tokenize
-from nltk.corpus import stopwords
-from nltk.corpus import words
-import html2text
-import nltk
-#moved all my code to a separted py file and imported it here
-from datacollection import *
-
-# nltk.download('stopwords')
-# nltk.download('words')
-# there is another nltk.download() requirement but I removed it so i forgot what it was
-# it'll show in the console/terminal if you run the code i believe. it appeared in mine
+from robotsokay import *
 
 def scraper(url, resp):
     links = extract_next_links(url, resp)
@@ -68,35 +53,6 @@ def scraper(url, resp):
 
     return links_valid
 
-# hopefuly fixes some loop traps and repeating (looping) directories
-# the amount of repeated subdirectories allowed can be changed
-# https://subscription.packtpub.com/book/big-data-and-business-intelligence/9781782164364/1/ch01lvl1sec11/crawling-your-first-website
-# https://www.searchenginejournal.com/crawler-traps-causes-solutions-prevention/305781/
-def is_a_loop_trap(url):
-    word_dict = {}
-    parsed = urlparse(url)
-    url_path = str(parsed.path)
-    word_list = url_path.split('/')
-    for word in word_list:
-        if word in word_dict:
-            word_dict[word] += 1
-            if word_dict[word] == 3:
-                return True
-        else:
-            word_dict[word] = 1
-    return False
-
-# Tests to see if the url is ok to be crawled by checking against the robots.txt
-# file. It does so by checking the URL or URL prefixes and will return true if page is allowed to be crawled
-# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
-# http://pymotw.com/2/robotparser/
-def robots_ok(baseurl):
-    eva = robotparser.RobotFileParser()
-    rooturl = str(urljoin(baseurl, '/')[:-1]) # get each path by itself
-    eva.set_url(rooturl + "/robots.txt") # set location of robots.txt
-    eva.read()  # read and fead to parser
-    return eva.can_fetch('*', baseurl) # returns true if useragent is allowed to crawl
-
 def extract_next_links(url, resp):
     # Implementation required.
     # url: the URL that was used to get the page
@@ -130,13 +86,13 @@ def extract_next_links(url, resp):
             #skipping query with specific actions which mutate the websites and cause a trap
             if "do=" in href_link:
                 continue
-            '''
-            # this is currently in the is_vaild but implimended in a different way, don't know which one would make more sense
-            # skip as not allowed
-            if not robots_ok(href_link):
+
+            # this may be a bit expensive: robots.txt is fetched for each new host (results are cached in robots_seen)
+            # parse the url and skip it if robots.txt disallows crawling, otherwise it gets written out and queued below
+            parsed = urlparse(href_link)
+            if not robots_are_ok(parsed):
                 continue
-            '''
-            
+
             tempFile.write(href_link + "\n")
             #Adding to the boi wonder pages
             pages.append(href_link)
@@ -160,6 +116,7 @@ def is_valid(url):
     try:
         #Gotta check if they are in the domain
         parsed = urlparse(url)
+        url_parsed_path = parsed.path.lower() # compute the lowercased path once instead of redoing it for every check below
         if parsed.scheme not in set(["http", "https"]):
             return False
         elif re.match(
@@ -181,32 +138,27 @@ def is_valid(url):
             return False
         elif parsed.fragment:
             return False
-        elif is_a_loop_trap(url):
-            return False
-        # maybe this should go in the next link?
-        elif not robots_ok(url):
-            return False
         # https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression
         # length check for looping filters and queries (could add hash check for similarity or regex, but don't know if we want to as this works well enought)
         # we can adjust it based on what the cralwer does as well
-        elif len(url) > 169:
+        if len(url) > 169:
            return False
         # this fixes any search box that keeps going page to page, currenty allow a depth of 2 filters
-        elif re.match(r".*(&filter%.*){3,}",parsed.path.lower()):
+        if re.match(r".*(&filter%.*){3,}",url_parsed_path):
             return False
         # this is for urls which when opened, download a file (do we want to download these files and tokenize them)
-        # elif re.match(r"^.*\&format=(\D{3,4})\Z$",parsed.path.lower()):
+        # elif re.match(r"^.*\&format=(\D{3,4})\Z$",url_parsed_path):
         #     return False
         # another looping directory check but more advanced than the one contained in is_a_trap
-        elif re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",parsed.path.lower()):
+        if re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",url_parsed_path):
             return False
         # extra directories check (we can add as we find)
-        elif re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", parsed.path.lower()):
+        if re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", url_parsed_path):
             return False
         # calendar checks plus adding or downloading calendar (ical)
-        elif re.match(r"^.*calendar.*$",parsed.path.lower()):
+        if re.match(r"^.*calendar.*$",url_parsed_path):
             return False
-        elif parsed.query.find('ical') != -1:
+        if parsed.query.find('ical') != -1:
             return False
         else:
             return True
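For reference, a small sketch (not part of the patch) of how two of the new trap filters in is_valid behave. The regexes are copied from the hunk above; the variable names and example paths are invented for illustration.

import re

looping_dirs = r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$"  # repeated path segment check from is_valid
calendar_like = r"^.*calendar.*$"                        # calendar pages tend to generate endless links

print(bool(re.match(looping_dirs, "/wiki/doku.php/wiki/doku.php/start")))  # True, '/wiki/' repeats
print(bool(re.match(looping_dirs, "/about/people/index.html")))            # False, no repeated segment
print(bool(re.match(calendar_like, "/events/calendar/2022-04")))           # True, calendar path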