diff --git a/spacetime-crawler4py-master/crawler/frontier.py b/spacetime-crawler4py-master/crawler/frontier.py
index 9ba9f14..1c8d28f 100644
--- a/spacetime-crawler4py-master/crawler/frontier.py
+++ b/spacetime-crawler4py-master/crawler/frontier.py
@@ -6,6 +6,7 @@ from queue import Queue, Empty
 
 from utils import get_logger, get_urlhash, normalize
 from scraper import is_valid
+from datacollection import *
 
 class Frontier(object):
     def __init__(self, config, restart):
@@ -13,6 +14,17 @@ class Frontier(object):
         self.config = config
         self.to_be_downloaded = list()
 
+        # data collection happens in the frontier
+        # uniques holds every unique page URL seen (fragments removed)
+        self.uniques = set()
+        # grand_dict maps each word (stop words excluded) to its frequency over all pages
+        self.grand_dict = dict()
+        # ics maps each ics.uci.edu subdomain to its urlData record
+        self.ics = dict()
+        # used to find the longest page (by token count)
+        self.max = -9999
+        self.longest = None
+
         if not os.path.exists(self.config.save_file) and not restart:
             # Save file does not exist, but request to load save.
             self.logger.info(
@@ -60,7 +72,31 @@ class Frontier(object):
             self.save[urlhash] = (url, False)
             self.save.sync()
             self.to_be_downloaded.append(url)
-    
+
+            # Q1: record the unique page (fragment removed)
+            self.uniques.add(removeFragment(url))
+
+            # Q2: track the longest page seen so far
+            tempTok = tokenize(url)
+            if len(tempTok) > self.max:
+                self.max = len(tempTok)
+                self.longest = url
+
+            # Q3: accumulate word frequencies (stop words removed)
+            tempTok = removeStopWords(tempTok)
+            computeFrequencies(tempTok, self.grand_dict)
+
+            # Q4: count unique pages per ics subdomain
+            fragless = removeFragment(url)
+            domain = findDomains(fragless.netloc)
+            if domain[1] == 'ics':
+                if domain[0] not in self.ics:
+                    self.ics[domain[0]] = urlData(url, domain[0], domain[1])
+                else:
+                    if fragless not in self.ics[domain[0]].getUniques():
+                        self.ics[domain[0]].appendUnique(fragless)
+
+
     def mark_url_complete(self, url):
         urlhash = get_urlhash(url)
         if urlhash not in self.save:
diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py
index 52730f7..e3e87b1 100644
--- a/spacetime-crawler4py-master/scraper.py
+++ b/spacetime-crawler4py-master/scraper.py
@@ -17,33 +17,6 @@ from datacollection import *
 # it'll show in the console/terminal if you run the code i believe.
 # it appeared in mine
 def scraper(url, resp):
-    # initialize set for unique links
-    # used a set for elimatining duplicates
-    uniques = set()
-    # have to add the original url to the unique set
-    copyoriginal = url
-    uniques.add(removeFragment(copyoriginal))
-
-    # initializing longest for finding the longest page
-    max = -9999
-    longest = None
-
-    # have to do this for the original url
-    tok = tokenize(url)
-    if len(tok) > max:
-        max = len(tok)
-        longest = url
-
-    # grand_dict is a dictionary that contains every word over the entire set of pages (excluding stop words)
-    # key: word , value: frequencies
-    grand_dict = dict()
-    tok = removeStopWords(tok)
-    computeFrequencies(tok, grand_dict)
-
-    # ics is a dict with subdomains
-    ics = dict()
-
-
     links = extract_next_links(url, resp)
     links_valid = list()
     valid_links = open("valid_links.txt",'a')
@@ -54,39 +27,14 @@ def scraper(url, resp):
         if is_valid(link):
             links_valid.append(link)
             valid_links.write(link + "\n")
-
-            # Answering q1 for report
-            uniques.add(removeFragment(link))
-
-            # Answering q2
-            tempTok = tokenize(link)
-            if len(tempTok) > max:
-                max = len(tempTok)
-                longest = link
-
-
-            # Answering q3
-            tempTok = removeStopWords(tempTok)
-            computeFrequencies(tempTok, grand_dict)
-
-            # Answering q4
-            fragless = removeFragment(link)
-            domain = findDomains(fragless.netloc)
-            if domain[1] == 'ics':
-                if domain[0] not in ics:
-                    ics[domain[0]] = urlData(link, domain[0], domain[1])
-                else:
-                    if fragless not in ics[domain[0]].getUniques():
-                        ics[domain[0]].appendUnique(fragless)
-
-
         else:
             invalid_links.write("From: " + url + "\n")
             invalid_links.write(link + "\n")
 
+    # Needs to be moved: uniques, grand_dict, and ics now live in the Frontier
     # creating text file that includes the number of unique links
     f = open("q1.txt", "w")
-    f.write("Number of unique pages: {length}".format(length = len(uniques)))
+    f.write("Number of unique pages: {length}\n".format(length = len(uniques)))
     f.close()
 
     # creating text file for question 2
@@ -102,7 +50,8 @@ def scraper(url, resp):
         if i == 50:
             break
         else:
-            f.write(k, ':', v)
+            f.write("{} : {}\n".format(k, v))
+            i += 1
     f.close()
 
     # creating text file for question 4
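
Note: the code moved into the frontier calls helpers pulled in with "from datacollection import *", but datacollection.py is not part of this diff. The sketch below is a minimal, assumed reconstruction inferred only from the call sites above (removeFragment, tokenize, removeStopWords, computeFrequencies, findDomains, urlData); the bodies and the stop-word list are illustrative, not the project's actual implementation.

    # Assumed sketch of the datacollection helpers -- inferred from call sites, not the real module.
    import re
    from urllib.parse import urlparse, urldefrag

    STOP_WORDS = {"a", "an", "and", "are", "as", "at", "be", "by", "for", "the"}  # abbreviated list

    def removeFragment(url):
        # Drop the #fragment and return a parsed URL; the frontier code reads .netloc from the result.
        return urlparse(urldefrag(url)[0])

    def tokenize(text):
        # Lowercased alphanumeric tokens.
        return [t.lower() for t in re.split(r"[^a-zA-Z0-9]+", text) if t]

    def removeStopWords(tokens):
        return [t for t in tokens if t not in STOP_WORDS]

    def computeFrequencies(tokens, counts):
        # Accumulate word -> frequency into the shared dict (grand_dict in the frontier).
        for t in tokens:
            counts[t] = counts.get(t, 0) + 1

    def findDomains(netloc):
        # "vision.ics.uci.edu" -> ("vision", "ics"); the frontier checks index 1 against 'ics'.
        parts = netloc.split(".")
        return (parts[0], parts[1]) if len(parts) > 1 else (parts[0], "")

    class urlData:
        # One record per ics subdomain, tracking the unique defragmented URLs seen under it.
        def __init__(self, url, subdomain, domain):
            self.url = url
            self.subdomain = subdomain
            self.domain = domain
            self.uniques = {removeFragment(url)}

        def getUniques(self):
            return self.uniques

        def appendUnique(self, fragless):
            self.uniques.add(fragless)

With signatures like these in place, the report files (q1.txt through q4.txt) would need to read Frontier.uniques, Frontier.grand_dict, Frontier.max/longest, and Frontier.ics rather than the locals this diff removes from scraper.py, which is what the "# Needs to be moved" comment points at.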