Moved stuff out of scraper
parent 44c86eb51a
commit bdd61a373b
@@ -6,6 +6,7 @@ from queue import Queue, Empty

 from utils import get_logger, get_urlhash, normalize
 from scraper import is_valid
+from datacollection import *

 class Frontier(object):
     def __init__(self, config, restart):
@@ -13,6 +14,17 @@ class Frontier(object):
         self.config = config
         self.to_be_downloaded = list()

+        # data collection is going to happen in the frontier
+        # uniques encompass overall unique links
+        self.uniques = set()
+        # grand_dict encompasses all the words over the entire set of links
+        self.grand_dict = dict()
+        # ics dict contains all subdomains of ics
+        self.ics = dict()
+        # used to find the longest page
+        self.max = -9999
+        self.longest = None
+
         if not os.path.exists(self.config.save_file) and not restart:
             # Save file does not exist, but request to load save.
             self.logger.info(
@@ -60,7 +72,31 @@ class Frontier(object):
             self.save[urlhash] = (url, False)
             self.save.sync()
             self.to_be_downloaded.append(url)
+
+            # Q1
+            self.uniques.add(removeFragment(url))
+
+            # Q2
+            tempTok = tokenize(url)
+            if len(tempTok) > max:
+                self.max = len(tempTok)
+                self.longest = url
+
+            # Q3
+            tempTok = removeStopWords(tempTok)
+            computeFrequencies(tempTok, self.grand_dict)
+
+            # Q4
+            fragless = removeFragment(url)
+            domain = findDomains(fragless.netloc)
+            if domain[1] == 'ics':
+                if domain[0] not in self.ics:
+                    self.ics[domain[0]] = urlData(url, domain[0], domain[1])
+                else:
+                    if fragless not in self.ics[domain[0]].getUniques():
+                        self.ics[domain[0]].appendUnique(fragless)
+

     def mark_url_complete(self, url):
         urlhash = get_urlhash(url)
         if urlhash not in self.save:
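The Frontier additions above rely on helpers pulled in by "from datacollection import *" (removeFragment, tokenize, removeStopWords, computeFrequencies, findDomains, urlData), and that module is not part of this commit. The snippet below is only a rough sketch of the interfaces the new Frontier code appears to assume; the actual datacollection.py may look quite different.

# Hypothetical sketch of the datacollection helpers assumed above; the real
# datacollection.py is not included in this diff and may differ.
import re
from urllib.parse import urldefrag, urlparse

STOP_WORDS = {"a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
              "in", "is", "it", "of", "on", "or", "that", "the", "to", "was"}

def removeFragment(url):
    # Strip the #fragment and return a parsed URL, since the Frontier code
    # reads .netloc from the result.
    return urlparse(urldefrag(url)[0])

def tokenize(text):
    # Lowercase alphanumeric tokens; the project may tokenize page content
    # rather than the URL string the Frontier passes in.
    return re.findall(r"[a-z0-9]+", text.lower())

def removeStopWords(tokens):
    return [t for t in tokens if t not in STOP_WORDS]

def computeFrequencies(tokens, counts):
    # Accumulate word frequencies into the shared dictionary in place.
    for t in tokens:
        counts[t] = counts.get(t, 0) + 1

def findDomains(netloc):
    # "vision.ics.uci.edu" -> ("vision", "ics"), matching the domain[0] /
    # domain[1] checks in the Frontier code.
    parts = netloc.split(".")
    return (parts[0], parts[1] if len(parts) > 1 else "")

class urlData:
    # Per-subdomain record of unique pages, as used for Q4.
    def __init__(self, url, subdomain, domain):
        self.subdomain = subdomain
        self.domain = domain
        self.uniques = {url}

    def getUniques(self):
        return self.uniques

    def appendUnique(self, fragless):
        self.uniques.add(fragless)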
@@ -17,33 +17,6 @@ from datacollection import *
 # it'll show in the console/terminal if you run the code i believe. it appeared in mine

 def scraper(url, resp):
-    # initialize set for unique links
-    # used a set for elimatining duplicates
-    uniques = set()
-    # have to add the original url to the unique set
-    copyoriginal = url
-    uniques.add(removeFragment(copyoriginal))
-
-    # initializing longest for finding the longest page
-    max = -9999
-    longest = None
-
-    # have to do this for the original url
-    tok = tokenize(url)
-    if len(tok) > max:
-        max = len(tok)
-        longest = url
-
-    # grand_dict is a dictionary that contains every word over the entire set of pages (excluding stop words)
-    # key: word , value: frequencies
-    grand_dict = dict()
-    tok = removeStopWords(tok)
-    computeFrequencies(tok, grand_dict)
-
-    # ics is a dict with subdomains
-    ics = dict()
-
-
     links = extract_next_links(url, resp)
     links_valid = list()
     valid_links = open("valid_links.txt",'a')
@@ -54,39 +27,14 @@ def scraper(url, resp):
         if is_valid(link):
             links_valid.append(link)
             valid_links.write(link + "\n")
-
-            # Answering q1 for report
-            uniques.add(removeFragment(link))
-
-            # Answering q2
-            tempTok = tokenize(link)
-            if len(tempTok) > max:
-                max = len(tempTok)
-                longest = link
-
-
-            # Answering q3
-            tempTok = removeStopWords(tempTok)
-            computeFrequencies(tempTok, grand_dict)
-
-            # Answering q4
-            fragless = removeFragment(link)
-            domain = findDomains(fragless.netloc)
-            if domain[1] == 'ics':
-                if domain[0] not in ics:
-                    ics[domain[0]] = urlData(link, domain[0], domain[1])
-                else:
-                    if fragless not in ics[domain[0]].getUniques():
-                        ics[domain[0]].appendUnique(fragless)
-
-
         else:
             invalid_links.write("From: " + url + "\n")
             invalid_links.write(link + "\n")

+    # Needs to be moved
     # creating text file that includes the number of unique links
     f = open("q1.txt", "w")
-    f.write("Number of unique pages: {length}".format(length = len(uniques)))
+    f.write("Number of unique pages: {length}\n".format(length = len(uniques)))
     f.close()

     # creating text file for question 2
@@ -102,7 +50,8 @@ def scraper(url, resp):
         if i == 50:
             break
         else:
-            f.write(k, ':', v)
+            f.write(k, ':', v, '\n')
+            i += 1
     f.close()

     # creating text file for question 4
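The report-writing block left in scraper() is now tagged "# Needs to be moved"; a plausible follow-up (not part of this commit) is to emit the report files from the Frontier's collected state once the crawl finishes. A rough sketch, with the q2/q3/q4 filenames assumed by analogy with q1.txt:

# Hypothetical follow-up, not in this commit: write the report files from the
# Frontier's counters instead of inside scraper(). Filenames other than q1.txt
# are assumptions.
def write_reports(frontier):
    # Q1: number of unique pages (fragments removed).
    with open("q1.txt", "w") as f:
        f.write("Number of unique pages: {}\n".format(len(frontier.uniques)))

    # Q2: longest page by token count.
    with open("q2.txt", "w") as f:
        f.write("Longest page: {} ({} tokens)\n".format(frontier.longest, frontier.max))

    # Q3: 50 most common words across all pages, stop words excluded.
    with open("q3.txt", "w") as f:
        ranked = sorted(frontier.grand_dict.items(), key=lambda kv: kv[1], reverse=True)
        for word, count in ranked[:50]:
            f.write("{}: {}\n".format(word, count))

    # Q4: ics.uci.edu subdomains with their unique-page counts.
    with open("q4.txt", "w") as f:
        for sub in sorted(frontier.ics):
            f.write("{}, {}\n".format(sub, len(frontier.ics[sub].getUniques())))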