Moved stuff out of scraper

unknown 2022-04-20 00:49:49 -07:00
parent 44c86eb51a
commit bdd61a373b
2 changed files with 41 additions and 56 deletions

frontier.py

@@ -6,6 +6,7 @@ from queue import Queue, Empty
 from utils import get_logger, get_urlhash, normalize
 from scraper import is_valid
+from datacollection import *
 
 
 class Frontier(object):
     def __init__(self, config, restart):
@@ -13,6 +14,17 @@ class Frontier(object):
         self.config = config
         self.to_be_downloaded = list()
+
+        # data collection is going to happen in the frontier
+        # uniques encompass overall unique links
+        self.uniques = set()
+        # grand_dict encompasses all the words over the entire set of links
+        self.grand_dict = dict()
+        # ics dict contains all subdomains of ics
+        self.ics = dict()
+        # used to find the longest page
+        self.max = -9999
+        self.longest = None
 
         if not os.path.exists(self.config.save_file) and not restart:
             # Save file does not exist, but request to load save.
             self.logger.info(
@@ -61,6 +73,30 @@ class Frontier(object):
             self.save.sync()
             self.to_be_downloaded.append(url)
+
+            # Q1
+            self.uniques.add(removeFragment(url))
+
+            # Q2
+            tempTok = tokenize(url)
+            if len(tempTok) > max:
+                self.max = len(tempTok)
+                self.longest = url
+
+            # Q3
+            tempTok = removeStopWords(tempTok)
+            computeFrequencies(tempTok, self.grand_dict)
+
+            # Q4
+            fragless = removeFragment(url)
+            domain = findDomains(fragless.netloc)
+            if domain[1] == 'ics':
+                if domain[0] not in self.ics:
+                    self.ics[domain[0]] = urlData(url, domain[0], domain[1])
+                else:
+                    if fragless not in self.ics[domain[0]].getUniques():
+                        self.ics[domain[0]].appendUnique(fragless)
+
 
     def mark_url_complete(self, url):
         urlhash = get_urlhash(url)
         if urlhash not in self.save:
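
Note for review: the code moved into the frontier relies on helpers star-imported from datacollection, whose definitions are not part of this diff. Below is a minimal sketch of the interface those call sites assume; the names come from the diff, but the bodies and parameter meanings are guesses from usage, not the actual implementations.

# Sketch of the datacollection interface implied by the call sites above.
# Bodies are stubs; only names and rough signatures are inferred from the diff.

def removeFragment(url):
    # Drops the #fragment from a URL; callers read .netloc off the result,
    # so this presumably returns a parsed URL (urllib.parse.ParseResult).
    ...

def tokenize(url):
    # Returns a list of word tokens for the page at the given URL.
    ...

def removeStopWords(tokens):
    # Returns the token list with stop words filtered out.
    ...

def computeFrequencies(tokens, counts):
    # Folds token occurrences into the counts dict (word -> frequency).
    ...

def findDomains(netloc):
    # Splits a host such as "vision.ics.uci.edu" into parts; the diff uses
    # domain[0] as the subdomain and domain[1] as the registered domain.
    ...

class urlData:
    # Per-subdomain record of unique pages.
    def __init__(self, url, subdomain, domain): ...
    def getUniques(self): ...
    def appendUnique(self, fragless_url): ...

One likely bug worth flagging in the new Q2 block: "if len(tempTok) > max:" compares against Python's builtin max function (a TypeError in Python 3) rather than self.max, which __init__ sets to -9999 and which is almost certainly what was meant.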

scraper.py

@@ -17,33 +17,6 @@ from datacollection import *
 # it'll show in the console/terminal if you run the code i believe. it appeared in mine
 def scraper(url, resp):
-    # initialize set for unique links
-    # used a set for elimatining duplicates
-    uniques = set()
-    # have to add the original url to the unique set
-    copyoriginal = url
-    uniques.add(removeFragment(copyoriginal))
-
-    # initializing longest for finding the longest page
-    max = -9999
-    longest = None
-
-    # have to do this for the original url
-    tok = tokenize(url)
-    if len(tok) > max:
-        max = len(tok)
-        longest = url
-
-    # grand_dict is a dictionary that contains every word over the entire set of pages (excluding stop words)
-    # key: word , value: frequencies
-    grand_dict = dict()
-    tok = removeStopWords(tok)
-    computeFrequencies(tok, grand_dict)
-
-    # ics is a dict with subdomains
-    ics = dict()
-
     links = extract_next_links(url, resp)
     links_valid = list()
     valid_links = open("valid_links.txt",'a')
@@ -54,39 +27,14 @@ def scraper(url, resp):
         if is_valid(link):
             links_valid.append(link)
             valid_links.write(link + "\n")
-
-            # Answering q1 for report
-            uniques.add(removeFragment(link))
-
-            # Answering q2
-            tempTok = tokenize(link)
-            if len(tempTok) > max:
-                max = len(tempTok)
-                longest = link
-
-            # Answering q3
-            tempTok = removeStopWords(tempTok)
-            computeFrequencies(tempTok, grand_dict)
-
-            # Answering q4
-            fragless = removeFragment(link)
-            domain = findDomains(fragless.netloc)
-            if domain[1] == 'ics':
-                if domain[0] not in ics:
-                    ics[domain[0]] = urlData(link, domain[0], domain[1])
-                else:
-                    if fragless not in ics[domain[0]].getUniques():
-                        ics[domain[0]].appendUnique(fragless)
         else:
             invalid_links.write("From: " + url + "\n")
             invalid_links.write(link + "\n")
 
+
+    # Needs to be moved
     # creating text file that includes the number of unique links
     f = open("q1.txt", "w")
-    f.write("Number of unique pages: {length}".format(length = len(uniques)))
+    f.write("Number of unique pages: {length}\n".format(length = len(uniques)))
     f.close()
 
     # creating text file for question 2
@@ -102,7 +50,8 @@ def scraper(url, resp):
         if i == 50:
             break
         else:
-            f.write(k, ':', v)
+            f.write(k, ':', v, '\n')
             i += 1
     f.close()
+
     # creating text file for question 4
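
A related review note: file.write() takes exactly one string argument, so the old form f.write(k, ':', v) and the new form f.write(k, ':', v, '\n') both raise a TypeError at runtime. A minimal sketch of a working version follows; the file name and the k/v values are placeholders, not taken from the repo.

# Hypothetical demo of writing "word : count" lines; "demo.txt" is a placeholder.
with open("demo.txt", "w") as f:
    k, v = "computer", 42
    # f.write(k, ':', v, '\n')         # TypeError: write() takes one argument
    f.write("{} : {}\n".format(k, v))  # build a single string instead
    print(k, ':', v, file=f)           # or let print() join and add the newline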