(andy) first move recent discussed issue

2022-04-21 20:31:38 -07:00
parent 320fe26c23
commit 754d3b4af6
3 changed files with 88 additions and 6 deletions
--- a/spacetime-crawler4py-master/crawler/frontier.py
+++ b/spacetime-crawler4py-master/crawler/frontier.py
@@ -111,6 +111,7 @@ class Frontier(object):
    


+        
        # Q1
        ###CRITICAL SECTION
        file_1_mutex.acquire()
@@ -198,3 +199,75 @@ class Frontier(object):
            return 4
        else:
            println("ERROR")
+
+    def q1(self, url):
+        # rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
+        #       Build the path to q1.txt from this script's own directory, so every call targets the same file and its existence can be checked.
+        path_to_script = os.path.dirname(os.path.abspath(__file__))
+        my_filename = os.path.join(path_to_script, "q1.txt")
+        
+        # Record removeFragment(url) in q1.txt. NOTE(review): no newline is written after an entry, so counting links with len(f.readlines()) will not work as-is; nothing here de-duplicates either -- confirm.
+        if (os.path.exists(my_filename)):
+            f = open(my_filename, 'a')  # file already exists: append to it
+            f.write(removeFragment(url))  # NOTE(review): q234 reads .netloc from removeFragment's result, so it may not be a str as f.write() requires -- confirm
+            f.close()
+        else:
+            f = open(my_filename, 'w')  # file does not exist yet: create it ('a' would also create it, so this branch looks redundant)
+            f.write(removeFragment(url))
+            f.close()
+        
+    def q234(self, url, resp):
+        # rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
+        #       Q2 (longest page): build the path to q2.txt from this script's own directory so every call targets the same file.
+        path_to_script = os.path.dirname(os.path.abspath(__file__))
+        my_filename = os.path.join(path_to_script, "q2.txt")
+
+        tempTok = tokenize(resp)  # tokenize() in datacollection.py parses resp.raw_response.content
+        if len(tempTok) > self.max:  # new record token count: remember it and overwrite q2.txt
+            self.max = len(tempTok)
+            self.longest = url
+            f = open(my_filename, 'w')
+            f.write("Longest Page: {url}, length: {length}".format(url = self.longest, length = self.max))
+            f.close()
+
+        tempTok = removeStopWords(tempTok)
+        computeFrequencies(tempTok, self.grand_dict)  # presumably accumulates this page's token counts into self.grand_dict -- helper not visible here, confirm
+
+        # rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
+        #       Q3 (most common words): q3.txt lives next to this script and is rewritten from scratch on every call.
+        path_to_script = os.path.dirname(os.path.abspath(__file__))
+        my_filename = os.path.join(path_to_script, "q3.txt")
+
+        f = open(my_filename, "w")
+        sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)}  # entries ordered by value, largest first
+        i = 0
+        for k, v in sortedGrandDict.items():
+            if i == 50:  # stop after 50 entries have been written
+                break
+            else:
+                f.write("{}: {}\n".format(k, v))
+                i += 1
+        f.close()
+
+        fragless = removeFragment(url)
+        domain = findDomains(fragless.netloc)  # a 2-tuple; findDomains can return (None, None), in which case the check below is simply False
+        if domain[1] == 'ics':
+            if domain[0] not in self.ics:  # first page seen for this subdomain key
+                self.ics[domain[0]] = urlData(url, domain[0], domain[1])  # NOTE(review): appendUnique is not called on this path; whether urlData records the first URL itself is not visible here -- confirm
+            else:
+                if fragless not in self.ics[domain[0]].getUniques():
+                    self.ics[domain[0]].appendUnique(fragless)
+        
+        # rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
+        #       Q4 (ics subdomains): q4.txt lives next to this script and is rewritten from scratch on every call.
+        path_to_script = os.path.dirname(os.path.abspath(__file__))
+        my_filename = os.path.join(path_to_script, "q4.txt")
+
+        # Write one "link, unique-count" entry per key of self.ics, in sorted key order. NOTE(review): the format string has no trailing newline, so entries run together -- confirm intent.
+        sortedDictKeys = sorted(self.ics.keys())
+        f = open(my_filename, "w")
+        for i in sortedDictKeys:
+            f.write("{url}, {num}".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
+        f.close()
+
+        
--- a/spacetime-crawler4py-master/crawler/worker.py
+++ b/spacetime-crawler4py-master/crawler/worker.py
@@ -29,6 +29,17 @@ class Worker(Thread):
            resp = download(tbd_url, self.config, self.logger)
            toc = time.perf_counter()
            print(f"Took {toc - tic:0.4f} seconds to do download url")
+
+            tic = time.perf_counter()
+            self.frontier.q1(tbd_url)
+            toc = time.perf_counter()
+            print(f"Took {toc - tic:0.4f} seconds to do download url")
+            
+            tic = time.perf_counter()
+            self.frontier.q234(tbd_url, resp)
+            toc = time.perf_counter()
+            print(f"Took {toc - tic:0.4f} seconds to do download url")
+
            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")
--- a/spacetime-crawler4py-master/datacollection.py
+++ b/spacetime-crawler4py-master/datacollection.py
@@ -1,5 +1,5 @@
 import re
-
+import os
 import urllib.request
 from urllib.parse import urlparse
 from urllib.parse import urljoin
@@ -80,16 +80,14 @@ def findDomains(url):
                    return urlsplit[i-1], urlsplit[i] #something like random.vision.ics.uci.edu will be consider a unique page of vision
        return None, None

-def tokenize(url):
+def tokenize(resp):
    # getting connection from url
-    page = urllib.request.urlopen(url)
-    data = page.read()
    valid = re.compile(r'[^a-zA-Z0-9]+')
    # named it tSoup for merge convience
    # need the 'lxml' parser for this.
    #       When extract_next_links is called it returns a list full of links with no resp, and I had to find a way to get text from just link.
    #       Therefore, I decided to get the plain text this way.
-    tSoup = BeautifulSoup(data, 'lxml')
+    tSoup = BeautifulSoup(resp.raw_response.content, 'lxml')

    # Floyd (1 March 2021) Stackoverflow. https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
    #       compared this with tSoup.get_text() and clean_text just provided content easier to tokenize and more inline with my intentions