Change more syntax to get data collection working; change extracted URLs and validated links into sets instead of lists to significantly reduce duplicate URL extractions

Hieuhuy Pham 2022-04-20 04:03:58 -07:00
parent d0dde4a4db
commit 58d15918d5
4 changed files with 19 additions and 9 deletions

View File

@@ -26,3 +26,7 @@
 2022-04-20 03:45:40,958 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
 2022-04-20 03:49:54,245 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
 2022-04-20 03:50:02,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 03:59:18,104 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 04:00:14,031 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 04:01:31,386 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 04:02:16,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.

View File

@@ -28,3 +28,9 @@
 2022-04-20 03:46:32,554 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/from-theory-to-practice-ucis-machine-learning-hackathon-delivers, status <200>, using cache ('styx.ics.uci.edu', 9009).
 2022-04-20 03:49:54,351 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
 2022-04-20 03:50:02,145 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 03:52:31,224 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 03:59:18,220 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:00:14,134 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:01:31,499 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:02:16,153 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:02:35,264 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/stephan-mandt-and-collaborators-receive-3-5-million-to-study-machine-learning-for-climate-science, status <200>, using cache ('styx.ics.uci.edu', 9009).

View File

@@ -111,17 +111,17 @@ class Frontier(object):
         f = open("q1.txt", "w")
-        f.write("Number of unique pages: {length}\n".format(length = len(uniques)))
+        f.write("Number of unique pages: {length}\n".format(length = len(self.uniques)))
         f.close()
         # creating text file for question 2
         f = open("q2.txt", "w")
-        f.write("Largest page url: {url} \nLength of page: {length}".format(url = longest, length = max))
+        f.write("Largest page url: {url} \nLength of page: {length}".format(url = self.longest, length = self.max))
         f.close()
         # creating text file for question 3
         f = open("q3.txt", "w")
-        sortedGrandDict = {k: v for k, v in sorted(grand_dict.items(), key=lambda item: item[1], reverse = True)}
+        sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)}
         i = 0
         for k, v in sortedGrandDict.items():
             if i == 50:
@@ -132,10 +132,10 @@ class Frontier(object):
         f.close()
         # creating text file for question 4
-        sortedDictKeys = sorted(ics.keys())
+        sortedDictKeys = sorted(self.ics.keys())
         f = open("q4.txt", "w")
         for i in sortedDictKeys:
-            f.write("{url}, {num}".format(url = ics[i].getNiceLink(), num = len(ics[i].getUniques())))
+            f.write("{url}, {num}".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
         f.close()
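
The report-writing code above reads the crawl statistics from attributes on the Frontier instance rather than from local names, which is what the added self. prefixes fix. A minimal sketch of how those attributes might be initialized; the attribute names come from the diff, but the types, starting values, and constructor signature are assumptions, not the repo's actual code:

class Frontier(object):
    def __init__(self):
        self.uniques = set()      # unique page URLs seen so far (q1)
        self.longest = ""         # URL of the longest page found (q2)
        self.max = 0              # length of that longest page (q2)
        self.grand_dict = dict()  # token -> frequency across all crawled pages (q3)
        self.ics = dict()         # subdomain -> per-subdomain stats object (q4)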

View File

@@ -24,7 +24,7 @@ from datacollection import *
 def scraper(url, resp):
     links = extract_next_links(url, resp)
-    links_valid = list()
+    links_valid = set()
     #valid_links = open("valid_links.txt",'a')
     #invalid_links = open("invalid_links.txt",'a')
@@ -32,7 +32,7 @@ def scraper(url, resp):
     for link in links:
         tic = time.perf_counter()
         if is_valid(link):
-            links_valid.append(link)
+            links_valid.add(link)
         toc = time.perf_counter()
         print(f"Took {toc - tic:0.4f} seconds to do validate url")
         #valid_links.write(link + "\n")
@@ -83,7 +83,7 @@ def extract_next_links(url, resp):
     # resp.raw_response.url: the url, again
     # resp.raw_response.content: the content of the page!
     # Return a list with the hyperlinks (as strings) scrapped from resp.raw_response.content
-    pages = list()
+    pages = set()
     if resp.status == 200:
         #do stuff
         soup = BeautifulSoup(resp.raw_response.content)
@@ -115,7 +115,7 @@ def extract_next_links(url, resp):
             #tempFile.write(href_link + "\n")
             #Adding to the boi wonder pages
-            pages.append(href_link)
+            pages.add(href_link)
     else:
         print("Page error !")
     return pages
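
For context on the list -> set change in both scraper and extract_next_links: a set silently drops repeated hrefs as they are added, so each URL is validated and queued at most once per page instead of once per occurrence. A minimal standalone sketch (not code from the repo) showing the difference:

links_as_list = []
links_as_set = set()
for href_link in ["https://www.stat.uci.edu", "https://www.stat.uci.edu", "https://www.ics.uci.edu"]:
    links_as_list.append(href_link)  # list keeps both copies of the duplicate
    links_as_set.add(href_link)      # set stores the duplicate only once
print(len(links_as_list))  # 3
print(len(links_as_set))   # 2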