From 58d15918d5753caee1439b14166dda8fa16bd80d Mon Sep 17 00:00:00 2001
From: Hieuhuy Pham
Date: Wed, 20 Apr 2022 04:03:58 -0700
Subject: [PATCH] Change more syntax to get data collection working; collect
 extracted and validated links in sets instead of lists to significantly
 reduce duplicate url extractions

---
 spacetime-crawler4py-master/Logs/FRONTIER.log   |  4 ++++
 spacetime-crawler4py-master/Logs/Worker.log     |  6 ++++++
 spacetime-crawler4py-master/crawler/frontier.py | 10 +++++-----
 spacetime-crawler4py-master/scraper.py          |  8 ++++----
 4 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/spacetime-crawler4py-master/Logs/FRONTIER.log b/spacetime-crawler4py-master/Logs/FRONTIER.log
index 7a0fa0e..48ab184 100644
--- a/spacetime-crawler4py-master/Logs/FRONTIER.log
+++ b/spacetime-crawler4py-master/Logs/FRONTIER.log
@@ -26,3 +26,7 @@
 2022-04-20 03:45:40,958 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
 2022-04-20 03:49:54,245 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
 2022-04-20 03:50:02,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 03:59:18,104 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 04:00:14,031 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 04:01:31,386 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 04:02:16,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
diff --git a/spacetime-crawler4py-master/Logs/Worker.log b/spacetime-crawler4py-master/Logs/Worker.log
index f028367..c022b90 100644
--- a/spacetime-crawler4py-master/Logs/Worker.log
+++ b/spacetime-crawler4py-master/Logs/Worker.log
@@ -28,3 +28,9 @@
 2022-04-20 03:46:32,554 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/from-theory-to-practice-ucis-machine-learning-hackathon-delivers, status <200>, using cache ('styx.ics.uci.edu', 9009).
 2022-04-20 03:49:54,351 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
 2022-04-20 03:50:02,145 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 03:52:31,224 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 03:59:18,220 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:00:14,134 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:01:31,499 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:02:16,153 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:02:35,264 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/stephan-mandt-and-collaborators-receive-3-5-million-to-study-machine-learning-for-climate-science, status <200>, using cache ('styx.ics.uci.edu', 9009).
diff --git a/spacetime-crawler4py-master/crawler/frontier.py b/spacetime-crawler4py-master/crawler/frontier.py
index 2a95f03..162a732 100644
--- a/spacetime-crawler4py-master/crawler/frontier.py
+++ b/spacetime-crawler4py-master/crawler/frontier.py
@@ -111,17 +111,17 @@ class Frontier(object):
         f = open("q1.txt", "w")
-        f.write("Number of unique pages: {length}\n".format(length = len(uniques)))
+        f.write("Number of unique pages: {length}\n".format(length = len(self.uniques)))
         f.close()

         # creating text file for question 2
         f = open("q2.txt", "w")
-        f.write("Largest page url: {url} \nLength of page: {length}".format(url = longest, length = max))
+        f.write("Largest page url: {url} \nLength of page: {length}".format(url = self.longest, length = self.max))
         f.close()

         # creating text file for question 3
         f = open("q3.txt", "w")
-        sortedGrandDict = {k: v for k, v in sorted(grand_dict.items(), key=lambda item: item[1], reverse = True)}
+        sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)}
         i = 0
         for k, v in sortedGrandDict.items():
             if i == 50:
@@ -132,10 +132,10 @@ class Frontier(object):
         f.close()

         # creating text file for question 4
-        sortedDictKeys = sorted(ics.keys())
+        sortedDictKeys = sorted(self.ics.keys())
         f = open("q4.txt", "w")
         for i in sortedDictKeys:
-            f.write("{url}, {num}".format(url = ics[i].getNiceLink(), num = len(ics[i].getUniques())))
+            f.write("{url}, {num}".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
         f.close()
diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py
index a490616..d903864 100644
--- a/spacetime-crawler4py-master/scraper.py
+++ b/spacetime-crawler4py-master/scraper.py
@@ -24,7 +24,7 @@ from datacollection import *

 def scraper(url, resp):
     links = extract_next_links(url, resp)
-    links_valid = list()
+    links_valid = set()
     #valid_links = open("valid_links.txt",'a')
     #invalid_links = open("invalid_links.txt",'a')

@@ -32,7 +32,7 @@ def scraper(url, resp):
     for link in links:
         tic = time.perf_counter()
         if is_valid(link):
-            links_valid.append(link)
+            links_valid.add(link)
             toc = time.perf_counter()
             print(f"Took {toc - tic:0.4f} seconds to do validate url")
             #valid_links.write(link + "\n")
@@ -83,7 +83,7 @@ def extract_next_links(url, resp):
     # resp.raw_response.url: the url, again
     # resp.raw_response.content: the content of the page!
     # Return a list with the hyperlinks (as strings) scrapped from resp.raw_response.content
-    pages = list()
+    pages = set()
    if resp.status == 200:
         #do stuff
         soup = BeautifulSoup(resp.raw_response.content)
@@ -115,7 +115,7 @@ def extract_next_links(url, resp):
             #tempFile.write(href_link + "\n")

             #Adding to the boi wonder pages
-            pages.append(href_link)
+            pages.add(href_link)
     else:
         print("Page error !")
     return pages
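
Note: the list-to-set change above is the core of this patch. As a minimal
standalone sketch of the idea (using a hypothetical collect_links() helper and
made-up hrefs, standing in for the loop inside extract_next_links(), not the
project's actual code):

    # A set deduplicates at insertion time and gives O(1) membership tests,
    # so an href repeated across a page is stored only once.
    def collect_links(hrefs):
        pages = set()              # was: pages = list()
        for href_link in hrefs:
            pages.add(href_link)   # was: pages.append(href_link); duplicates collapse here
        return pages

    # Example: a page that links to the same url twice.
    hrefs = [
        "https://www.stat.uci.edu",
        "https://www.stat.uci.edu",
        "https://www.stat.uci.edu/some-page",
    ]
    assert len(collect_links(hrefs)) == 2  # the duplicate is stored once

Because scraper() also keeps links_valid as a set, duplicate hrefs never reach
is_valid() in the first place, which is where the reduction in url extraction
work comes from.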
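The frontier.py hunks fix a scoping bug rather than syntax style: uniques,
longest, max, grand_dict, and ics are instance attributes, so reading them as
bare names inside a method raises NameError. A minimal sketch of the failure
mode, using a hypothetical stripped-down Frontier (not the real class):

    class Frontier:
        def __init__(self):
            # assumed attribute, mirroring the patch
            self.uniques = {"https://www.stat.uci.edu"}

        def report(self):
            # len(uniques) here would raise NameError: the attribute lives
            # on the instance and must be read through self.
            return "Number of unique pages: {length}\n".format(
                length=len(self.uniques))

    print(Frontier().report(), end="")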