From 320fe26c2337b078ae9f47bff67fb5238ac58de8 Mon Sep 17 00:00:00 2001 From: Hieuhuy Pham Date: Thu, 21 Apr 2022 19:44:30 -0700 Subject: [PATCH] Added basic multi-threading, reader-first implementation --- spacetime-crawler4py-master/Logs/FRONTIER.log | 1 + spacetime-crawler4py-master/Logs/Worker.log | 8 ++ .../crawler/frontier.py | 103 ++++++++++++++---- 3 files changed, 89 insertions(+), 23 deletions(-) diff --git a/spacetime-crawler4py-master/Logs/FRONTIER.log b/spacetime-crawler4py-master/Logs/FRONTIER.log index 48ab184..24b8781 100644 --- a/spacetime-crawler4py-master/Logs/FRONTIER.log +++ b/spacetime-crawler4py-master/Logs/FRONTIER.log @@ -30,3 +30,4 @@ 2022-04-20 04:00:14,031 - FRONTIER - INFO - Found save file frontier.shelve, deleting it. 2022-04-20 04:01:31,386 - FRONTIER - INFO - Found save file frontier.shelve, deleting it. 2022-04-20 04:02:16,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it. +2022-04-20 18:08:59,911 - FRONTIER - INFO - Found save file frontier.shelve, deleting it. diff --git a/spacetime-crawler4py-master/Logs/Worker.log b/spacetime-crawler4py-master/Logs/Worker.log index c022b90..3817f18 100644 --- a/spacetime-crawler4py-master/Logs/Worker.log +++ b/spacetime-crawler4py-master/Logs/Worker.log @@ -34,3 +34,11 @@ 2022-04-20 04:01:31,499 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009). 2022-04-20 04:02:16,153 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009). 2022-04-20 04:02:35,264 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/stephan-mandt-and-collaborators-receive-3-5-million-to-study-machine-learning-for-climate-science, status <200>, using cache ('styx.ics.uci.edu', 9009). +2022-04-20 18:09:00,035 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9002). +2022-04-20 18:09:09,207 - Worker-0 - INFO - Downloaded https://mds.ics.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9002). +2022-04-20 18:09:13,589 - Worker-0 - INFO - Downloaded https://mds.ics.uci.edu/events, status <200>, using cache ('styx.ics.uci.edu', 9002). +2022-04-20 18:09:16,651 - Worker-0 - INFO - Downloaded https://www.statistics.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9002). +2022-04-20 18:09:18,416 - Worker-0 - INFO - Downloaded https://www.statistics.uci.edu/seminar-series, status <200>, using cache ('styx.ics.uci.edu', 9002). +2022-04-20 18:09:20,376 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/seminar-series-2020-2021, status <200>, using cache ('styx.ics.uci.edu', 9002). +2022-04-20 18:09:22,307 - Worker-0 - INFO - Downloaded http://www.stat.uci.edu/seminar-series/seminar-series-2015-2016, status <200>, using cache ('styx.ics.uci.edu', 9002). +2022-04-20 18:09:24,295 - Worker-0 - INFO - Downloaded http://www.stat.uci.edu/ICS/statistics/research/seminarseries/2011-2012/index, status <404>, using cache ('styx.ics.uci.edu', 9002). diff --git a/spacetime-crawler4py-master/crawler/frontier.py b/spacetime-crawler4py-master/crawler/frontier.py index 162a732..aadf695 100644 --- a/spacetime-crawler4py-master/crawler/frontier.py +++ b/spacetime-crawler4py-master/crawler/frontier.py @@ -8,6 +8,20 @@ from utils import get_logger, get_urlhash, normalize from scraper import is_valid from datacollection import * + +#*.ics.uci.edu/* 0 +#*.cs.uci.edu/* 1 +#*.informatics.uci.edu/* 2 +#*.stat.uci.edu/* 3 +#today.uci.edu/department/information_computer_sciences/* 4 + +domain_semaphores = [Semaphore(3),Semaphore(3),Semaphore(3),Semaphore(3),Semaphore(3)] +data_mutex = Lock() +file_1_mutex = Lock() +file_2_mutex = Lock() +file_3_mutex = Lock() +file_4_mutex = LocK() + class Frontier(object): def __init__(self, config, restart): self.logger = get_logger("FRONTIER") @@ -60,65 +74,75 @@ class Frontier(object): f"total urls discovered.") def get_tbd_url(self): + ###CRITICAL SECTION + data_mutex.acquire() try: return self.to_be_downloaded.pop() except IndexError: return None + data_mutex.release() def add_url(self, url): url = normalize(url) urlhash = get_urlhash(url) + ##CRITICAL SECTION + data_mutex.acquire() if urlhash not in self.save: self.save[urlhash] = (url, False) self.save.sync() self.to_be_downloaded.append(url) - + data_mutex.release() + ###CRITICAL SECTION def mark_url_complete(self, url): urlhash = get_urlhash(url) + + ##CRITICAL SECTION + data_mutex.acquire() if urlhash not in self.save: # This should not happen. self.logger.error( f"Completed url {url}, but have not seen it before.") - + self.save[urlhash] = (url, True) + self.save.sync() + data_mutex.release() + ##CRITICAL SECTION # Q1 + ###CRITICAL SECTION + file_1_mutex.acquire() self.uniques.add(removeFragment(url)) + #Writing to local file + f = open("q1.txt", "w") + f.write("Number of unique pages: {length}\n".format(length = len(self.uniques))) + f.close() + + file_1_mutex.release() + # Q2 + file_2_mutex.acquire() tempTok = tokenize(url) if len(tempTok) > self.max: self.max = len(tempTok) self.longest = url - # Q3 - tempTok = removeStopWords(tempTok) - computeFrequencies(tempTok, self.grand_dict) - - # Q4 - fragless = removeFragment(url) - domain = findDomains(fragless.netloc) - if domain[1] == 'ics': - if domain[0] not in self.ics: - self.ics[domain[0]] = urlData(url, domain[0], domain[1]) - else: - if fragless not in self.ics[domain[0]].getUniques(): - self.ics[domain[0]].appendUnique(fragless) - - - - f = open("q1.txt", "w") - f.write("Number of unique pages: {length}\n".format(length = len(self.uniques))) - f.close() # creating text file for question 2 f = open("q2.txt", "w") f.write("Largest page url: {url} \nLength of page: {length}".format(url = self.longest, length = self.max)) f.close() + file_2_mutex.release() + + # Q3 + file_3_mutex.acquire() + tempTok = removeStopWords(tempTok) + computeFrequencies(tempTok, self.grand_dict) + # creating text file for question 3 f = open("q3.txt", "w") sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)} @@ -131,6 +155,20 @@ class Frontier(object): i += 1 f.close() + file_3_mutex.release() + + # Q4 + file_4_mutex.acquire() + + fragless = removeFragment(url) + domain = findDomains(fragless.netloc) + if domain[1] == 'ics': + if domain[0] not in self.ics: + self.ics[domain[0]] = urlData(url, domain[0], domain[1]) + else: + if fragless not in self.ics[domain[0]].getUniques(): + self.ics[domain[0]].appendUnique(fragless) + # creating text file for question 4 sortedDictKeys = sorted(self.ics.keys()) f = open("q4.txt", "w") @@ -138,6 +176,25 @@ class Frontier(object): f.write("{url}, {num}".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques()))) f.close() + file_4_mutex.release() - self.save[urlhash] = (url, True) - self.save.sync() + def acquire_polite(url): + + pass; + + def release_polite(domain): + pass; + + def get_semaphore_index(url): + if "ics.uci.edu" in url: + return 0 + elif "cs.uci.edu" in url: + return 1 + elif "informatics.uci.edu" in url: + return 2 + elif "stat.uci.edu" in url: + return 3 + elif "today.uci.edu/department/information_computer_sciences/" in url: + return 4 + else: + println("ERROR") \ No newline at end of file