From 8b96a7c9f79c2c4717f0e924a0e549cef3949ef6 Mon Sep 17 00:00:00 2001 From: Hieuhuy Pham Date: Thu, 21 Apr 2022 21:08:23 -0700 Subject: [PATCH] More refinement of frontier and worker for delicious multi-threading --- spacetime-crawler4py-master/config.ini | 6 +- .../crawler/frontier.py | 90 ++++--------------- spacetime-crawler4py-master/crawler/worker.py | 9 +- 3 files changed, 26 insertions(+), 79 deletions(-) diff --git a/spacetime-crawler4py-master/config.ini b/spacetime-crawler4py-master/config.ini index 1fdf095..2a0e135 100644 --- a/spacetime-crawler4py-master/config.ini +++ b/spacetime-crawler4py-master/config.ini @@ -1,6 +1,6 @@ [IDENTIFICATION] # Set your user agent string here. -USERAGENT = IR US22 19854690,44333574 +USERAGENT = IR US22 19854690,44333574,95241547 [CONNECTION] HOST = styx.ics.uci.edu @@ -9,12 +9,12 @@ PORT = 9000 [CRAWLER] SEEDURL = https://www.ics.uci.edu,https://www.cs.uci.edu,https://www.informatics.uci.edu,https://www.stat.uci.edu # In seconds -POLITENESS = 0.05 +POLITENESS = 0.5 [LOCAL PROPERTIES] # Save file for progress SAVE = frontier.shelve # IMPORTANT: DO NOT CHANGE IT IF YOU HAVE NOT IMPLEMENTED MULTITHREADING. -THREADCOUNT = 8 +THREADCOUNT = 1 diff --git a/spacetime-crawler4py-master/crawler/frontier.py b/spacetime-crawler4py-master/crawler/frontier.py index 2106d21..7d4f994 100644 --- a/spacetime-crawler4py-master/crawler/frontier.py +++ b/spacetime-crawler4py-master/crawler/frontier.py @@ -1,7 +1,7 @@ import os import shelve -from threading import Thread, RLock +from threading import Thread, Lock,Semaphore from queue import Queue, Empty from utils import get_logger, get_urlhash, normalize @@ -15,12 +15,14 @@ from datacollection import * #*.stat.uci.edu/* 3 #today.uci.edu/department/information_computer_sciences/* 4 +#Semaphore for each domain to keep each domain noice and tidy with politeness domain_semaphores = [Semaphore(3),Semaphore(3),Semaphore(3),Semaphore(3),Semaphore(3)] data_mutex = Lock() + file_1_mutex = Lock() file_2_mutex = Lock() file_3_mutex = Lock() -file_4_mutex = LocK() +file_4_mutex = Lock() class Frontier(object): def __init__(self, config, restart): @@ -112,73 +114,7 @@ class Frontier(object): - # Q1 - ###CRITICAL SECTION - file_1_mutex.acquire() - self.uniques.add(removeFragment(url)) - - #Writing to local file - f = open("q1.txt", "w") - f.write("Number of unique pages: {length}\n".format(length = len(self.uniques))) - f.close() - - file_1_mutex.release() - - # Q2 - file_2_mutex.acquire() - tempTok = tokenize(url) - if len(tempTok) > self.max: - self.max = len(tempTok) - self.longest = url - - - # creating text file for question 2 - f = open("q2.txt", "w") - f.write("Largest page url: {url} \nLength of page: {length}".format(url = self.longest, length = self.max)) - f.close() - - file_2_mutex.release() - - # Q3 - file_3_mutex.acquire() - tempTok = removeStopWords(tempTok) - computeFrequencies(tempTok, self.grand_dict) - - # creating text file for question 3 - f = open("q3.txt", "w") - sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)} - i = 0 - for k, v in sortedGrandDict.items(): - if i == 50: - break - else: - f.write("{}: {}\n".format(k, v)) - i += 1 - f.close() - - file_3_mutex.release() - - # Q4 - file_4_mutex.acquire() - - fragless = removeFragment(url) - domain = findDomains(fragless.netloc) - if domain[1] == 'ics': - if domain[0] not in self.ics: - self.ics[domain[0]] = urlData(url, domain[0], domain[1]) - else: - if fragless not in self.ics[domain[0]].getUniques(): - self.ics[domain[0]].appendUnique(fragless) - - # creating text file for question 4 - sortedDictKeys = sorted(self.ics.keys()) - f = open("q4.txt", "w") - for i in sortedDictKeys: - f.write("{url}, {num}".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques()))) - f.close() - - file_4_mutex.release() - + def acquire_polite(url): return domain_semaphores[get_semaphore_index(url)].acquire() @@ -206,6 +142,8 @@ class Frontier(object): my_filename = os.path.join(path_to_script, "q1.txt") # Will create a file of all the unique links and you can read the file and do lines = f.readlines() then len(lines) to get the number of unique links + #Locking and releasing each file + file_1_mutex.acquire() if (os.path.exists(my_filename)): f = open(my_filename, 'a') f.write(removeFragment(url)) @@ -214,10 +152,12 @@ class Frontier(object): f = open(my_filename, 'w') f.write(removeFragment(url)) f.close() - + file_1_mutex.release() + def q234(self, url, resp): # rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory # this saves to the local directory, so I can constantly access the right file and check if it exists or not + file_2_mutex.acquire() path_to_script = os.path.dirname(os.path.abspath(__file__)) my_filename = os.path.join(path_to_script, "q2.txt") @@ -228,7 +168,8 @@ class Frontier(object): f = open(my_filename, 'w') f.write("Longest Page: {url}, length: {length}".format(url = self.longest, length = self.max)) f.close() - + file_2_mutex.release() + tempTok = removeStopWords(tempTok) computeFrequencies(tempTok, self.grand_dict) @@ -237,6 +178,8 @@ class Frontier(object): path_to_script = os.path.dirname(os.path.abspath(__file__)) my_filename = os.path.join(path_to_script, "q3.txt") + file_3_mutex.acquire() + f = open(my_filename, "w") sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)} i = 0 @@ -248,6 +191,8 @@ class Frontier(object): i += 1 f.close() + file_3_mutex.release() + fragless = removeFragment(url) domain = findDomains(fragless.netloc) if domain[1] == 'ics': @@ -263,10 +208,11 @@ class Frontier(object): my_filename = os.path.join(path_to_script, "q4.txt") # creating text file for question 4 + file_4_mutex.acquire() sortedDictKeys = sorted(self.ics.keys()) f = open(my_filename, "w") for i in sortedDictKeys: f.write("{url}, {num}".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques()))) f.close() - + file_4_mutex.release() \ No newline at end of file diff --git a/spacetime-crawler4py-master/crawler/worker.py b/spacetime-crawler4py-master/crawler/worker.py index a64120a..ea81359 100644 --- a/spacetime-crawler4py-master/crawler/worker.py +++ b/spacetime-crawler4py-master/crawler/worker.py @@ -35,16 +35,17 @@ class Worker(Thread): tic = time.perf_counter() self.frontier.q1(tbd_url) toc = time.perf_counter() - print(f"Took {toc - tic:0.4f} seconds to do download url") + print(f"Took {toc - tic:0.4f} seconds to do log q1 url") tic = time.perf_counter() self.frontier.q234(tbd_url, resp) toc = time.perf_counter() - print(f"Took {toc - tic:0.4f} seconds to do download url") + print(f"Took {toc - tic:0.4f} seconds to do log q234 url") self.logger.info( f"Downloaded {tbd_url}, status <{resp.status}>, " f"using cache {self.config.cache_server}.") + tic = time.perf_counter() scraped_urls = scraper.scraper(tbd_url, resp) toc = time.perf_counter() @@ -57,7 +58,7 @@ class Worker(Thread): toc = time.perf_counter() print(f"Took {toc - tic:0.4f} seconds to do store stuffs") - while(start + self.config.time_delay > time.perf_counter()){ + while start + self.config.time_delay > time.perf_counter(): time.sleep(self.config.time_delay/5) self.frontier.release_polite(tbd_url) - } +