From 9301bd5ebe71ac097662ed50356aca38e4ffb659 Mon Sep 17 00:00:00 2001
From: Hieuhuy Pham
Date: Thu, 21 Apr 2022 20:41:25 -0700
Subject: [PATCH] More locks and semaphore refinement

---
 spacetime-crawler4py-master/config.ini          |  4 ++--
 spacetime-crawler4py-master/crawler/frontier.py |  5 ++---
 spacetime-crawler4py-master/crawler/worker.py   | 11 +++++++++--
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/spacetime-crawler4py-master/config.ini b/spacetime-crawler4py-master/config.ini
index 6e81dce..1fdf095 100644
--- a/spacetime-crawler4py-master/config.ini
+++ b/spacetime-crawler4py-master/config.ini
@@ -9,12 +9,12 @@ PORT = 9000
 [CRAWLER]
 SEEDURL = https://www.ics.uci.edu,https://www.cs.uci.edu,https://www.informatics.uci.edu,https://www.stat.uci.edu
 # In seconds
-POLITENESS = 0.5
+POLITENESS = 0.05
 
 [LOCAL PROPERTIES]
 # Save file for progress
 SAVE = frontier.shelve
 
 # IMPORTANT: DO NOT CHANGE IT IF YOU HAVE NOT IMPLEMENTED MULTITHREADING.
-THREADCOUNT = 1
+THREADCOUNT = 8
diff --git a/spacetime-crawler4py-master/crawler/frontier.py b/spacetime-crawler4py-master/crawler/frontier.py
index aadf695..a1fff94 100644
--- a/spacetime-crawler4py-master/crawler/frontier.py
+++ b/spacetime-crawler4py-master/crawler/frontier.py
@@ -179,11 +179,10 @@ class Frontier(object):
         file_4_mutex.release()
 
     def acquire_polite(url):
-
-        pass;
+        return domain_semaphores[get_semaphore_index(url)].acquire()
 
     def release_polite(domain):
-        pass;
+        return domain_semaphores[get_semaphore_index(domain)].release()
 
     def get_semaphore_index(url):
         if "ics.uci.edu" in url:
diff --git a/spacetime-crawler4py-master/crawler/worker.py b/spacetime-crawler4py-master/crawler/worker.py
index 8a514f1..42818a5 100644
--- a/spacetime-crawler4py-master/crawler/worker.py
+++ b/spacetime-crawler4py-master/crawler/worker.py
@@ -18,5 +18,6 @@ class Worker(Thread):
     def run(self):
         while True:
+            start = time.perf_counter()
             tic = time.perf_counter()
            tbd_url = self.frontier.get_tbd_url()
             toc = time.perf_counter()
@@ -25,10 +26,12 @@ class Worker(Thread):
             if not tbd_url:
                 self.logger.info("Frontier is empty. Stopping Crawler.")
                 break
+            self.frontier.acquire_polite(tbd_url)
             tic = time.perf_counter()
             resp = download(tbd_url, self.config, self.logger)
             toc = time.perf_counter()
             print(f"Took {toc - tic:0.4f} seconds to do download url")
+
             self.logger.info(
                 f"Downloaded {tbd_url}, status <{resp.status}>, "
                 f"using cache {self.config.cache_server}.")
@@ -36,10 +39,14 @@ class Worker(Thread):
             scraped_urls = scraper.scraper(tbd_url, resp)
             toc = time.perf_counter()
             print(f"Took {toc - tic:0.4f} seconds to do scrape url")
-            tic = time.perf_counter()
+            tic = time.perf_counter()
             for scraped_url in scraped_urls:
                 self.frontier.add_url(scraped_url)
             self.frontier.mark_url_complete(tbd_url)
             toc = time.perf_counter()
             print(f"Took {toc - tic:0.4f} seconds to do store stuffs")
-            time.sleep(self.config.time_delay)
+
+            # Nap until the politeness delay has elapsed, then release once.
+            while start + self.config.time_delay > time.perf_counter():
+                time.sleep(self.config.time_delay / 5)
+            self.frontier.release_polite(tbd_url)