More locks and semaphore refinement

This commit is contained in:
Hieuhuy Pham 2022-04-21 20:41:25 -07:00
parent 320fe26c23
commit 9301bd5ebe
3 changed files with 13 additions and 7 deletions

View File

@ -9,12 +9,12 @@ PORT = 9000
[CRAWLER] [CRAWLER]
SEEDURL = https://www.ics.uci.edu,https://www.cs.uci.edu,https://www.informatics.uci.edu,https://www.stat.uci.edu SEEDURL = https://www.ics.uci.edu,https://www.cs.uci.edu,https://www.informatics.uci.edu,https://www.stat.uci.edu
# In seconds # In seconds
POLITENESS = 0.5 POLITENESS = 0.05
[LOCAL PROPERTIES] [LOCAL PROPERTIES]
# Save file for progress # Save file for progress
SAVE = frontier.shelve SAVE = frontier.shelve
# IMPORTANT: DO NOT CHANGE IT IF YOU HAVE NOT IMPLEMENTED MULTITHREADING. # IMPORTANT: DO NOT CHANGE IT IF YOU HAVE NOT IMPLEMENTED MULTITHREADING.
THREADCOUNT = 1 THREADCOUNT = 8

View File

@ -179,11 +179,10 @@ class Frontier(object):
file_4_mutex.release() file_4_mutex.release()
def acquire_polite(url): def acquire_polite(url):
return domain_semaphores[get_semaphore_index(url)].acquire()
pass;
def release_polite(domain): def release_polite(domain):
pass; return domain_semaphores[get_semaphore_index(url)].release()
def get_semaphore_index(url): def get_semaphore_index(url):
if "ics.uci.edu" in url: if "ics.uci.edu" in url:

View File

@ -18,6 +18,7 @@ class Worker(Thread):
def run(self): def run(self):
while True: while True:
start = time.perf_counter()
tic = time.perf_counter() tic = time.perf_counter()
tbd_url = self.frontier.get_tbd_url() tbd_url = self.frontier.get_tbd_url()
toc = time.perf_counter() toc = time.perf_counter()
@ -25,10 +26,12 @@ class Worker(Thread):
if not tbd_url: if not tbd_url:
self.logger.info("Frontier is empty. Stopping Crawler.") self.logger.info("Frontier is empty. Stopping Crawler.")
break break
self.frontier.acquire_polite(tbd_url)
tic = time.perf_counter() tic = time.perf_counter()
resp = download(tbd_url, self.config, self.logger) resp = download(tbd_url, self.config, self.logger)
toc = time.perf_counter() toc = time.perf_counter()
print(f"Took {toc - tic:0.4f} seconds to do download url") print(f"Took {toc - tic:0.4f} seconds to do download url")
self.logger.info( self.logger.info(
f"Downloaded {tbd_url}, status <{resp.status}>, " f"Downloaded {tbd_url}, status <{resp.status}>, "
f"using cache {self.config.cache_server}.") f"using cache {self.config.cache_server}.")
@ -36,11 +39,15 @@ class Worker(Thread):
scraped_urls = scraper.scraper(tbd_url, resp) scraped_urls = scraper.scraper(tbd_url, resp)
toc = time.perf_counter() toc = time.perf_counter()
print(f"Took {toc - tic:0.4f} seconds to do scrape url") print(f"Took {toc - tic:0.4f} seconds to do scrape url")
tic = time.perf_counter()
tic = time.perf_counter()
for scraped_url in scraped_urls: for scraped_url in scraped_urls:
self.frontier.add_url(scraped_url) self.frontier.add_url(scraped_url)
self.frontier.mark_url_complete(tbd_url) self.frontier.mark_url_complete(tbd_url)
toc = time.perf_counter() toc = time.perf_counter()
print(f"Took {toc - tic:0.4f} seconds to do store stuffs") print(f"Took {toc - tic:0.4f} seconds to do store stuffs")
time.sleep(self.config.time_delay)
while(start + self.config.time_delay > time.perf_counter()){
time.sleep(self.config.time_delay/5)
self.frontier.release_polite(tbd_url)
}