More locks and semaphore refinement
This commit is contained in:
parent
320fe26c23
commit
9301bd5ebe
@ -9,12 +9,12 @@ PORT = 9000
|
|||||||
[CRAWLER]
|
[CRAWLER]
|
||||||
SEEDURL = https://www.ics.uci.edu,https://www.cs.uci.edu,https://www.informatics.uci.edu,https://www.stat.uci.edu
|
SEEDURL = https://www.ics.uci.edu,https://www.cs.uci.edu,https://www.informatics.uci.edu,https://www.stat.uci.edu
|
||||||
# In seconds
|
# In seconds
|
||||||
POLITENESS = 0.5
|
POLITENESS = 0.05
|
||||||
|
|
||||||
[LOCAL PROPERTIES]
|
[LOCAL PROPERTIES]
|
||||||
# Save file for progress
|
# Save file for progress
|
||||||
SAVE = frontier.shelve
|
SAVE = frontier.shelve
|
||||||
|
|
||||||
# IMPORTANT: DO NOT CHANGE IT IF YOU HAVE NOT IMPLEMENTED MULTITHREADING.
|
# IMPORTANT: DO NOT CHANGE IT IF YOU HAVE NOT IMPLEMENTED MULTITHREADING.
|
||||||
THREADCOUNT = 1
|
THREADCOUNT = 8
|
||||||
|
|
||||||
|
@ -179,11 +179,10 @@ class Frontier(object):
|
|||||||
file_4_mutex.release()
|
file_4_mutex.release()
|
||||||
|
|
||||||
def acquire_polite(url):
    """Block until the politeness semaphore covering *url*'s domain is free.

    Looks up the domain's semaphore via get_semaphore_index() and
    acquires it; returns the result of Semaphore.acquire().
    """
    semaphore = domain_semaphores[get_semaphore_index(url)]
    return semaphore.acquire()
|
|
||||||
|
|
||||||
def release_polite(domain):
    """Release the politeness semaphore covering *domain*'s host group.

    Counterpart to acquire_polite(); the caller passes the same URL
    string it acquired with (see Worker.run, which calls both with
    tbd_url), so the parameter name is historical — TODO: confirm and
    consider renaming to `url` for clarity.
    """
    # Bug fix: the body previously referenced an undefined name `url`,
    # which raised NameError on every call. Use the parameter instead.
    return domain_semaphores[get_semaphore_index(domain)].release()
|
||||||
|
|
||||||
def get_semaphore_index(url):
|
def get_semaphore_index(url):
|
||||||
if "ics.uci.edu" in url:
|
if "ics.uci.edu" in url:
|
||||||
|
@ -18,6 +18,7 @@ class Worker(Thread):
|
|||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
while True:
|
while True:
|
||||||
|
start = time.perf_counter()
|
||||||
tic = time.perf_counter()
|
tic = time.perf_counter()
|
||||||
tbd_url = self.frontier.get_tbd_url()
|
tbd_url = self.frontier.get_tbd_url()
|
||||||
toc = time.perf_counter()
|
toc = time.perf_counter()
|
||||||
@ -25,10 +26,12 @@ class Worker(Thread):
|
|||||||
if not tbd_url:
|
if not tbd_url:
|
||||||
self.logger.info("Frontier is empty. Stopping Crawler.")
|
self.logger.info("Frontier is empty. Stopping Crawler.")
|
||||||
break
|
break
|
||||||
|
self.frontier.acquire_polite(tbd_url)
|
||||||
tic = time.perf_counter()
|
tic = time.perf_counter()
|
||||||
resp = download(tbd_url, self.config, self.logger)
|
resp = download(tbd_url, self.config, self.logger)
|
||||||
toc = time.perf_counter()
|
toc = time.perf_counter()
|
||||||
print(f"Took {toc - tic:0.4f} seconds to do download url")
|
print(f"Took {toc - tic:0.4f} seconds to do download url")
|
||||||
|
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
f"Downloaded {tbd_url}, status <{resp.status}>, "
|
f"Downloaded {tbd_url}, status <{resp.status}>, "
|
||||||
f"using cache {self.config.cache_server}.")
|
f"using cache {self.config.cache_server}.")
|
||||||
@ -36,11 +39,15 @@ class Worker(Thread):
|
|||||||
scraped_urls = scraper.scraper(tbd_url, resp)
|
scraped_urls = scraper.scraper(tbd_url, resp)
|
||||||
toc = time.perf_counter()
|
toc = time.perf_counter()
|
||||||
print(f"Took {toc - tic:0.4f} seconds to do scrape url")
|
print(f"Took {toc - tic:0.4f} seconds to do scrape url")
|
||||||
tic = time.perf_counter()
|
|
||||||
|
|
||||||
|
tic = time.perf_counter()
|
||||||
for scraped_url in scraped_urls:
|
for scraped_url in scraped_urls:
|
||||||
self.frontier.add_url(scraped_url)
|
self.frontier.add_url(scraped_url)
|
||||||
self.frontier.mark_url_complete(tbd_url)
|
self.frontier.mark_url_complete(tbd_url)
|
||||||
toc = time.perf_counter()
|
toc = time.perf_counter()
|
||||||
print(f"Took {toc - tic:0.4f} seconds to do store stuffs")
|
print(f"Took {toc - tic:0.4f} seconds to do store stuffs")
|
||||||
time.sleep(self.config.time_delay)
|
|
||||||
|
while(start + self.config.time_delay > time.perf_counter()){
|
||||||
|
time.sleep(self.config.time_delay/5)
|
||||||
|
self.frontier.release_polite(tbd_url)
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user