Fixes error in syntax for new merged code from data collection branch, fixed 'infinite loop', added timers to measure performance of functions.
This commit is contained in:
@@ -73,12 +73,24 @@ class Frontier(object):
|
||||
self.save.sync()
|
||||
self.to_be_downloaded.append(url)
|
||||
|
||||
|
||||
|
||||
def mark_url_complete(self, url):
|
||||
urlhash = get_urlhash(url)
|
||||
if urlhash not in self.save:
|
||||
# This should not happen.
|
||||
self.logger.error(
|
||||
f"Completed url {url}, but have not seen it before.")
|
||||
|
||||
|
||||
|
||||
|
||||
# Q1
|
||||
self.uniques.add(removeFragment(url))
|
||||
|
||||
# Q2
|
||||
tempTok = tokenize(url)
|
||||
if len(tempTok) > max:
|
||||
if len(tempTok) > self.max:
|
||||
self.max = len(tempTok)
|
||||
self.longest = url
|
||||
|
||||
@@ -97,12 +109,35 @@ class Frontier(object):
|
||||
self.ics[domain[0]].appendUnique(fragless)
|
||||
|
||||
|
||||
def mark_url_complete(self, url):
|
||||
urlhash = get_urlhash(url)
|
||||
if urlhash not in self.save:
|
||||
# This should not happen.
|
||||
self.logger.error(
|
||||
f"Completed url {url}, but have not seen it before.")
|
||||
|
||||
f = open("q1.txt", "w")
|
||||
f.write("Number of unique pages: {length}\n".format(length = len(uniques)))
|
||||
f.close()
|
||||
|
||||
# creating text file for question 2
|
||||
f = open("q2.txt", "w")
|
||||
f.write("Largest page url: {url} \nLength of page: {length}".format(url = longest, length = max))
|
||||
f.close()
|
||||
|
||||
# creating text file for question 3
|
||||
f = open("q3.txt", "w")
|
||||
sortedGrandDict = {k: v for k, v in sorted(grand_dict.items(), key=lambda item: item[1], reverse = True)}
|
||||
i = 0
|
||||
for k, v in sortedGrandDict.items():
|
||||
if i == 50:
|
||||
break
|
||||
else:
|
||||
f.write("{}: {}\n".format(k, v))
|
||||
i += 1
|
||||
f.close()
|
||||
|
||||
# creating text file for question 4
|
||||
sortedDictKeys = sorted(ics.keys())
|
||||
f = open("q4.txt", "w")
|
||||
for i in sortedDictKeys:
|
||||
f.write("{url}, {num}".format(url = ics[i].getNiceLink(), num = len(ics[i].getUniques())))
|
||||
f.close()
|
||||
|
||||
|
||||
self.save[urlhash] = (url, True)
|
||||
self.save.sync()
|
||||
|
@@ -18,16 +18,29 @@ class Worker(Thread):
|
||||
|
||||
def run(self):
|
||||
while True:
|
||||
tic = time.perf_counter()
|
||||
tbd_url = self.frontier.get_tbd_url()
|
||||
toc = time.perf_counter()
|
||||
print(f"Took {toc - tic:0.4f} seconds to get_tbd_url")
|
||||
if not tbd_url:
|
||||
self.logger.info("Frontier is empty. Stopping Crawler.")
|
||||
break
|
||||
tic = time.perf_counter()
|
||||
resp = download(tbd_url, self.config, self.logger)
|
||||
toc = time.perf_counter()
|
||||
print(f"Took {toc - tic:0.4f} seconds to do download url")
|
||||
self.logger.info(
|
||||
f"Downloaded {tbd_url}, status <{resp.status}>, "
|
||||
f"using cache {self.config.cache_server}.")
|
||||
tic = time.perf_counter()
|
||||
scraped_urls = scraper.scraper(tbd_url, resp)
|
||||
toc = time.perf_counter()
|
||||
print(f"Took {toc - tic:0.4f} seconds to do scrape url")
|
||||
tic = time.perf_counter()
|
||||
|
||||
for scraped_url in scraped_urls:
|
||||
self.frontier.add_url(scraped_url)
|
||||
self.frontier.mark_url_complete(tbd_url)
|
||||
toc = time.perf_counter()
|
||||
print(f"Took {toc - tic:0.4f} seconds to do store stuffs")
|
||||
time.sleep(self.config.time_delay)
|
||||
|
Reference in New Issue
Block a user