Added basic multi-threading, reader-first implementation
This commit is contained in:
parent
9fcd9cfd99
commit
320fe26c23
@ -30,3 +30,4 @@
|
||||
2022-04-20 04:00:14,031 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||
2022-04-20 04:01:31,386 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||
2022-04-20 04:02:16,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||
2022-04-20 18:08:59,911 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||
|
@ -34,3 +34,11 @@
|
||||
2022-04-20 04:01:31,499 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
|
||||
2022-04-20 04:02:16,153 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
|
||||
2022-04-20 04:02:35,264 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/stephan-mandt-and-collaborators-receive-3-5-million-to-study-machine-learning-for-climate-science, status <200>, using cache ('styx.ics.uci.edu', 9009).
|
||||
2022-04-20 18:09:00,035 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9002).
|
||||
2022-04-20 18:09:09,207 - Worker-0 - INFO - Downloaded https://mds.ics.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9002).
|
||||
2022-04-20 18:09:13,589 - Worker-0 - INFO - Downloaded https://mds.ics.uci.edu/events, status <200>, using cache ('styx.ics.uci.edu', 9002).
|
||||
2022-04-20 18:09:16,651 - Worker-0 - INFO - Downloaded https://www.statistics.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9002).
|
||||
2022-04-20 18:09:18,416 - Worker-0 - INFO - Downloaded https://www.statistics.uci.edu/seminar-series, status <200>, using cache ('styx.ics.uci.edu', 9002).
|
||||
2022-04-20 18:09:20,376 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/seminar-series-2020-2021, status <200>, using cache ('styx.ics.uci.edu', 9002).
|
||||
2022-04-20 18:09:22,307 - Worker-0 - INFO - Downloaded http://www.stat.uci.edu/seminar-series/seminar-series-2015-2016, status <200>, using cache ('styx.ics.uci.edu', 9002).
|
||||
2022-04-20 18:09:24,295 - Worker-0 - INFO - Downloaded http://www.stat.uci.edu/ICS/statistics/research/seminarseries/2011-2012/index, status <404>, using cache ('styx.ics.uci.edu', 9002).
|
||||
|
@ -8,6 +8,20 @@ from utils import get_logger, get_urlhash, normalize
|
||||
from scraper import is_valid
|
||||
from datacollection import *
|
||||
|
||||
|
||||
#*.ics.uci.edu/* 0
|
||||
#*.cs.uci.edu/* 1
|
||||
#*.informatics.uci.edu/* 2
|
||||
#*.stat.uci.edu/* 3
|
||||
#today.uci.edu/department/information_computer_sciences/* 4
|
||||
|
||||
domain_semaphores = [Semaphore(3),Semaphore(3),Semaphore(3),Semaphore(3),Semaphore(3)]
|
||||
data_mutex = Lock()
|
||||
file_1_mutex = Lock()
|
||||
file_2_mutex = Lock()
|
||||
file_3_mutex = Lock()
|
||||
file_4_mutex = LocK()
|
||||
|
||||
class Frontier(object):
|
||||
def __init__(self, config, restart):
|
||||
self.logger = get_logger("FRONTIER")
|
||||
@ -60,65 +74,75 @@ class Frontier(object):
|
||||
f"total urls discovered.")
|
||||
|
||||
def get_tbd_url(self):
|
||||
###CRITICAL SECTION
|
||||
data_mutex.acquire()
|
||||
try:
|
||||
return self.to_be_downloaded.pop()
|
||||
except IndexError:
|
||||
return None
|
||||
data_mutex.release()
|
||||
|
||||
def add_url(self, url):
|
||||
url = normalize(url)
|
||||
urlhash = get_urlhash(url)
|
||||
##CRITICAL SECTION
|
||||
data_mutex.acquire()
|
||||
if urlhash not in self.save:
|
||||
self.save[urlhash] = (url, False)
|
||||
self.save.sync()
|
||||
self.to_be_downloaded.append(url)
|
||||
|
||||
data_mutex.release()
|
||||
###CRITICAL SECTION
|
||||
|
||||
|
||||
def mark_url_complete(self, url):
|
||||
urlhash = get_urlhash(url)
|
||||
|
||||
##CRITICAL SECTION
|
||||
data_mutex.acquire()
|
||||
if urlhash not in self.save:
|
||||
# This should not happen.
|
||||
self.logger.error(
|
||||
f"Completed url {url}, but have not seen it before.")
|
||||
|
||||
self.save[urlhash] = (url, True)
|
||||
self.save.sync()
|
||||
data_mutex.release()
|
||||
##CRITICAL SECTION
|
||||
|
||||
|
||||
|
||||
# Q1
|
||||
###CRITICAL SECTION
|
||||
file_1_mutex.acquire()
|
||||
self.uniques.add(removeFragment(url))
|
||||
|
||||
#Writing to local file
|
||||
f = open("q1.txt", "w")
|
||||
f.write("Number of unique pages: {length}\n".format(length = len(self.uniques)))
|
||||
f.close()
|
||||
|
||||
file_1_mutex.release()
|
||||
|
||||
# Q2
|
||||
file_2_mutex.acquire()
|
||||
tempTok = tokenize(url)
|
||||
if len(tempTok) > self.max:
|
||||
self.max = len(tempTok)
|
||||
self.longest = url
|
||||
|
||||
# Q3
|
||||
tempTok = removeStopWords(tempTok)
|
||||
computeFrequencies(tempTok, self.grand_dict)
|
||||
|
||||
# Q4
|
||||
fragless = removeFragment(url)
|
||||
domain = findDomains(fragless.netloc)
|
||||
if domain[1] == 'ics':
|
||||
if domain[0] not in self.ics:
|
||||
self.ics[domain[0]] = urlData(url, domain[0], domain[1])
|
||||
else:
|
||||
if fragless not in self.ics[domain[0]].getUniques():
|
||||
self.ics[domain[0]].appendUnique(fragless)
|
||||
|
||||
|
||||
|
||||
f = open("q1.txt", "w")
|
||||
f.write("Number of unique pages: {length}\n".format(length = len(self.uniques)))
|
||||
f.close()
|
||||
|
||||
# creating text file for question 2
|
||||
f = open("q2.txt", "w")
|
||||
f.write("Largest page url: {url} \nLength of page: {length}".format(url = self.longest, length = self.max))
|
||||
f.close()
|
||||
|
||||
file_2_mutex.release()
|
||||
|
||||
# Q3
|
||||
file_3_mutex.acquire()
|
||||
tempTok = removeStopWords(tempTok)
|
||||
computeFrequencies(tempTok, self.grand_dict)
|
||||
|
||||
# creating text file for question 3
|
||||
f = open("q3.txt", "w")
|
||||
sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)}
|
||||
@ -131,6 +155,20 @@ class Frontier(object):
|
||||
i += 1
|
||||
f.close()
|
||||
|
||||
file_3_mutex.release()
|
||||
|
||||
# Q4
|
||||
file_4_mutex.acquire()
|
||||
|
||||
fragless = removeFragment(url)
|
||||
domain = findDomains(fragless.netloc)
|
||||
if domain[1] == 'ics':
|
||||
if domain[0] not in self.ics:
|
||||
self.ics[domain[0]] = urlData(url, domain[0], domain[1])
|
||||
else:
|
||||
if fragless not in self.ics[domain[0]].getUniques():
|
||||
self.ics[domain[0]].appendUnique(fragless)
|
||||
|
||||
# creating text file for question 4
|
||||
sortedDictKeys = sorted(self.ics.keys())
|
||||
f = open("q4.txt", "w")
|
||||
@ -138,6 +176,25 @@ class Frontier(object):
|
||||
f.write("{url}, {num}".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
|
||||
f.close()
|
||||
|
||||
file_4_mutex.release()
|
||||
|
||||
self.save[urlhash] = (url, True)
|
||||
self.save.sync()
|
||||
def acquire_polite(url):
|
||||
|
||||
pass;
|
||||
|
||||
def release_polite(domain):
|
||||
pass;
|
||||
|
||||
def get_semaphore_index(url):
|
||||
if "ics.uci.edu" in url:
|
||||
return 0
|
||||
elif "cs.uci.edu" in url:
|
||||
return 1
|
||||
elif "informatics.uci.edu" in url:
|
||||
return 2
|
||||
elif "stat.uci.edu" in url:
|
||||
return 3
|
||||
elif "today.uci.edu/department/information_computer_sciences/" in url:
|
||||
return 4
|
||||
else:
|
||||
println("ERROR")
|
Loading…
Reference in New Issue
Block a user