Locks no longer race and multi-threading works; changed how some information is stored so it is more readable; added some new regex, but it will need to be trimmed later because it does not do its job
This commit is contained in:
parent
74063e5d00
commit
c1b7a50460
@ -91,3 +91,52 @@
|
|||||||
2022-04-23 01:59:57,446 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
2022-04-23 01:59:57,446 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
2022-04-23 02:02:46,431 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
2022-04-23 02:02:46,431 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
2022-04-23 02:05:59,557 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
2022-04-23 02:05:59,557 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:26:02,713 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:26:15,186 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:26:45,445 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:27:24,255 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:31:59,791 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:32:26,864 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:35:18,046 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:37:12,709 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:37:48,356 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:38:16,370 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:38:22,050 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:38:50,914 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:39:41,890 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:41:44,405 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:43:16,946 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:44:33,013 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:44:54,848 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:46:31,871 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:51:57,008 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:52:42,659 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:54:20,296 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:57:49,247 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 16:59:12,978 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:00:10,268 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:00:41,805 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:01:46,542 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:03:07,751 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:04:06,325 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:06:00,643 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:06:09,928 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:06:50,980 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:07:03,781 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:07:48,403 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:08:32,837 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:10:06,168 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:10:56,162 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:12:04,126 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:13:56,449 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:14:32,348 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:15:10,188 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:15:18,099 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:15:28,945 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:23:44,222 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:24:20,095 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:24:58,182 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:25:29,482 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:25:43,095 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:58:37,549 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
2022-04-23 17:58:48,116 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -7,7 +7,8 @@ HOST = styx.ics.uci.edu
|
|||||||
PORT = 9000
|
PORT = 9000
|
||||||
|
|
||||||
[CRAWLER]
|
[CRAWLER]
|
||||||
SEEDURL = https://www.ics.uci.edu,https://www.cs.uci.edu,https://www.informatics.uci.edu,https://www.stat.uci.edu
|
SEEDURL = https://www.ics.uci.edu,https://www.cs.uci.edu,https://www.informatics.uci.edu,https://www.stat.uci.edu,https://www.eecs.uci.edu
|
||||||
|
|
||||||
# In seconds
|
# In seconds
|
||||||
POLITENESS = 0.5
|
POLITENESS = 0.5
|
||||||
|
|
||||||
|
@ -25,7 +25,7 @@ class Frontier(object):
|
|||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
#Load balancer, list()
|
#Load balancer, list()
|
||||||
self.to_be_downloaded = [list(),list(),list(),list(),list()]
|
self.to_be_downloaded = [set(),set(),set(),set(),set()]
|
||||||
|
|
||||||
self.balance_index = 0
|
self.balance_index = 0
|
||||||
|
|
||||||
@ -82,7 +82,7 @@ class Frontier(object):
|
|||||||
tbd_count = 0
|
tbd_count = 0
|
||||||
for url, completed in self.save.values():
|
for url, completed in self.save.values():
|
||||||
if not completed and is_valid(url):
|
if not completed and is_valid(url):
|
||||||
self.to_be_downloaded[self.get_domain_index(url)].append(url)
|
self.to_be_downloaded[self.get_domain_index(url)].add(url)
|
||||||
tbd_count += 1
|
tbd_count += 1
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
f"Found {tbd_count} urls to be downloaded from {total_count} "
|
f"Found {tbd_count} urls to be downloaded from {total_count} "
|
||||||
@ -92,22 +92,24 @@ class Frontier(object):
|
|||||||
###CRITICAL SECTION
|
###CRITICAL SECTION
|
||||||
self.data_mutex.acquire()
|
self.data_mutex.acquire()
|
||||||
try:
|
try:
|
||||||
initial = self.balance_index
|
#Load balancing
|
||||||
print("Initial " + str(initial))
|
loop = 10
|
||||||
|
while not self.to_be_downloaded[self.balance_index] and loop != 0:
|
||||||
self.balance_index = self.balance_index + 1
|
self.balance_index = self.balance_index + 1
|
||||||
if self.balance_index > 4:
|
if self.balance_index > 4:
|
||||||
self.balance_index = 0
|
self.balance_index = 0
|
||||||
while not self.to_be_downloaded[self.balance_index]:
|
loop = loop - 1
|
||||||
self.balance_index = self.balance_index + 1
|
if loop == 0:
|
||||||
if self.balance_index > 4:
|
|
||||||
self.balance_index = 0
|
|
||||||
if self.balance_index == initial:
|
|
||||||
self.data_mutex.release()
|
self.data_mutex.release()
|
||||||
return None
|
return None
|
||||||
hold = self.to_be_downloaded[self.balance_index].pop()
|
hold = self.to_be_downloaded[self.balance_index].pop()
|
||||||
|
self.balance_index = self.balance_index + 1
|
||||||
self.data_mutex.release()
|
self.data_mutex.release()
|
||||||
|
#print(hold)
|
||||||
return hold
|
return hold
|
||||||
|
|
||||||
except IndexError:
|
except IndexError:
|
||||||
|
print("POPPING RANDOM SHIT BRO")
|
||||||
self.data_mutex.release()
|
self.data_mutex.release()
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -118,7 +120,7 @@ class Frontier(object):
|
|||||||
if urlhash not in self.save:
|
if urlhash not in self.save:
|
||||||
self.save[urlhash] = (url, False)
|
self.save[urlhash] = (url, False)
|
||||||
self.save.sync()
|
self.save.sync()
|
||||||
self.to_be_downloaded[self.get_domain_index(url)].append(url)
|
self.to_be_downloaded[self.get_domain_index(url)].add(url)
|
||||||
###CRITICAL SECTION
|
###CRITICAL SECTION
|
||||||
|
|
||||||
|
|
||||||
@ -136,11 +138,12 @@ class Frontier(object):
|
|||||||
|
|
||||||
|
|
||||||
def get_domain_index(self,url):
|
def get_domain_index(self,url):
|
||||||
if "ics.uci.edu" in url:
|
# Check "informatics.uci.edu" first: it contains "ics.uci.edu" as a substring, so testing "ics.uci.edu" first would put all informatics links into that bucket instead
|
||||||
|
if "informatics.uci.edu" in url:
|
||||||
return 0
|
return 0
|
||||||
elif "cs.uci.edu" in url:
|
elif "ics.uci.edu" in url:
|
||||||
return 1
|
return 1
|
||||||
elif "informatics.uci.edu" in url:
|
elif "cs.uci.edu" in url:
|
||||||
return 2
|
return 2
|
||||||
elif "stat.uci.edu" in url:
|
elif "stat.uci.edu" in url:
|
||||||
return 3
|
return 3
|
||||||
@ -216,7 +219,7 @@ class Frontier(object):
|
|||||||
self.file_2_mutex.release()
|
self.file_2_mutex.release()
|
||||||
|
|
||||||
toc = time.perf_counter()
|
toc = time.perf_counter()
|
||||||
print(f"Took {toc - tic:0.4f} seconds to save file 2 !")
|
#print(f"Took {toc - tic:0.4f} seconds to save file 2 !")
|
||||||
|
|
||||||
tic = time.perf_counter()
|
tic = time.perf_counter()
|
||||||
tempTok = removeStopWords(tempTok)
|
tempTok = removeStopWords(tempTok)
|
||||||
@ -241,7 +244,7 @@ class Frontier(object):
|
|||||||
self.file_3_mutex.release()
|
self.file_3_mutex.release()
|
||||||
|
|
||||||
toc = time.perf_counter()
|
toc = time.perf_counter()
|
||||||
print(f"Took {toc - tic:0.4f} seconds to save file 3 !")
|
#print(f"Took {toc - tic:0.4f} seconds to save file 3 !")
|
||||||
|
|
||||||
tic = time.perf_counter()
|
tic = time.perf_counter()
|
||||||
|
|
||||||
@ -264,10 +267,10 @@ class Frontier(object):
|
|||||||
sortedDictKeys = sorted(self.ics.keys())
|
sortedDictKeys = sorted(self.ics.keys())
|
||||||
f = open(my_filename, "w")
|
f = open(my_filename, "w")
|
||||||
for i in sortedDictKeys:
|
for i in sortedDictKeys:
|
||||||
f.write("{url}, {num}".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
|
f.write("{url}, {num} + \n".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
|
||||||
f.close()
|
f.close()
|
||||||
self.file_4_mutex.release()
|
self.file_4_mutex.release()
|
||||||
|
|
||||||
toc = time.perf_counter()
|
toc = time.perf_counter()
|
||||||
print(f"Took {toc - tic:0.4f} seconds to save file 4 !")
|
#print(f"Took {toc - tic:0.4f} seconds to save file 4 !")
|
||||||
|
|
@ -18,19 +18,19 @@ class Worker(Thread):
|
|||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
while True:
|
while True:
|
||||||
start = time.perf_counter()
|
|
||||||
tic = time.perf_counter()
|
tic = time.perf_counter()
|
||||||
tbd_url = self.frontier.get_tbd_url()
|
tbd_url = self.frontier.get_tbd_url()
|
||||||
toc = time.perf_counter()
|
toc = time.perf_counter()
|
||||||
print(f"Took {toc - tic:0.4f} seconds to get_tbd_url")
|
#print(f"Took {toc - tic:0.4f} seconds to get_tbd_url")
|
||||||
if not tbd_url:
|
if not tbd_url:
|
||||||
self.logger.info("Frontier is empty. Stopping Crawler.")
|
self.logger.info("Frontier is empty. Stopping Crawler.")
|
||||||
break
|
break
|
||||||
self.frontier.acquire_polite(tbd_url)
|
self.frontier.acquire_polite(tbd_url)
|
||||||
tic = time.perf_counter()
|
tic = time.perf_counter()
|
||||||
resp = download(tbd_url, self.config, self.logger)
|
resp = download(tbd_url, self.config, self.logger)
|
||||||
|
start = time.perf_counter()
|
||||||
toc = time.perf_counter()
|
toc = time.perf_counter()
|
||||||
print(f"Took {toc - tic:0.4f} seconds to do download url")
|
#print(f"Took {toc - tic:0.4f} seconds to do download url")
|
||||||
|
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
f"Downloaded {tbd_url}, status <{resp.status}>, "
|
f"Downloaded {tbd_url}, status <{resp.status}>, "
|
||||||
@ -39,32 +39,32 @@ class Worker(Thread):
|
|||||||
tic = time.perf_counter()
|
tic = time.perf_counter()
|
||||||
scraped_urls = scraper.scraper(tbd_url, resp)
|
scraped_urls = scraper.scraper(tbd_url, resp)
|
||||||
toc = time.perf_counter()
|
toc = time.perf_counter()
|
||||||
print(f"Took {toc - tic:0.4f} seconds to do scrape url")
|
#print(f"Took {toc - tic:0.4f} seconds to do scrape url")
|
||||||
|
|
||||||
tic = time.perf_counter()
|
tic = time.perf_counter()
|
||||||
print(self.frontier.acquire_data_mutex())
|
self.frontier.acquire_data_mutex()
|
||||||
for scraped_url in scraped_urls:
|
for scraped_url in scraped_urls:
|
||||||
self.frontier.add_url(scraped_url)
|
self.frontier.add_url(scraped_url)
|
||||||
self.frontier.mark_url_complete(tbd_url)
|
self.frontier.mark_url_complete(tbd_url)
|
||||||
self.frontier.release_data_mutex()
|
self.frontier.release_data_mutex()
|
||||||
toc = time.perf_counter()
|
toc = time.perf_counter()
|
||||||
print(f"Took {toc - tic:0.4f} seconds to do add_url stuffs")
|
#print(f"Took {toc - tic:0.4f} seconds to do add_url stuffs")
|
||||||
|
|
||||||
tic = time.perf_counter()
|
tic = time.perf_counter()
|
||||||
self.frontier.q1(tbd_url)
|
self.frontier.q1(tbd_url)
|
||||||
toc = time.perf_counter()
|
toc = time.perf_counter()
|
||||||
print(f"Took {toc - tic:0.4f} seconds to do log q1 url")
|
#print(f"Took {toc - tic:0.4f} seconds to do log q1 url")
|
||||||
|
|
||||||
tic = time.perf_counter()
|
tic = time.perf_counter()
|
||||||
self.frontier.q234(tbd_url, resp)
|
self.frontier.q234(tbd_url, resp)
|
||||||
toc = time.perf_counter()
|
toc = time.perf_counter()
|
||||||
print(f"Took {toc - tic:0.4f} seconds to do log q234 url")
|
#print(f"Took {toc - tic:0.4f} seconds to do log q234 url")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
while start + self.config.time_delay > time.perf_counter():
|
while start + self.config.time_delay > time.perf_counter():
|
||||||
print("Sleeping")
|
#print("Sleeping")
|
||||||
time.sleep(self.config.time_delay/5)
|
time.sleep(self.config.time_delay/5)
|
||||||
self.frontier.release_polite(tbd_url)
|
self.frontier.release_polite(tbd_url)
|
||||||
|
@ -15,22 +15,22 @@ def scraper(url, resp):
|
|||||||
|
|
||||||
|
|
||||||
links_valid = set()
|
links_valid = set()
|
||||||
#valid_links = open("valid_links.txt",'a')
|
valid_links = open("valid_links.txt",'a')
|
||||||
#invalid_links = open("invalid_links.txt",'a')
|
invalid_links = open("invalid_links.txt",'a')
|
||||||
|
|
||||||
tic = time.perf_counter()
|
tic = time.perf_counter()
|
||||||
|
|
||||||
for link in links:
|
for link in links:
|
||||||
if is_valid(link):
|
if is_valid(link):
|
||||||
links_valid.add(link)
|
links_valid.add(link)
|
||||||
#valid_links.write(link + "\n")
|
valid_links.write(link + "\n")
|
||||||
else:
|
else:
|
||||||
# invalid_links.write("From: " + url + "\n")
|
invalid_links.write("From: " + url + "\n")
|
||||||
#invalid_links.write(link + "\n")
|
invalid_links.write(link + "\n")
|
||||||
pass
|
pass
|
||||||
|
|
||||||
toc = time.perf_counter()
|
toc = time.perf_counter()
|
||||||
print(f"Took {toc - tic:0.4f} seconds to validate !!!")
|
#print(f"Took {toc - tic:0.4f} seconds to validate !!!")
|
||||||
|
|
||||||
return links_valid
|
return links_valid
|
||||||
|
|
||||||
@ -48,7 +48,7 @@ def extract_next_links(url, resp):
|
|||||||
if resp.status == 200:
|
if resp.status == 200:
|
||||||
#do stuff
|
#do stuff
|
||||||
soup = BeautifulSoup(resp.raw_response.content,'lxml')
|
soup = BeautifulSoup(resp.raw_response.content,'lxml')
|
||||||
#tempFile = open("test6.txt", 'a')
|
#tempFile = open("test.txt", 'a')
|
||||||
#Getting all the links, href = true means at least theres a href value, dont know what it is yet
|
#Getting all the links, href = true means at least theres a href value, dont know what it is yet
|
||||||
for link in soup.find_all('a', href=True):
|
for link in soup.find_all('a', href=True):
|
||||||
#There is a lot of relative paths stuff here gotta add them
|
#There is a lot of relative paths stuff here gotta add them
|
||||||
@ -64,6 +64,9 @@ def extract_next_links(url, resp):
|
|||||||
if(href_link.startswith("/")):
|
if(href_link.startswith("/")):
|
||||||
href_link = urljoin(url,href_link)
|
href_link = urljoin(url,href_link)
|
||||||
|
|
||||||
|
if(href_link.startswith("www.")):
|
||||||
|
href_link = "https://" + href_link
|
||||||
|
|
||||||
#skipping query with specific actions which mutate the websites and cause a trap
|
#skipping query with specific actions which mutate the websites and cause a trap
|
||||||
if "do=" in href_link:
|
if "do=" in href_link:
|
||||||
continue
|
continue
|
||||||
@ -127,10 +130,16 @@ def is_valid(url):
|
|||||||
return False
|
return False
|
||||||
elif parsed.fragment:
|
elif parsed.fragment:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# "ics.uci.edu" also matches physics.uci.edu (and likewise "cs.uci.edu" matches eecs.uci.edu), so we have to reject those explicitly
|
||||||
|
if re.match(
|
||||||
|
r".*physics.uci.edu/.*"
|
||||||
|
+ r"|.*eecs.uci.edu/.*",url) :
|
||||||
|
return False
|
||||||
# https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression
|
# https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression
|
||||||
# length check for looping filters and queries (could add a hash check for similarity or a regex, but we don't know if we want to, as this works well enough)
|
# length check for looping filters and queries (could add a hash check for similarity or a regex, but we don't know if we want to, as this works well enough)
|
||||||
# we can adjust it based on what the crawler does as well
|
# we can adjust it based on what the crawler does as well
|
||||||
if len(url) > 169:
|
if len(url) > 250:
|
||||||
return False
|
return False
|
||||||
# this fixes any search box that keeps going page to page; currently allows a depth of 2 filters
|
# this fixes any search box that keeps going page to page; currently allows a depth of 2 filters
|
||||||
if re.match(r".*(&filter%.*){3,}",url_parsed_path):
|
if re.match(r".*(&filter%.*){3,}",url_parsed_path):
|
||||||
|
Loading…
Reference in New Issue
Block a user