commit e27b40f153
Author: iNocturnis
Date:   2022-04-20 17:50:44 -07:00

7 changed files with 162 additions and 55 deletions

View File

@@ -0,0 +1,32 @@
2022-04-20 02:33:03,517 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:34:22,952 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:34:41,682 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:37:03,708 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:38:32,080 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:47:05,419 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:47:20,616 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:50:20,425 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:58:18,494 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:06:07,345 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:07:45,102 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:08:46,162 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:11:05,040 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:11:21,977 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:12:18,315 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:15:53,501 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:18:48,477 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:20:00,913 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:23:14,356 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:36:00,708 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:37:24,758 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:37:24,758 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:37:42,260 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:42:16,880 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:42:23,013 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:45:40,958 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:49:54,245 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:50:02,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:59:18,104 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 04:00:14,031 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 04:01:31,386 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 04:02:16,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.

View File

@@ -0,0 +1,36 @@
2022-04-20 03:11:10,483 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:11:24,819 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:12:22,276 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:15:59,251 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:18:53,674 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:20:01,017 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:23:14,468 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:26:22,309 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:28:00,535 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:30:54,421 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:36:00,819 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:36:23,270 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/from-theory-to-practice-ucis-machine-learning-hackathon-delivers, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:36:39,470 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/choc-teams-up-with-uci-to-offer-pediatric-capstone-project-in-data-science, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:37:07,340 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/mine-dogucu-receives-young-investigator-award, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:37:24,747 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/qu-appointed-ims-program-secretary, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:37:24,856 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:37:24,856 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:37:42,359 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:38:03,484 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/from-theory-to-practice-ucis-machine-learning-hackathon-delivers, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:38:19,620 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/choc-teams-up-with-uci-to-offer-pediatric-capstone-project-in-data-science, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:38:36,526 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/mine-dogucu-receives-young-investigator-award, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:38:55,383 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/qu-appointed-ims-program-secretary, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:39:12,794 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/the-resilience-of-the-class-of-2021, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:39:29,963 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/capstone-program-showcases-growing-talent-of-ics-students, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:42:16,982 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:42:23,150 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:45:41,061 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:46:32,554 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/from-theory-to-practice-ucis-machine-learning-hackathon-delivers, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:49:54,351 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:50:02,145 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:52:31,224 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:59:18,220 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 04:00:14,134 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 04:01:31,499 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 04:02:16,153 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 04:02:35,264 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/stephan-mandt-and-collaborators-receive-3-5-million-to-study-machine-learning-for-climate-science, status <200>, using cache ('styx.ics.uci.edu', 9009).

View File

@@ -73,12 +73,24 @@ class Frontier(object):
             self.save.sync()
             self.to_be_downloaded.append(url)
 
+    def mark_url_complete(self, url):
+        urlhash = get_urlhash(url)
+        if urlhash not in self.save:
+            # This should not happen.
+            self.logger.error(
+                f"Completed url {url}, but have not seen it before.")
+
         # Q1
         self.uniques.add(removeFragment(url))
         # Q2
         tempTok = tokenize(url)
-        if len(tempTok) > max:
+        if len(tempTok) > self.max:
             self.max = len(tempTok)
             self.longest = url
@@ -97,12 +109,35 @@ class Frontier(object):
             self.ics[domain[0]].appendUnique(fragless)
 
-    def mark_url_complete(self, url):
-        urlhash = get_urlhash(url)
-        if urlhash not in self.save:
-            # This should not happen.
-            self.logger.error(
-                f"Completed url {url}, but have not seen it before.")
+        f = open("q1.txt", "w")
+        f.write("Number of unique pages: {length}\n".format(length = len(self.uniques)))
+        f.close()
+
+        # creating text file for question 2
+        f = open("q2.txt", "w")
+        f.write("Largest page url: {url} \nLength of page: {length}".format(url = self.longest, length = self.max))
+        f.close()
+
+        # creating text file for question 3
+        f = open("q3.txt", "w")
+        sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)}
+        i = 0
+        for k, v in sortedGrandDict.items():
+            if i == 50:
+                break
+            else:
+                f.write("{}: {}\n".format(k, v))
+                i += 1
+        f.close()
+
+        # creating text file for question 4
+        sortedDictKeys = sorted(self.ics.keys())
+        f = open("q4.txt", "w")
+        for i in sortedDictKeys:
+            f.write("{url}, {num}".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
+        f.close()
 
         self.save[urlhash] = (url, True)
         self.save.sync()
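
The added block rewrites q1.txt through q4.txt each time a URL is marked complete. For reference, a minimal standalone sketch of the same report-writing logic using with blocks so the files are always closed; the write_reports helper and its parameter names are illustrative assumptions, not code from this commit.

def write_reports(uniques, longest, max_len, grand_dict, ics):
    # q1: number of unique pages seen (fragments removed)
    with open("q1.txt", "w") as f:
        f.write("Number of unique pages: {}\n".format(len(uniques)))

    # q2: longest page by token count
    with open("q2.txt", "w") as f:
        f.write("Largest page url: {}\nLength of page: {}".format(longest, max_len))

    # q3: 50 most common words across all crawled pages
    with open("q3.txt", "w") as f:
        for word, count in sorted(grand_dict.items(), key=lambda item: item[1], reverse=True)[:50]:
            f.write("{}: {}\n".format(word, count))

    # q4: one line per tracked subdomain with its unique-page count
    with open("q4.txt", "w") as f:
        for key in sorted(ics.keys()):
            f.write("{}, {}\n".format(ics[key].getNiceLink(), len(ics[key].getUniques())))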

View File

@@ -18,16 +18,29 @@ class Worker(Thread):
     def run(self):
         while True:
+            tic = time.perf_counter()
             tbd_url = self.frontier.get_tbd_url()
+            toc = time.perf_counter()
+            print(f"Took {toc - tic:0.4f} seconds to get_tbd_url")
             if not tbd_url:
                 self.logger.info("Frontier is empty. Stopping Crawler.")
                 break
+            tic = time.perf_counter()
             resp = download(tbd_url, self.config, self.logger)
+            toc = time.perf_counter()
+            print(f"Took {toc - tic:0.4f} seconds to do download url")
             self.logger.info(
                 f"Downloaded {tbd_url}, status <{resp.status}>, "
                 f"using cache {self.config.cache_server}.")
+            tic = time.perf_counter()
             scraped_urls = scraper.scraper(tbd_url, resp)
+            toc = time.perf_counter()
+            print(f"Took {toc - tic:0.4f} seconds to do scrape url")
+            tic = time.perf_counter()
             for scraped_url in scraped_urls:
                 self.frontier.add_url(scraped_url)
             self.frontier.mark_url_complete(tbd_url)
+            toc = time.perf_counter()
+            print(f"Took {toc - tic:0.4f} seconds to do store stuffs")
             time.sleep(self.config.time_delay)
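
The timing added to the run loop uses paired time.perf_counter() calls around each stage. A context-manager sketch of the same pattern, shown only for illustration; the timed helper below is an assumption and is not part of this commit.

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # Measures the wall-clock time of the enclosed block and prints it,
    # mirroring the tic/toc pairs in the diff above.
    tic = time.perf_counter()
    try:
        yield
    finally:
        toc = time.perf_counter()
        print(f"Took {toc - tic:0.4f} seconds to {label}")

# Hypothetical usage for one stage of the worker loop:
# with timed("do download url"):
#     resp = download(tbd_url, self.config, self.logger)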

View File

@@ -1,4 +1,5 @@
 import re
 import urllib.request
 from urllib.parse import urlparse
 from urllib.parse import urljoin
@@ -6,10 +7,16 @@ from bs4 import BeautifulSoup
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
 from nltk.corpus import words
-import re
 import html2text
 import nltk
-# nltk.download('stopwords')
-# nltk.download('words')
+#nltk.download('stopwords')
+#nltk.download('words')
+#nltk.download('punkt')
+
+english_words = words.words()
+english_stop_words = stopwords.words('english')
 
 # there is another nltk.download() requirement but I removed it so i forgot what it was
 # it'll show in the console/terminal if you run the code i believe
 # it showed in mine
@@ -77,7 +84,7 @@ def tokenize(url):
     # getting connection from url
     page = urllib.request.urlopen(url)
     data = page.read()
+    valid = re.compile(r'[^a-zA-Z0-9]+')
 
     # named it tSoup for merge convience
     # need the 'lxml' parser for this.
     # When extract_next_links is called it returns a list full of links with no resp, and I had to find a way to get text from just link.
@@ -89,10 +96,15 @@ def tokenize(url):
     clean_text = ' '.join(tSoup.stripped_strings)
     token = word_tokenize(clean_text)
 
+    clean_token = list()
+
     # This used the nltk.corpus and just removes the tokens that aren't words
-    token = [i for i in token if i.lower() in words.words()]
-    return token
+    #token = [i for i in token if i.lower() in english_words]
+    for word in token:
+        if not valid.match(word):
+            clean_token.append(word.lower())
+    return clean_token
 
 #added this so the scraper code is not too redundant
 def computeFrequencies(tokens, d):
@@ -103,8 +115,7 @@ def computeFrequencies(tokens, d):
             d[t] += 1
 
 def removeStopWords(toks):
-    stopWords = set(stopwords.words('english'))
-    return [t for t in toks if t.lower() if not t.lower() in stopWords]
+    return [t for t in toks if t.lower() if not t.lower() in english_stop_words]
 
 def removeFragment(u):
     # turn into a urlparse object
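
With the changes above, tokenize() fetches a page and keeps lowercased tokens that do not start with a non-alphanumeric character, and the stop-word list is now built once at import time. A hedged usage sketch chaining these helpers; it assumes the functions above are in scope, the URL is reachable, and that computeFrequencies initializes missing keys (only part of its body appears in this hunk).

# Hypothetical end-to-end use of the helpers above (not part of this commit):
freqs = {}
tokens = tokenize("https://www.stat.uci.edu")   # download the page and tokenize its visible text
tokens = removeStopWords(tokens)                # drop common English stop words
computeFrequencies(tokens, freqs)               # accumulate per-word counts into freqs
print(sorted(freqs.items(), key=lambda item: item[1], reverse=True)[:10])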

View File

@@ -1,6 +1,9 @@
 from distutils.filelist import findall
 from operator import truediv
 import re
+import time
+import urllib.request
+from urllib import robotparser
 from urllib.parse import urlparse
 from urllib.parse import urljoin
 from bs4 import BeautifulSoup
@@ -8,48 +11,24 @@ from robotsokay import *
 def scraper(url, resp):
     links = extract_next_links(url, resp)
-    links_valid = list()
-    valid_links = open("valid_links.txt",'a')
-    invalid_links = open("invalid_links.txt",'a')
+    links_valid = set()
+    #valid_links = open("valid_links.txt",'a')
+    #invalid_links = open("invalid_links.txt",'a')
     for link in links:
+        tic = time.perf_counter()
         if is_valid(link):
-            links_valid.append(link)
-            valid_links.write(link + "\n")
+            links_valid.add(link)
+            toc = time.perf_counter()
+            print(f"Took {toc - tic:0.4f} seconds to do validate url")
+            #valid_links.write(link + "\n")
         else:
-            invalid_links.write("From: " + url + "\n")
-            invalid_links.write(link + "\n")
-
-    # Needs to be moved
-    # creating text file that includes the number of unique links
-    f = open("q1.txt", "w")
-    f.write("Number of unique pages: {length}\n".format(length = len(uniques)))
-    f.close()
-
-    # creating text file for question 2
-    f = open("q2.txt", "w")
-    f.write("Largest page url: {url} \nLength of page: {length}".format(url = longest, length = max))
-    f.close()
-
-    # creating text file for question 3
-    f = open("q3.txt", "w")
-    sortedGrandDict = {k: v for k, v in sorted(grand_dict.items(), key=lambda item: item[1], reverse = True)}
-    i = 0
-    for k, v in sortedGrandDict.items():
-        if i == 50:
-            break
-        else:
-            f.write("{}: {}\n".format(k, v))
-            i += 1
-    f.close()
-
-    # creating text file for question 4
-    sortedDictKeys = sorted(ics.keys())
-    f = open("q4.txt", "w")
-    for i in sortedDictKeys:
-        f.write("{url}, {num}".format(url = ics[i].getNiceLink(), num = len(ics[i].getUniques())))
-    f.close()
-
+            # invalid_links.write("From: " + url + "\n")
+            #invalid_links.write(link + "\n")
+            pass
     return links_valid
@@ -63,11 +42,11 @@ def extract_next_links(url, resp):
     # resp.raw_response.url: the url, again
     # resp.raw_response.content: the content of the page!
     # Return a list with the hyperlinks (as strings) scrapped from resp.raw_response.content
-    pages = list()
+    pages = set()
     if resp.status == 200:
         #do stuff
         soup = BeautifulSoup(resp.raw_response.content)
-        tempFile = open("test6.txt", 'a')
+        #tempFile = open("test6.txt", 'a')
         #Getting all the links, href = true means at least theres a href value, dont know what it is yet
         for link in soup.find_all('a', href=True):
             #There is a lot of relative paths stuff here gotta add them
@@ -93,9 +72,10 @@ def extract_next_links(url, resp):
                 if not robots_are_ok(parsed):
                     continue
 
-                tempFile.write(href_link + "\n")
+                #tempFile.write(href_link + "\n")
                 #Adding to the boi wonder pages
-                pages.append(href_link)
+                pages.add(href_link)
             else:
                 print("Page error !")
     return pages
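
extract_next_links gates every candidate link behind robots_are_ok from robotsokay, which is not part of this diff. A minimal sketch of that kind of politeness check built on urllib's robotparser (which this commit starts importing); the function name, caching dict, and error handling below are assumptions, not the project's actual robotsokay implementation.

from urllib import robotparser
from urllib.parse import urlparse

_robots_cache = {}  # netloc -> RobotFileParser (assumed per-host caching)

def robots_are_ok_sketch(parsed, user_agent="*"):
    # parsed is a urlparse() result for a candidate link
    netloc = parsed.netloc
    if netloc not in _robots_cache:
        rp = robotparser.RobotFileParser()
        rp.set_url("{}://{}/robots.txt".format(parsed.scheme, netloc))
        try:
            rp.read()  # fetches robots.txt over the network
        except Exception:
            return False  # be conservative if robots.txt cannot be read
        _robots_cache[netloc] = rp
    return _robots_cache[netloc].can_fetch(user_agent, parsed.geturl())

# Example: robots_are_ok_sketch(urlparse("https://www.stat.uci.edu/faculty"))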