12 Commits

7 changed files with 452 additions and 14 deletions

View File

@@ -0,0 +1,33 @@
2022-04-20 02:33:03,517 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:34:22,952 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:34:41,682 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:37:03,708 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:38:32,080 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:47:05,419 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:47:20,616 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:50:20,425 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:58:18,494 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:06:07,345 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:07:45,102 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:08:46,162 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:11:05,040 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:11:21,977 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:12:18,315 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:15:53,501 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:18:48,477 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:20:00,913 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:23:14,356 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:36:00,708 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:37:24,758 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:37:24,758 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:37:42,260 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:42:16,880 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:42:23,013 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:45:40,958 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:49:54,245 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:50:02,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:59:18,104 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 04:00:14,031 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 04:01:31,386 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 04:02:16,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 18:08:59,911 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.

View File

@@ -0,0 +1,44 @@
2022-04-20 03:11:10,483 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:11:24,819 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:12:22,276 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:15:59,251 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:18:53,674 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:20:01,017 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:23:14,468 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:26:22,309 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:28:00,535 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:30:54,421 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:36:00,819 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:36:23,270 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/from-theory-to-practice-ucis-machine-learning-hackathon-delivers, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:36:39,470 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/choc-teams-up-with-uci-to-offer-pediatric-capstone-project-in-data-science, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:37:07,340 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/mine-dogucu-receives-young-investigator-award, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:37:24,747 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/qu-appointed-ims-program-secretary, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:37:24,856 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:37:24,856 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:37:42,359 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:38:03,484 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/from-theory-to-practice-ucis-machine-learning-hackathon-delivers, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:38:19,620 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/choc-teams-up-with-uci-to-offer-pediatric-capstone-project-in-data-science, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:38:36,526 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/mine-dogucu-receives-young-investigator-award, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:38:55,383 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/qu-appointed-ims-program-secretary, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:39:12,794 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/the-resilience-of-the-class-of-2021, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:39:29,963 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/capstone-program-showcases-growing-talent-of-ics-students, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:42:16,982 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:42:23,150 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:45:41,061 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:46:32,554 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/from-theory-to-practice-ucis-machine-learning-hackathon-delivers, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:49:54,351 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:50:02,145 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:52:31,224 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 03:59:18,220 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 04:00:14,134 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 04:01:31,499 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 04:02:16,153 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 04:02:35,264 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/stephan-mandt-and-collaborators-receive-3-5-million-to-study-machine-learning-for-climate-science, status <200>, using cache ('styx.ics.uci.edu', 9009).
2022-04-20 18:09:00,035 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9002).
2022-04-20 18:09:09,207 - Worker-0 - INFO - Downloaded https://mds.ics.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9002).
2022-04-20 18:09:13,589 - Worker-0 - INFO - Downloaded https://mds.ics.uci.edu/events, status <200>, using cache ('styx.ics.uci.edu', 9002).
2022-04-20 18:09:16,651 - Worker-0 - INFO - Downloaded https://www.statistics.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9002).
2022-04-20 18:09:18,416 - Worker-0 - INFO - Downloaded https://www.statistics.uci.edu/seminar-series, status <200>, using cache ('styx.ics.uci.edu', 9002).
2022-04-20 18:09:20,376 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/seminar-series-2020-2021, status <200>, using cache ('styx.ics.uci.edu', 9002).
2022-04-20 18:09:22,307 - Worker-0 - INFO - Downloaded http://www.stat.uci.edu/seminar-series/seminar-series-2015-2016, status <200>, using cache ('styx.ics.uci.edu', 9002).
2022-04-20 18:09:24,295 - Worker-0 - INFO - Downloaded http://www.stat.uci.edu/ICS/statistics/research/seminarseries/2011-2012/index, status <404>, using cache ('styx.ics.uci.edu', 9002).

View File

@@ -6,6 +6,21 @@ from queue import Queue, Empty
from utils import get_logger, get_urlhash, normalize
from scraper import is_valid
from datacollection import *
#*.ics.uci.edu/* 0
#*.cs.uci.edu/* 1
#*.informatics.uci.edu/* 2
#*.stat.uci.edu/* 3
#today.uci.edu/department/information_computer_sciences/* 4
domain_semaphores = [Semaphore(3),Semaphore(3),Semaphore(3),Semaphore(3),Semaphore(3)]
data_mutex = Lock()
file_1_mutex = Lock()
file_2_mutex = Lock()
file_3_mutex = Lock()
file_4_mutex = Lock()
class Frontier(object):
def __init__(self, config, restart):
@@ -13,6 +28,17 @@ class Frontier(object):
self.config = config
self.to_be_downloaded = list()
# data collection is going to happen in the frontier
# uniques encompass overall unique links
self.uniques = set()
# grand_dict encompasses all the words over the entire set of links
self.grand_dict = dict()
# ics dict contains all subdomains of ics
self.ics = dict()
# used to find the longest page
self.max = -9999
self.longest = None
if not os.path.exists(self.config.save_file) and not restart:
# Save file does not exist, but request to load save.
self.logger.info(
@@ -48,25 +74,200 @@ class Frontier(object):
f"total urls discovered.") f"total urls discovered.")
def get_tbd_url(self): def get_tbd_url(self):
###CRITICAL SECTION
data_mutex.acquire()
try:
return self.to_be_downloaded.pop()
except IndexError:
return None
finally:
# released in a finally block, otherwise the returns above would leave the lock held forever
data_mutex.release()
def add_url(self, url):
url = normalize(url)
urlhash = get_urlhash(url)
##CRITICAL SECTION
data_mutex.acquire()
if urlhash not in self.save:
self.save[urlhash] = (url, False)
self.save.sync()
self.to_be_downloaded.append(url)
data_mutex.release()
###CRITICAL SECTION
def mark_url_complete(self, url):
urlhash = get_urlhash(url)
##CRITICAL SECTION
data_mutex.acquire()
if urlhash not in self.save:
# This should not happen.
self.logger.error(
f"Completed url {url}, but have not seen it before.")
self.save[urlhash] = (url, True)
self.save.sync()
data_mutex.release()
##CRITICAL SECTION
# Q1
###CRITICAL SECTION
file_1_mutex.acquire()
self.uniques.add(removeFragment(url))
#Writing to local file
f = open("q1.txt", "w")
f.write("Number of unique pages: {length}\n".format(length = len(self.uniques)))
f.close()
file_1_mutex.release()
# Q2
file_2_mutex.acquire()
tempTok = tokenize(url) # FIXME: tokenize() expects the response object (it reads resp.raw_response.content), not the URL string
if len(tempTok) > self.max:
self.max = len(tempTok)
self.longest = url
# creating text file for question 2
f = open("q2.txt", "w")
f.write("Largest page url: {url} \nLength of page: {length}".format(url = self.longest, length = self.max))
f.close()
file_2_mutex.release()
# Q3
file_3_mutex.acquire()
tempTok = removeStopWords(tempTok)
computeFrequencies(tempTok, self.grand_dict)
# creating text file for question 3
f = open("q3.txt", "w")
sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)}
i = 0
for k, v in sortedGrandDict.items():
if i == 50:
break
else:
f.write("{}: {}\n".format(k, v))
i += 1
f.close()
file_3_mutex.release()
# Q4
file_4_mutex.acquire()
fragless = removeFragment(url)
domain = findDomains(fragless.netloc)
if domain[1] == 'ics':
if domain[0] not in self.ics:
self.ics[domain[0]] = urlData(url, domain[0], domain[1])
else:
if fragless not in self.ics[domain[0]].getUniques():
self.ics[domain[0]].appendUnique(fragless)
# creating text file for question 4
sortedDictKeys = sorted(self.ics.keys())
f = open("q4.txt", "w")
for i in sortedDictKeys:
f.write("{url}, {num}".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
f.close()
file_4_mutex.release()
def acquire_polite(url):
pass
def release_polite(domain):
pass
def get_semaphore_index(url):
if "ics.uci.edu" in url:
return 0
elif "cs.uci.edu" in url:
return 1
elif "informatics.uci.edu" in url:
return 2
elif "stat.uci.edu" in url:
return 3
elif "today.uci.edu/department/information_computer_sciences/" in url:
return 4
else:
println("ERROR")
def q1(self, url):
# rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
# this saves to the local directory, so I can constantly access the right file and check if it exists or not
path_to_script = os.path.dirname(os.path.abspath(__file__))
my_filename = os.path.join(path_to_script, "q1.txt")
# Will create a file of all the unique links; you can read the file, call lines = f.readlines(), and then len(lines) gives the number of unique links
if (os.path.exists(my_filename)):
f = open(my_filename, 'a')
f.write(removeFragment(url).geturl() + "\n") # geturl() turns the ParseResult back into a writable string
f.close()
else:
f = open(my_filename, 'w')
f.write(removeFragment(url).geturl() + "\n")
f.close()
def q234(self, url, resp):
# rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
# this saves to the local directory, so I can constantly access the right file and check if it exists or not
path_to_script = os.path.dirname(os.path.abspath(__file__))
my_filename = os.path.join(path_to_script, "q2.txt")
tempTok = tokenize(resp)
if len(tempTok) > self.max:
self.max = len(tempTok)
self.longest = url
f = open(my_filename, 'w')
f.write("Longest Page: {url}, length: {length}".format(url = self.longest, length = self.max))
f.close()
tempTok = removeStopWords(tempTok)
computeFrequencies(tempTok, self.grand_dict)
# rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
# this saves to the local directory, so I can constantly access the right file and check if it exists or not
path_to_script = os.path.dirname(os.path.abspath(__file__))
my_filename = os.path.join(path_to_script, "q3.txt")
f = open(my_filename, "w")
sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)}
i = 0
for k, v in sortedGrandDict.items():
if i == 50:
break
else:
f.write("{}: {}\n".format(k, v))
i += 1
f.close()
fragless = removeFragment(url)
domain = findDomains(fragless.netloc)
if domain[1] == 'ics':
if domain[0] not in self.ics:
self.ics[domain[0]] = urlData(url, domain[0], domain[1])
else:
if fragless not in self.ics[domain[0]].getUniques():
self.ics[domain[0]].appendUnique(fragless)
# rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
# this saves to the local directory, so I can constantly access the right file and check if it exists or not
path_to_script = os.path.dirname(os.path.abspath(__file__))
my_filename = os.path.join(path_to_script, "q4.txt")
# creating text file for question 4
sortedDictKeys = sorted(self.ics.keys())
f = open(my_filename, "w")
for i in sortedDictKeys:
f.write("{url}, {num}".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
f.close()
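
The acquire_polite and release_polite stubs above are left as pass, so the domain_semaphores defined at the top of frontier.py are never actually used. Below is a minimal sketch of how they could be wired up, reusing the PR's index mapping; the threading import, the pattern ordering, and the call sites shown in comments are assumptions rather than code from this pull request:

from threading import Semaphore

# 0: ics, 1: cs, 2: informatics, 3: stat, 4: today.uci.edu/department/information_computer_sciences
# (same mapping as the comments above; at most 3 concurrent downloads per domain)
domain_semaphores = [Semaphore(3) for _ in range(5)]

# longest patterns first, because "informatics.uci.edu" contains "ics.uci.edu",
# which in turn contains "cs.uci.edu"
_DOMAIN_PATTERNS = [
    ("informatics.uci.edu", 2),
    ("ics.uci.edu", 0),
    ("cs.uci.edu", 1),
    ("stat.uci.edu", 3),
    ("today.uci.edu/department/information_computer_sciences/", 4),
]

def get_semaphore_index(url):
    for pattern, idx in _DOMAIN_PATTERNS:
        if pattern in url:
            return idx
    return None  # url is outside the allowed domains

def acquire_polite(url):
    # block until one of the 3 download slots for this url's domain is free
    idx = get_semaphore_index(url)
    if idx is not None:
        domain_semaphores[idx].acquire()
    return idx

def release_polite(idx):
    # hand the slot back once the download and the politeness delay are done
    if idx is not None:
        domain_semaphores[idx].release()

# possible call sites in Worker.run (hypothetical):
#   idx = acquire_polite(tbd_url)
#   try:
#       resp = download(tbd_url, self.config, self.logger)
#       time.sleep(self.config.time_delay)
#   finally:
#       release_polite(idx)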

View File

@@ -18,16 +18,40 @@ class Worker(Thread):
def run(self):
while True:
tic = time.perf_counter()
tbd_url = self.frontier.get_tbd_url()
toc = time.perf_counter()
print(f"Took {toc - tic:0.4f} seconds to get_tbd_url")
if not tbd_url:
self.logger.info("Frontier is empty. Stopping Crawler.")
break
tic = time.perf_counter()
resp = download(tbd_url, self.config, self.logger)
toc = time.perf_counter()
print(f"Took {toc - tic:0.4f} seconds to download url")
tic = time.perf_counter()
self.frontier.q1(tbd_url)
toc = time.perf_counter()
print(f"Took {toc - tic:0.4f} seconds to run q1")
tic = time.perf_counter()
self.frontier.q234(tbd_url, resp)
toc = time.perf_counter()
print(f"Took {toc - tic:0.4f} seconds to run q234")
self.logger.info(
f"Downloaded {tbd_url}, status <{resp.status}>, "
f"using cache {self.config.cache_server}.")
tic = time.perf_counter()
scraped_urls = scraper.scraper(tbd_url, resp)
toc = time.perf_counter()
print(f"Took {toc - tic:0.4f} seconds to scrape url")
tic = time.perf_counter()
for scraped_url in scraped_urls:
self.frontier.add_url(scraped_url)
self.frontier.mark_url_complete(tbd_url)
toc = time.perf_counter()
print(f"Took {toc - tic:0.4f} seconds to store the scraped urls")
time.sleep(self.config.time_delay)
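
The repeated tic/toc/print blocks in run() could be collapsed with a small timing helper. The context manager below is only a sketch of that idea; the timed name and the labels are illustrative and not part of the pull request:

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # measures the wrapped block and prints the same style of message as the tic/toc pairs above
    tic = time.perf_counter()
    try:
        yield
    finally:
        toc = time.perf_counter()
        print(f"Took {toc - tic:0.4f} seconds to {label}")

# usage inside Worker.run (hypothetical):
# with timed("download url"):
#     resp = download(tbd_url, self.config, self.logger)
# with timed("scrape url"):
#     scraped_urls = scraper.scraper(tbd_url, resp)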

View File

@@ -0,0 +1,123 @@
import re
import os
import urllib.request
from urllib.parse import urlparse
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
import html2text
import nltk
nltk.download('stopwords')
nltk.download('words')
nltk.download('punkt')
english_words = words.words()
english_stop_words = stopwords.words('english')
# There is one more nltk.download() requirement that I removed, so I forgot which one it was;
# it will show up in the console/terminal if you run the code.
# To explain this class, I have to start by explaining the container I decided to use to keep track of subdomains of ics.uci.edu.
# I decided to use a dict. Long story short, I was trying to figure out what to make the key so it would uniquely identify what I needed it to.
# I was going to use parsed.netloc; however, we are taking into account that a link that looks like https://somename.vision.ics.uci.edu
# is a unique page of the subdomain vision.
# So I made the key the subdomain that comes before ics.uci.edu in the link, and the value of the dict is this class.
# It is a very simple class, so I am not going to comment on what it does.
class urlData:
def __init__(self, url, subdomain, domain):
self.url = url
self.nicelink = "http://" + removeFragment(url).netloc
self.domain = domain
self.subdomain = subdomain
self.uniques = set()
self.uniques.add(removeFragment(url))
def getDomain(self):
return self.domain
def getURL(self):
return self.url
def getNiceLink(self):
return self.nicelink
def getSub(self):
return self.subdomain
def getUniques(self):
return self.uniques
def appendUnique(self, parse):
self.uniques.add(parse)
# Tried to find a library that would do this for me, but couldn't.
# It splits the netloc of the parsed url to separate the subdomain from the domain.
def findDomains(url):
urlsplit = url.split('.')
if urlsplit[0].lower() == 'www':
urlsplit.pop(0)
for i in range(len(urlsplit)):
if urlsplit[i] == 'ics':
if i == 0:
return 0, 0
elif i == 1:
return urlsplit[0], urlsplit[1]
else:
return urlsplit[i-1], urlsplit[i] # something like random.vision.ics.uci.edu will be considered a unique page of vision
return None, None
def tokenize(resp):
# regex that matches tokens starting with a non-alphanumeric character, so they can be filtered out
valid = re.compile(r'[^a-zA-Z0-9]+')
# named it tSoup for merge convenience
# need the 'lxml' parser for this.
# When extract_next_links is called it returns a list full of links with no resp, and I had to find a way to get text from just the link.
# Therefore, I decided to get the plain text this way.
tSoup = BeautifulSoup(resp.raw_response.content, 'lxml')
# Floyd (1 March 2021) Stackoverflow. https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
# compared this with tSoup.get_text(), and clean_text just provided content that was easier to tokenize and more in line with my intentions
clean_text = ' '.join(tSoup.stripped_strings)
token = word_tokenize(clean_text)
clean_token = list()
# This used the nltk.corpus and just removes the tokens that aren't words
#token = [i for i in token if i.lower() in english_words]
for word in token:
if not valid.match(word):
clean_token.append(word.lower())
return clean_token
#added this so the scraper code is not too redundant
def computeFrequencies(tokens, d):
for t in tokens:
if t not in d:
d[t] = 1
else:
d[t] += 1
def removeStopWords(toks):
return [t for t in toks if t and t.lower() not in english_stop_words]
def removeFragment(u):
# turn into a urlparse object
# removed fragment in order to have "unique" links
removefrag = urlparse(u)
removefrag = removefrag._replace(fragment = '')
return removefrag
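
Because the subdomain bookkeeping is split across findDomains, removeFragment, and urlData, a short usage sketch may help. The URLs and the fake response object below are made up for illustration, and the printed values assume the same bs4/lxml/nltk setup this module already requires:

from types import SimpleNamespace
from datacollection import (findDomains, removeFragment, urlData,
                            tokenize, removeStopWords, computeFrequencies)

# subdomain/domain split: "www." is stripped, and the piece left of "ics" becomes the key
print(findDomains("www.vision.ics.uci.edu"))   # -> ('vision', 'ics')
print(findDomains("www.stat.uci.edu"))         # -> (None, None), not an ics subdomain

# removeFragment returns a ParseResult with the fragment blanked out
frag = removeFragment("https://vision.ics.uci.edu/projects#top")
print(frag.geturl())                           # -> https://vision.ics.uci.edu/projects

# the ics dict maps subdomain -> urlData, and unique fragment-less URLs accumulate per subdomain
ics = {}
sub, dom = findDomains(frag.netloc)
ics[sub] = urlData(frag.geturl(), sub, dom)
ics[sub].appendUnique(removeFragment("https://vision.ics.uci.edu/people"))
print(ics[sub].getNiceLink(), len(ics[sub].getUniques()))   # -> http://vision.ics.uci.edu 2

# tokenize expects the crawler's response object; a stand-in with raw_response.content works for a demo
fake_resp = SimpleNamespace(raw_response=SimpleNamespace(
    content=b"<html><body><p>Machine learning and data science at UCI.</p></body></html>"))
tokens = removeStopWords(tokenize(fake_resp))
freqs = {}
computeFrequencies(tokens, freqs)
print(freqs)   # -> {'machine': 1, 'learning': 1, 'data': 1, 'science': 1, 'uci': 1}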

View File

@@ -1,6 +1,9 @@
from distutils.filelist import findall
from operator import truediv
import re
import time
import urllib.request
from urllib import robotparser
from urllib.parse import urlparse
from urllib.parse import urljoin
from bs4 import BeautifulSoup
@@ -8,16 +11,25 @@ from robotsokay import *
def scraper(url, resp):
links = extract_next_links(url, resp)
-links_valid = list()
+links_valid = set()
-valid_links = open("valid_links.txt",'a')
+#valid_links = open("valid_links.txt",'a')
-invalid_links = open("invalid_links.txt",'a')
+#invalid_links = open("invalid_links.txt",'a')
for link in links:
+tic = time.perf_counter()
if is_valid(link):
-links_valid.append(link)
+links_valid.add(link)
+toc = time.perf_counter()
+print(f"Took {toc - tic:0.4f} seconds to validate url")
-valid_links.write(link + "\n")
+#valid_links.write(link + "\n")
else:
-invalid_links.write("From: " + url + "\n")
+# invalid_links.write("From: " + url + "\n")
-invalid_links.write(link + "\n")
+#invalid_links.write(link + "\n")
+pass
return links_valid
def extract_next_links(url, resp):
@@ -30,11 +42,11 @@ def extract_next_links(url, resp):
# resp.raw_response.url: the url, again
# resp.raw_response.content: the content of the page!
# Return a list with the hyperlinks (as strings) scraped from resp.raw_response.content
-pages = list()
+pages = set()
if resp.status == 200:
#do stuff
soup = BeautifulSoup(resp.raw_response.content)
-tempFile = open("test6.txt", 'a')
+#tempFile = open("test6.txt", 'a')
#Getting all the links, href=True means there is at least an href value; don't know what it is yet
for link in soup.find_all('a', href=True):
#There is a lot of relative path stuff here; gotta add them
@@ -59,10 +71,11 @@ def extract_next_links(url, resp):
parsed = urlparse(href_link)
if not robots_are_ok(parsed):
continue
-tempFile.write(href_link + "\n")
+#tempFile.write(href_link + "\n")
#Adding to the boi wonder pages
-pages.append(href_link)
+pages.add(href_link)
else:
print("Page error !")
return pages
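
robots_are_ok comes from the robotsokay module, which is not part of this diff. For reference, a minimal per-host robots.txt check built on the standard urllib.robotparser could look like the sketch below; everything here except the robots_are_ok(parsed) signature used above is an assumption about how that module might work:

import urllib.robotparser
from urllib.parse import urlparse

_robot_cache = {}  # netloc -> RobotFileParser (or None), so robots.txt is fetched once per host

def robots_are_ok(parsed):
    # True if the crawler may fetch parsed.geturl() according to that host's robots.txt
    netloc = parsed.netloc
    if netloc not in _robot_cache:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(f"{parsed.scheme or 'http'}://{netloc}/robots.txt")
        try:
            rp.read()
        except Exception:
            # if robots.txt cannot be fetched, fall back to allowing the URL
            _robot_cache[netloc] = None
        else:
            _robot_cache[netloc] = rp
    rp = _robot_cache[netloc]
    return True if rp is None else rp.can_fetch("*", parsed.geturl())

Caching the parser per netloc matters here because extract_next_links calls robots_are_ok once for every link it discovers on a page.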