Compare commits

main..data_collection

No commits in common. "main" and "data_collection" have entirely different histories.

13 changed files with 156 additions and 1084034 deletions

Logs/FRONTIER.log

@@ -31,117 +31,3 @@
 2022-04-20 04:01:31,386 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
 2022-04-20 04:02:16,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
 2022-04-20 18:08:59,911 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:31:49,310 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:32:31,178 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:32:35,094 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:33:25,233 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:33:42,393 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:36:07,413 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:37:56,413 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:38:45,000 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:39:14,157 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:39:50,638 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:39:56,516 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:41:07,005 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:46:01,865 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:46:16,984 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:49:37,689 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:53:43,854 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:54:45,134 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:56:48,517 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:57:19,541 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 13:02:40,174 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 13:07:26,611 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:25:16,739 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:27:01,372 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:28:24,395 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:33:03,228 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:33:14,391 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:34:11,862 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:35:05,121 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:36:23,994 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:36:31,564 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:38:41,035 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:39:43,493 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:43:12,698 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:46:27,304 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:52:23,826 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:52:38,658 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:59:19,523 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 18:00:51,039 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 18:01:45,112 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 00:46:46,850 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 00:49:09,876 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 00:53:26,894 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 00:53:54,532 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:40:16,310 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:41:34,284 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:43:18,453 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:46:32,822 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:47:34,475 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:48:29,467 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:48:56,671 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:50:51,864 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:53:07,556 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:53:56,693 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:54:34,028 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:55:03,124 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:56:20,721 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:59:29,951 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:59:57,446 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 02:02:46,431 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 02:05:59,557 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:26:02,713 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:26:15,186 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:26:45,445 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:27:24,255 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:31:59,791 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:32:26,864 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:35:18,046 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:37:12,709 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:37:48,356 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:38:16,370 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:38:22,050 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:38:50,914 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:39:41,890 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:41:44,405 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:43:16,946 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:44:33,013 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:44:54,848 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:46:31,871 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:51:57,008 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:52:42,659 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:54:20,296 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:57:49,247 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:59:12,978 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:00:10,268 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:00:41,805 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:01:46,542 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:03:07,751 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:04:06,325 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:06:00,643 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:06:09,928 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:06:50,980 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:07:03,781 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:07:48,403 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:08:32,837 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:10:06,168 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:10:56,162 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:12:04,126 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:13:56,449 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:14:32,348 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:15:10,188 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:15:18,099 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:15:28,945 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:23:44,222 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:24:20,095 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:24:58,182 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:25:29,482 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:25:43,095 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:58:37,549 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:58:48,116 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 20:19:11,395 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 20:33:31,301 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 22:13:18,206 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-24 10:06:50,608 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-24 14:10:29,409 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
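Every deleted line above is the frontier's restart path firing: the crawler was relaunched in restart mode while a previous frontier.shelve save file was still on disk. A minimal sketch of the code path that emits this message, assuming the conventions of the starter framework (a config.save_file attribute and the Frontier.__init__(config, restart) signature shown below) rather than this repo's exact source:

    import os

    # Inside Frontier.__init__(self, config, restart):
    if restart and os.path.exists(self.config.save_file):
        # produces: "Found save file frontier.shelve, deleting it."
        self.logger.info(f"Found save file {self.config.save_file}, deleting it.")
        os.remove(self.config.save_file)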

File diff suppressed because it is too large

config.ini

@ -1,14 +1,13 @@
[IDENTIFICATION] [IDENTIFICATION]
# Set your user agent string here. # Set your user agent string here.
USERAGENT = IR US22 19854690,44333574,95241547 USERAGENT = IR US22 19854690,44333574
[CONNECTION] [CONNECTION]
HOST = styx.ics.uci.edu HOST = styx.ics.uci.edu
PORT = 9000 PORT = 9000
[CRAWLER] [CRAWLER]
SEEDURL = https://www.ics.uci.edu,https://www.cs.uci.edu,https://www.informatics.uci.edu,https://www.stat.uci.edu,https://www.eecs.uci.edu SEEDURL = https://www.ics.uci.edu,https://www.cs.uci.edu,https://www.informatics.uci.edu,https://www.stat.uci.edu
# In seconds # In seconds
POLITENESS = 0.5 POLITENESS = 0.5
@ -17,5 +16,5 @@ POLITENESS = 0.5
SAVE = frontier.shelve SAVE = frontier.shelve
# IMPORTANT: DO NOT CHANGE IT IF YOU HAVE NOT IMPLEMENTED MULTITHREADING. # IMPORTANT: DO NOT CHANGE IT IF YOU HAVE NOT IMPLEMENTED MULTITHREADING.
THREADCOUNT = 5 THREADCOUNT = 1
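For orientation, this is how a config of this shape is typically consumed. A minimal sketch using the standard library's configparser; the section and key names come from the diff above, but the loading code itself is assumed, not taken from this repo:

    from configparser import ConfigParser

    cfg = ConfigParser()
    cfg.read("config.ini")

    user_agent = cfg["IDENTIFICATION"]["USERAGENT"]
    seed_urls = cfg["CRAWLER"]["SEEDURL"].split(",")     # comma-separated list
    politeness = float(cfg["CRAWLER"]["POLITENESS"])     # per-request delay, seconds
    # SAVE and THREADCOUNT sit in a later section that the diff truncates;
    # the section name below is a placeholder.
    threads = cfg.getint("LOCAL PROPERTIES", "THREADCOUNT", fallback=1)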

frontier.py

@@ -1,11 +1,9 @@
 import os
 import shelve
-from threading import Thread, Lock,Semaphore
+from threading import Thread, RLock
 from queue import Queue, Empty
-import time
 from utils import get_logger, get_urlhash, normalize
 from scraper import is_valid
 from datacollection import *
@@ -17,30 +15,18 @@ from datacollection import *
 #*.stat.uci.edu/* 3
 #today.uci.edu/department/information_computer_sciences/* 4

+domain_semaphores = [Semaphore(3),Semaphore(3),Semaphore(3),Semaphore(3),Semaphore(3)]
+data_mutex = Lock()
+file_1_mutex = Lock()
+file_2_mutex = Lock()
+file_3_mutex = Lock()
+file_4_mutex = LocK()

 class Frontier(object):
     def __init__(self, config, restart):
         self.logger = get_logger("FRONTIER")
         self.config = config
-        #Load balancer, list()
-        self.to_be_downloaded = [set(),set(),set(),set(),set()]
-        self.balance_index = 0
-        #Semaphore for each domain to keep each domain noice and tidy with politeness
-        self.domain_semaphores = [Lock(),Lock(),Lock(),Lock(),Lock()]
-        #Local data lock
-        self.data_mutex = Lock()
-        #FIle locks for data to make sure everything is thread-safe
-        self.file_1_mutex = Lock()
-        self.file_2_mutex = Lock()
-        self.file_3_mutex = Lock()
-        self.file_4_mutex = Lock()
+        self.to_be_downloaded = list()

 # data collection is going to happen in the frontier
 # uniques encompass overall unique links
@@ -75,14 +61,13 @@ class Frontier(object):
         for url in self.config.seed_urls:
             self.add_url(url)

     def _parse_save_file(self):
         ''' This function can be overridden for alternate saving techniques. '''
         total_count = len(self.save)
         tbd_count = 0
         for url, completed in self.save.values():
             if not completed and is_valid(url):
-                self.to_be_downloaded[self.get_domain_index(url)].add(url)
+                self.to_be_downloaded.append(url)
                 tbd_count += 1
         self.logger.info(
             f"Found {tbd_count} urls to be downloaded from {total_count} "
@@ -90,37 +75,23 @@ class Frontier(object):
     def get_tbd_url(self):
         ###CRITICAL SECTION
-        self.data_mutex.acquire()
+        data_mutex.acquire()
         try:
-            #Load balancing
-            loop = 10
-            while not self.to_be_downloaded[self.balance_index] and loop != 0:
-                self.balance_index = self.balance_index + 1
-                if self.balance_index > 4:
-                    self.balance_index = 0
-                loop = loop - 1
-            if loop == 0:
-                self.data_mutex.release()
-                return None
-            hold = self.to_be_downloaded[self.balance_index].pop()
-            self.balance_index = self.balance_index + 1
-            self.data_mutex.release()
-            #print(hold)
-            return hold
+            return self.to_be_downloaded.pop()
         except IndexError:
-            print("POPPING RANDOM SHIT BRO")
-            self.data_mutex.release()
             return None
+        data_mutex.release()

     def add_url(self, url):
         url = normalize(url)
         urlhash = get_urlhash(url)
         ##CRITICAL SECTION
+        data_mutex.acquire()
         if urlhash not in self.save:
             self.save[urlhash] = (url, False)
             self.save.sync()
-            self.to_be_downloaded[self.get_domain_index(url)].add(url)
+            self.to_be_downloaded.append(url)
+        data_mutex.release()
         ###CRITICAL SECTION
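Note the bug the right-hand get_tbd_url introduces: every path through the try/except returns, so the trailing data_mutex.release() is unreachable and the mutex is never released. With THREADCOUNT = 1 this goes unnoticed, but any second worker would deadlock on its next acquire. A hedged sketch of the usual fix, not code from either branch, using a with-block so the lock is released on every exit path:

    def get_tbd_url(self):
        with data_mutex:                    # released even on the early returns
            try:
                return self.to_be_downloaded.pop()
            except IndexError:
                return None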
@@ -128,51 +99,106 @@ class Frontier(object):
         urlhash = get_urlhash(url)
         ##CRITICAL SECTION
+        data_mutex.acquire()
         if urlhash not in self.save:
             # This should not happen.
             self.logger.error(
                 f"Completed url {url}, but have not seen it before.")
         self.save[urlhash] = (url, True)
         self.save.sync()
+        data_mutex.release()
         ##CRITICAL SECTION
-    def get_domain_index(self,url):
-        #yeah if you put ics.uci.edu in first it will add all informatics link into that instead
-        if "informatics.uci.edu" in url:
-            return 0
-        elif "ics.uci.edu" in url:
-            return 1
-        elif "cs.uci.edu" in url:
-            return 2
-        elif "stat.uci.edu" in url:
-            return 3
-        elif "today.uci.edu/department/information_computer_sciences/" in url:
-            return 4
-        else:
-            print(url)
-            print("ERROR")
-    def acquire_polite(self,url):
-        return self.domain_semaphores[self.get_domain_index(url)].acquire()
-    def release_polite(self,url):
-        return self.domain_semaphores[self.get_domain_index(url)].release()
-    def acquire_data_mutex(self):
-        return self.data_mutex.acquire()
-    def release_data_mutex(self):
-        return self.data_mutex.release()
-    def acquire_234_mutex(self):
-        return self.file_2_3_4_mutex.acquire()
-    def release_234_mutex(self):
-        return self.file_2_3_4_mutex.release()
+        # Q1
+        ###CRITICAL SECTION
+        file_1_mutex.acquire()
+        self.uniques.add(removeFragment(url))
+        #Writing to local file
+        f = open("q1.txt", "w")
+        f.write("Number of unique pages: {length}\n".format(length = len(self.uniques)))
+        f.close()
+        file_1_mutex.release()
+        # Q2
+        file_2_mutex.acquire()
+        tempTok = tokenize(url)
+        if len(tempTok) > self.max:
+            self.max = len(tempTok)
+            self.longest = url
+        # creating text file for question 2
+        f = open("q2.txt", "w")
+        f.write("Largest page url: {url} \nLength of page: {length}".format(url = self.longest, length = self.max))
+        f.close()
+        file_2_mutex.release()
+        # Q3
+        file_3_mutex.acquire()
+        tempTok = removeStopWords(tempTok)
+        computeFrequencies(tempTok, self.grand_dict)
+        # creating text file for question 3
+        f = open("q3.txt", "w")
+        sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)}
+        i = 0
+        for k, v in sortedGrandDict.items():
+            if i == 50:
+                break
+            else:
+                f.write("{}: {}\n".format(k, v))
+                i += 1
+        f.close()
+        file_3_mutex.release()
+        # Q4
+        file_4_mutex.acquire()
+        fragless = removeFragment(url)
+        domain = findDomains(fragless.netloc)
+        if domain[1] == 'ics':
+            if domain[0] not in self.ics:
+                self.ics[domain[0]] = urlData(url, domain[0], domain[1])
+            else:
+                if fragless not in self.ics[domain[0]].getUniques():
+                    self.ics[domain[0]].appendUnique(fragless)
+        # creating text file for question 4
+        sortedDictKeys = sorted(self.ics.keys())
+        f = open("q4.txt", "w")
+        for i in sortedDictKeys:
+            f.write("{url}, {num}".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
+        f.close()
+        file_4_mutex.release()
+def acquire_polite(url):
+    pass;
+def release_polite(domain):
+    pass;
+def get_semaphore_index(url):
+    if "ics.uci.edu" in url:
+        return 0
+    elif "cs.uci.edu" in url:
+        return 1
+    elif "informatics.uci.edu" in url:
+        return 2
+    elif "stat.uci.edu" in url:
+        return 3
+    elif "today.uci.edu/department/information_computer_sciences/" in url:
+        return 4
+    else:
+        println("ERROR")
     def q1(self, url):
         # rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
@@ -181,56 +207,37 @@ class Frontier(object):
         my_filename = os.path.join(path_to_script, "q1.txt")
         # Will create a file of all the unique links and you can read the file and do lines = f.readlines() then len(lines) to get the number of unique links
-        #Locking and releasing each file
-        self.file_1_mutex.acquire()
         if (os.path.exists(my_filename)):
             f = open(my_filename, 'a')
-            f.write(str(removeFragment(url)) + "\n")
+            f.write(removeFragment(url))
             f.close()
         else:
             f = open(my_filename, 'w')
-            f.write(str(removeFragment(url)) + "\n")
+            f.write(removeFragment(url))
             f.close()
-        self.file_1_mutex.release()

     def q234(self, url, resp):
         # rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
         # this saves to the local directory, so I can constantly access the right file and check if it exists or not
-        if resp.status != 200:
-            return
-        tic = time.perf_counter()
         path_to_script = os.path.dirname(os.path.abspath(__file__))
         my_filename = os.path.join(path_to_script, "q2.txt")
-        try:
-            tempTok = tokenize(resp)
-            self.file_2_mutex.acquire()
+        tempTok = tokenize(resp)
         if len(tempTok) > self.max:
             self.max = len(tempTok)
             self.longest = url
             f = open(my_filename, 'w')
             f.write("Longest Page: {url}, length: {length}".format(url = self.longest, length = self.max))
             f.close()
-        except:
-            print("resp dying for some reason ?")
-        self.file_2_mutex.release()
-        toc = time.perf_counter()
-        #print(f"Took {toc - tic:0.4f} seconds to save file 2 !")
-        tic = time.perf_counter()
         tempTok = removeStopWords(tempTok)
-        self.file_3_mutex.acquire()
         computeFrequencies(tempTok, self.grand_dict)
         # rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
         # this saves to the local directory, so I can constantly access the right file and check if it exists or not
         path_to_script = os.path.dirname(os.path.abspath(__file__))
         my_filename = os.path.join(path_to_script, "q3.txt")
         f = open(my_filename, "w")
         sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)}
         i = 0
@@ -241,16 +248,9 @@ class Frontier(object):
                 f.write("{}: {}\n".format(k, v))
                 i += 1
         f.close()
-        self.file_3_mutex.release()
-        toc = time.perf_counter()
-        #print(f"Took {toc - tic:0.4f} seconds to save file 3 !")
-        tic = time.perf_counter()
         fragless = removeFragment(url)
         domain = findDomains(fragless.netloc)
-        self.file_4_mutex.acquire()
         if domain[1] == 'ics':
             if domain[0] not in self.ics:
                 self.ics[domain[0]] = urlData(url, domain[0], domain[1])
@@ -267,10 +267,7 @@ class Frontier(object):
         sortedDictKeys = sorted(self.ics.keys())
         f = open(my_filename, "w")
         for i in sortedDictKeys:
-            f.write("{url}, {num} + \n".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
+            f.write("{url}, {num}".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
         f.close()
-        self.file_4_mutex.release()
-        toc = time.perf_counter()
-        #print(f"Took {toc - tic:0.4f} seconds to save file 4 !")
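Two things worth flagging in the left-hand locking scheme. First, acquire_234_mutex/release_234_mutex reference self.file_2_3_4_mutex, which never appears in the __init__ excerpt above (only file_1_mutex through file_4_mutex do); unless it is defined in lines the diff does not show, calling either would raise AttributeError. Second, in q234 the acquire happens inside the try but the release sits after the except, so when tokenize(resp) raises, self.file_2_mutex.release() is called on a lock that was never acquired, which raises RuntimeError on a threading.Lock. A hedged sketch of the same file-guarding idea written with context managers (illustrative only, not code from either branch):

    from threading import Lock

    file_2_mutex = Lock()

    def write_longest_page(path, longest, max_len):
        # Both the lock and the file handle are released even if the write raises.
        with file_2_mutex:
            with open(path, "w") as f:
                f.write("Longest Page: {url}, length: {length}".format(url = longest, length = max_len))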

File diff suppressed because it is too large

q2.txt

@@ -1 +0,0 @@
Longest Page: http://www.ics.uci.edu/~cs224, length: 83259

q3.txt

@@ -1,50 +0,0 @@
research: 71407
computer: 44358
science: 35764
ics: 31878
students: 31271
uci: 30946
events: 28911
news: 28680
student: 28244
information: 28159
informatics: 27680
graduate: 27322
0: 26001
school: 25154
2021: 24609
bren: 24296
data: 23332
us: 22961
undergraduate: 22912
faculty: 22357
2020: 22133
software: 22105
learning: 21218
policies: 20976
1: 19559
contact: 18653
2018: 17102
alumni: 17032
2: 16758
donald: 16690
projects: 16319
2019: 15778
computing: 15414
people: 15237
irvine: 15146
academic: 15127
support: 14680
2017: 14599
view: 14582
2016: 14330
ramesh: 14140
engineering: 13971
university: 13744
may: 13308
sciences: 13175
systems: 13164
course: 12868
statistics: 12582
media: 12577
new: 12501

q4.txt

@@ -1,86 +0,0 @@
http://Transformativeplay.ics.uci.edu, 1 +
http://accessibility.ics.uci.edu, 1 +
http://acoi.ics.uci.edu, 52 +
http://aiclub.ics.uci.edu, 1 +
http://archive.ics.uci.edu, 6 +
http://asterix.ics.uci.edu, 7 +
http://cbcl.ics.uci.edu, 23 +
http://cert.ics.uci.edu, 5 +
http://checkmate.ics.uci.edu, 1 +
http://chenli.ics.uci.edu, 9 +
http://cloudberry.ics.uci.edu, 45 +
http://cml.ics.uci.edu, 172 +
http://code.ics.uci.edu, 12 +
http://computableplant.ics.uci.edu, 33 +
http://cradl.ics.uci.edu, 20 +
http://create.ics.uci.edu, 6 +
http://cwicsocal18.ics.uci.edu, 12 +
http://cyberclub.ics.uci.edu, 14 +
http://dgillen.ics.uci.edu, 19 +
http://duttgroup.ics.uci.edu, 85 +
http://elms.ics.uci.edu, 1 +
http://emj.ics.uci.edu, 45 +
http://evoke.ics.uci.edu, 62 +
http://flamingo.ics.uci.edu, 11 +
http://fr.ics.uci.edu, 3 +
http://frost.ics.uci.edu, 1 +
http://futurehealth.ics.uci.edu, 72 +
http://graphics.ics.uci.edu, 4 +
http://hack.ics.uci.edu, 1 +
http://hai.ics.uci.edu, 3 +
http://helpdesk.ics.uci.edu, 3 +
http://hobbes.ics.uci.edu, 1 +
http://i-sensorium.ics.uci.edu, 1 +
http://iasl.ics.uci.edu, 17 +
http://industryshowcase.ics.uci.edu, 23 +
http://www.informatics.ics.uci.edu, 1 +
http://intranet.ics.uci.edu, 2 +
http://ipf.ics.uci.edu, 2 +
http://ipubmed.ics.uci.edu, 1 +
http://isg.ics.uci.edu, 104 +
http://jgarcia.ics.uci.edu, 23 +
http://luci.ics.uci.edu, 4 +
http://malek.ics.uci.edu, 1 +
http://mcs.ics.uci.edu, 31 +
http://mdogucu.ics.uci.edu, 1 +
http://mds.ics.uci.edu, 11 +
http://mhcid.ics.uci.edu, 16 +
http://mondego.ics.uci.edu, 3 +
http://motifmap.ics.uci.edu, 2 +
http://mse.ics.uci.edu, 2 +
http://mswe.ics.uci.edu, 16 +
http://mt-live.ics.uci.edu, 1634 +
http://nalini.ics.uci.edu, 7 +
http://ngs.ics.uci.edu, 2000 +
http://perennialpolycultures.ics.uci.edu, 1 +
http://plrg.ics.uci.edu, 14 +
http://psearch.ics.uci.edu, 1 +
http://radicle.ics.uci.edu, 1 +
http://redmiles.ics.uci.edu, 4 +
http://riscit.ics.uci.edu, 1 +
http://sconce.ics.uci.edu, 2 +
http://sdcl.ics.uci.edu, 205 +
http://seal.ics.uci.edu, 6 +
http://sherlock.ics.uci.edu, 7 +
http://sli.ics.uci.edu, 338 +
http://sourcerer.ics.uci.edu, 1 +
http://sprout.ics.uci.edu, 2 +
http://stairs.ics.uci.edu, 4 +
http://statconsulting.ics.uci.edu, 5 +
http://student-council.ics.uci.edu, 1 +
http://studentcouncil.ics.uci.edu, 3 +
http://support.ics.uci.edu, 4 +
http://swiki.ics.uci.edu, 42 +
http://tad.ics.uci.edu, 3 +
http://tastier.ics.uci.edu, 1 +
http://tippers.ics.uci.edu, 1 +
http://tippersweb.ics.uci.edu, 5 +
http://transformativeplay.ics.uci.edu, 58 +
http://tutors.ics.uci.edu, 44 +
http://ugradforms.ics.uci.edu, 3 +
http://unite.ics.uci.edu, 10 +
http://vision.ics.uci.edu, 200 +
http://wearablegames.ics.uci.edu, 11 +
http://wics.ics.uci.edu, 970 +
http://www-db.ics.uci.edu, 10 +
http://xtune.ics.uci.edu, 6 +
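(The trailing " +" on every line above is not diff syntax; it is the literal " + " that the left-hand q234 wrote into q4.txt via f.write("{url}, {num} + \n".format(...)), visible in the frontier.py diff earlier.)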

worker.py

@@ -21,50 +21,37 @@ class Worker(Thread):
             tic = time.perf_counter()
             tbd_url = self.frontier.get_tbd_url()
             toc = time.perf_counter()
-            #print(f"Took {toc - tic:0.4f} seconds to get_tbd_url")
+            print(f"Took {toc - tic:0.4f} seconds to get_tbd_url")
             if not tbd_url:
                 self.logger.info("Frontier is empty. Stopping Crawler.")
                 break
-            self.frontier.acquire_polite(tbd_url)
             tic = time.perf_counter()
             resp = download(tbd_url, self.config, self.logger)
-            start = time.perf_counter()
             toc = time.perf_counter()
-            #print(f"Took {toc - tic:0.4f} seconds to do download url")
-            self.logger.info(
-                f"Downloaded {tbd_url}, status <{resp.status}>, "
-                f"using cache {self.config.cache_server}.")
-            tic = time.perf_counter()
-            scraped_urls = scraper.scraper(tbd_url, resp)
-            toc = time.perf_counter()
-            #print(f"Took {toc - tic:0.4f} seconds to do scrape url")
-            tic = time.perf_counter()
-            self.frontier.acquire_data_mutex()
-            for scraped_url in scraped_urls:
-                self.frontier.add_url(scraped_url)
-            self.frontier.mark_url_complete(tbd_url)
-            self.frontier.release_data_mutex()
-            toc = time.perf_counter()
-            #print(f"Took {toc - tic:0.4f} seconds to do add_url stuffs")
+            print(f"Took {toc - tic:0.4f} seconds to do download url")
             tic = time.perf_counter()
             self.frontier.q1(tbd_url)
             toc = time.perf_counter()
-            #print(f"Took {toc - tic:0.4f} seconds to do log q1 url")
+            print(f"Took {toc - tic:0.4f} seconds to do download url")
             tic = time.perf_counter()
             self.frontier.q234(tbd_url, resp)
             toc = time.perf_counter()
-            #print(f"Took {toc - tic:0.4f} seconds to do log q234 url")
-            while start + self.config.time_delay > time.perf_counter():
-                #print("Sleeping")
-                time.sleep(self.config.time_delay/5)
-            self.frontier.release_polite(tbd_url)
+            print(f"Took {toc - tic:0.4f} seconds to do download url")
+            self.logger.info(
+                f"Downloaded {tbd_url}, status <{resp.status}>, "
+                f"using cache {self.config.cache_server}.")
+            tic = time.perf_counter()
+            scraped_urls = scraper.scraper(tbd_url, resp)
+            toc = time.perf_counter()
+            print(f"Took {toc - tic:0.4f} seconds to do scrape url")
+            tic = time.perf_counter()
+            for scraped_url in scraped_urls:
+                self.frontier.add_url(scraped_url)
+            self.frontier.mark_url_complete(tbd_url)
+            toc = time.perf_counter()
+            print(f"Took {toc - tic:0.4f} seconds to do store stuffs")
+            time.sleep(self.config.time_delay)
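The deleted politeness loop (start = time.perf_counter() before the download, then spinning in while start + self.config.time_delay > time.perf_counter() with time_delay/5 naps) waits out only the remainder of the delay, whereas the surviving code sleeps the full time_delay after every page regardless of how long the download took. A hedged, equivalent one-sleep form of the deleted pattern (illustrative, not code from either branch):

    import time

    start = time.perf_counter()
    resp = download(tbd_url, self.config, self.logger)
    # ... scrape and record ...
    remaining = self.config.time_delay - (time.perf_counter() - start)
    if remaining > 0:
        time.sleep(remaining)   # one sleep instead of polling in time_delay/5 slices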

File diff suppressed because it is too large

scraper.py

@@ -10,27 +10,25 @@ from bs4 import BeautifulSoup
 from robotsokay import *

 def scraper(url, resp):
     links = extract_next_links(url, resp)
     links_valid = set()
-    valid_links = open("valid_links.txt",'a')
-    invalid_links = open("invalid_links.txt",'a')
-    tic = time.perf_counter()
+    #valid_links = open("valid_links.txt",'a')
+    #invalid_links = open("invalid_links.txt",'a')
     for link in links:
+        tic = time.perf_counter()
         if is_valid(link):
             links_valid.add(link)
-            valid_links.write(link + "\n")
+            toc = time.perf_counter()
+            print(f"Took {toc - tic:0.4f} seconds to do validate url")
+            #valid_links.write(link + "\n")
         else:
-            invalid_links.write("From: " + url + "\n")
-            invalid_links.write(link + "\n")
+            # invalid_links.write("From: " + url + "\n")
+            #invalid_links.write(link + "\n")
             pass
-    toc = time.perf_counter()
-    #print(f"Took {toc - tic:0.4f} seconds to validate !!!")
     return links_valid
@@ -47,8 +45,8 @@ def extract_next_links(url, resp):
     pages = set()
     if resp.status == 200:
         #do stuff
-        soup = BeautifulSoup(resp.raw_response.content,'lxml')
-        #tempFile = open("test.txt", 'a')
+        soup = BeautifulSoup(resp.raw_response.content)
+        #tempFile = open("test6.txt", 'a')
         #Getting all the links, href = true means at least theres a href value, dont know what it is yet
         for link in soup.find_all('a', href=True):
             #There is a lot of relative paths stuff here gotta add them
@@ -64,26 +62,16 @@ def extract_next_links(url, resp):
             if(href_link.startswith("/")):
                 href_link = urljoin(url,href_link)
-            if(href_link.startswith("www.")):
-                href_link = "https://" + href_link
             #skipping query with specific actions which mutate the websites and cause a trap
             if "do=" in href_link:
                 continue
             # don't know if this is too expensive, otherwise idk
             # takes parsed url and if not ok on robots goes next, else we can write file
-            """
-            #For now robot checking too time expensive and incorrectly implemented
             parsed = urlparse(href_link)
+            tic = time.perf_counter()
+            print(parsed)
             if not robots_are_ok(parsed):
                 continue
+            toc = time.perf_counter()
+            print(f"Took {toc - tic:0.4f} seconds to robots_are_ok !!!")
-            """
             #tempFile.write(href_link + "\n")
             #Adding to the boi wonder pages
@@ -106,6 +94,7 @@ def is_valid(url):
     # There are already some conditions that return False.
     try:
+        #Gotta check if they are in the domain
         parsed = urlparse(url)
         url_parsed_path = parsed.path.lower() # this may help speed things up a little bit (less calls to parsed.path)
         if parsed.scheme not in set(["http", "https"]):
@@ -120,64 +109,34 @@ def is_valid(url):
             + r"|thmx|mso|arff|rtf|jar|csv"
             + r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",parsed.path.lower()):
             return False
-        elif re.match(
-            #turns out some query also try to download files, which filled stuff with random stuff thats uncessary
-            r".*\.(css|js|bmp|gif|jpe?g|ico"
-            + r"|png|tiff?|mid|mp2|mp3|mp4"
-            + r"|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf"
-            + r"|ps|eps|tex|ppt|pptx|doc|docx|xls|xlsx|names"
-            + r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
-            + r"|epub|dll|cnf|tgz|sha1"
-            + r"|thmx|mso|arff|rtf|jar|csv"
-            + r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",parsed.query.lower()):
-            return False
         elif not re.match(
-            #Making sure domains are respected
-            r".*[./]ics.uci.edu/.*"
-            + r"|.*[./]cs.uci.edu/.*"
-            + r"|.*[./]informatics.uci.edu/.*"
-            + r"|.*[./]stat.uci.edu/.*"
+            r".*ics.uci.edu/.*"
+            + r"|.*cs.uci.edu/.*"
+            + r"|.*informatics.uci.edu/.*"
+            + r"|.*stat.uci.edu/.*"
             + r"|today.uci.edu/department/information_computer_sciences/.*",url):
             return False
-        #Querying dates return usually bad information
-        #anything that ends with a date also usually returns junk also returns nothing useful most of the time
-        #/events/caterogy/ all gives random pages of no information
-        elif re.match(
-            r".*\d{4}-\d{2}-\d{2}",url):
-            return False
-        elif re.match(
-            r".*\d{4}-\d{2}-\d{2}/",url):
-            return False
-        elif re.match(
-            r".*\d{4}-\d{2}-\d{2}",parsed.query):
-            return False
-        elif re.match(
-            r".*\/events/category/.*",url):
-            return False
         elif parsed.fragment:
             return False
         # https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression
         # length check for looping filters and queries (could add hash check for similarity or regex, but don't know if we want to as this works well enought)
         # we can adjust it based on what the cralwer does as well
-        if len(url) > 250:
+        if len(url) > 169:
             return False
-        # this fixes any search box that keeps going page to page, currenty allow a depth of 0 filters
-        # any filter just give you uncesssary information since the original page already has all information for all poeple
-        if re.match(r".*(&filter%.*){1,}",url):
+        # this fixes any search box that keeps going page to page, currenty allow a depth of 2 filters
+        if re.match(r".*(&filter%.*){3,}",url_parsed_path):
             return False
         # this is for urls which when opened, download a file (do we want to download these files and tokenize them)
         # elif re.match(r"^.*\&format=(\D{3,4})\Z$",url_parsed_path):
         #     return False
         # another looping directory check but more advanced than the one contained in is_a_trap
-        if re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",url):
+        if re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",url_parsed_path):
             return False
         # extra directories check (we can add as we find)
-        if re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", url):
+        if re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", url_parsed_path):
             return False
         # calendar checks plus adding or downloading calendar (ical)
-        if re.match(r"^.*calendar.*$",url):
+        if re.match(r"^.*calendar.*$",url_parsed_path):
             return False
         if parsed.query.find('ical') != -1:
             return False
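One consequence of the right-hand switch from url to url_parsed_path: parsed.path never contains the query string, so the "&filter%" check can no longer fire at all; the left-hand version matched against the whole URL. A hedged sanity check for the shared trap filters — the pattern strings are taken verbatim from the diff, but the example strings are made up:

    import re

    loop_re = r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$"   # repeated path segment
    # A repeated /segment/ anywhere in the string matches...
    assert re.match(loop_re, "/community/news/community/news/view")
    # ...while a path of distinct segments does not.
    assert not re.match(loop_re, "/about/contact")

    filter_re = r".*(&filter%.*){3,}"                   # three or more chained facet filters
    assert re.match(filter_re, "/browse?a&filter%5B=x&filter%5B=y&filter%5B=z")
    assert not re.match(filter_re, "/browse?a&filter%5B=x")

    assert re.match(r"^.*calendar.*$", "/events/calendar/2021-05")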

File diff suppressed because it is too large