Compare commits

1 Commit
main ... traps

Author    SHA1          Message                                            Date
Lacerum   9c31a901b7    another attempt at robots, merged regex as well    2022-04-23 14:44:47 -07:00
16 changed files with 60 additions and 1084360 deletions

View File

@@ -1,147 +0,0 @@
2022-04-20 02:33:03,517 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:34:22,952 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:34:41,682 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:37:03,708 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:38:32,080 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:47:05,419 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:47:20,616 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:50:20,425 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 02:58:18,494 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:06:07,345 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:07:45,102 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:08:46,162 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:11:05,040 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:11:21,977 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:12:18,315 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:15:53,501 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:18:48,477 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:20:00,913 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:23:14,356 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:36:00,708 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:37:24,758 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:37:24,758 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:37:42,260 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:42:16,880 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:42:23,013 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:45:40,958 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:49:54,245 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:50:02,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 03:59:18,104 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 04:00:14,031 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 04:01:31,386 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 04:02:16,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-20 18:08:59,911 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:31:49,310 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:32:31,178 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:32:35,094 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:33:25,233 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:33:42,393 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:36:07,413 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:37:56,413 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:38:45,000 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:39:14,157 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:39:50,638 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:39:56,516 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:41:07,005 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:46:01,865 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:46:16,984 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:49:37,689 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:53:43,854 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:54:45,134 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:56:48,517 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 12:57:19,541 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 13:02:40,174 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 13:07:26,611 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:25:16,739 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:27:01,372 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:28:24,395 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:33:03,228 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:33:14,391 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:34:11,862 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:35:05,121 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:36:23,994 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:36:31,564 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:38:41,035 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:39:43,493 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:43:12,698 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:46:27,304 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:52:23,826 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:52:38,658 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 17:59:19,523 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 18:00:51,039 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-22 18:01:45,112 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 00:46:46,850 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 00:49:09,876 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 00:53:26,894 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 00:53:54,532 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:40:16,310 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:41:34,284 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:43:18,453 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:46:32,822 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:47:34,475 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:48:29,467 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:48:56,671 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:50:51,864 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:53:07,556 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:53:56,693 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:54:34,028 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:55:03,124 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:56:20,721 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:59:29,951 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 01:59:57,446 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 02:02:46,431 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 02:05:59,557 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:26:02,713 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:26:15,186 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:26:45,445 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:27:24,255 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:31:59,791 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:32:26,864 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:35:18,046 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:37:12,709 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:37:48,356 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:38:16,370 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:38:22,050 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:38:50,914 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:39:41,890 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:41:44,405 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:43:16,946 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:44:33,013 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:44:54,848 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:46:31,871 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:51:57,008 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:52:42,659 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:54:20,296 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:57:49,247 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 16:59:12,978 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:00:10,268 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:00:41,805 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:01:46,542 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:03:07,751 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:04:06,325 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:06:00,643 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:06:09,928 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:06:50,980 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:07:03,781 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:07:48,403 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:08:32,837 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:10:06,168 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:10:56,162 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:12:04,126 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:13:56,449 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:14:32,348 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:15:10,188 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:15:18,099 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:15:28,945 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:23:44,222 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:24:20,095 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:24:58,182 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:25:29,482 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:25:43,095 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:58:37,549 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 17:58:48,116 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 20:19:11,395 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 20:33:31,301 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-23 22:13:18,206 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-24 10:06:50,608 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
2022-04-24 14:10:29,409 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.

File diff suppressed because it is too large.

View File

@@ -1,14 +1,13 @@
[IDENTIFICATION]
# Set your user agent string here.
USERAGENT = IR US22 19854690,44333574,95241547
USERAGENT = IR US22 19854690,44333574
[CONNECTION]
HOST = styx.ics.uci.edu
PORT = 9000
[CRAWLER]
SEEDURL = https://www.ics.uci.edu,https://www.cs.uci.edu,https://www.informatics.uci.edu,https://www.stat.uci.edu,https://www.eecs.uci.edu
SEEDURL = https://www.ics.uci.edu,https://www.cs.uci.edu,https://www.informatics.uci.edu,https://www.stat.uci.edu
# In seconds
POLITENESS = 0.5
@@ -17,5 +16,5 @@ POLITENESS = 0.5
SAVE = frontier.shelve
# IMPORTANT: DO NOT CHANGE IT IF YOU HAVE NOT IMPLEMENTED MULTITHREADING.
THREADCOUNT = 5
THREADCOUNT = 1
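
For reference, a minimal sketch of how a config file like the one above can be loaded with Python's configparser. The section and option names mirror config.ini; the Config class and its attribute names are assumptions for illustration, not the project's actual loader (which is not part of this diff).

from configparser import ConfigParser

class Config:
    # attribute names here are illustrative; the real crawler's Config is not shown in this diff
    def __init__(self, parser):
        self.user_agent = parser["IDENTIFICATION"]["USERAGENT"].strip()
        self.host = parser["CONNECTION"]["HOST"]
        self.port = parser.getint("CONNECTION", "PORT")
        self.seed_urls = [u.strip() for u in parser["CRAWLER"]["SEEDURL"].split(",")]
        self.time_delay = parser.getfloat("CRAWLER", "POLITENESS")
        self.save_file = parser["CRAWLER"]["SAVE"]
        self.threads_count = parser.getint("CRAWLER", "THREADCOUNT")

parser = ConfigParser()
parser.read("config.ini")
config = Config(parser)
print(config.seed_urls, config.time_delay, config.threads_count)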

View File

@@ -1,57 +1,17 @@
import os
import shelve
from threading import Thread, Lock, Semaphore
from threading import Thread, RLock
from queue import Queue, Empty
import time
from utils import get_logger, get_urlhash, normalize
from scraper import is_valid
from datacollection import *
#*.ics.uci.edu/* 0
#*.cs.uci.edu/* 1
#*.informatics.uci.edu/* 2
#*.stat.uci.edu/* 3
#today.uci.edu/department/information_computer_sciences/* 4
class Frontier(object):
def __init__(self, config, restart):
self.logger = get_logger("FRONTIER")
self.config = config
#Load balancer, list()
self.to_be_downloaded = [set(),set(),set(),set(),set()]
self.balance_index = 0
#Semaphore for each domain to keep each domain nice and tidy with politeness
self.domain_semaphores = [Lock(),Lock(),Lock(),Lock(),Lock()]
#Local data lock
self.data_mutex = Lock()
#File locks for data to make sure everything is thread-safe
self.file_1_mutex = Lock()
self.file_2_mutex = Lock()
self.file_3_mutex = Lock()
self.file_4_mutex = Lock()
# data collection is going to happen in the frontier
# uniques encompass overall unique links
self.uniques = set()
# grand_dict encompasses all the words over the entire set of links
self.grand_dict = dict()
# ics dict contains all subdomains of ics
self.ics = dict()
# used to find the longest page
self.max = -9999
self.longest = None
self.to_be_downloaded = list()
if not os.path.exists(self.config.save_file) and not restart:
# Save file does not exist, but request to load save.
@@ -75,202 +35,38 @@ class Frontier(object):
for url in self.config.seed_urls:
self.add_url(url)
def _parse_save_file(self):
''' This function can be overridden for alternate saving techniques. '''
total_count = len(self.save)
tbd_count = 0
for url, completed in self.save.values():
if not completed and is_valid(url):
self.to_be_downloaded[self.get_domain_index(url)].add(url)
self.to_be_downloaded.append(url)
tbd_count += 1
self.logger.info(
f"Found {tbd_count} urls to be downloaded from {total_count} "
f"total urls discovered.")
def get_tbd_url(self):
###CRITICAL SECTION
self.data_mutex.acquire()
try:
#Load balancing
loop = 10
while not self.to_be_downloaded[self.balance_index] and loop != 0:
self.balance_index = self.balance_index + 1
if self.balance_index > 4:
self.balance_index = 0
loop = loop - 1
if loop == 0:
self.data_mutex.release()
return None
hold = self.to_be_downloaded[self.balance_index].pop()
self.balance_index = self.balance_index + 1
self.data_mutex.release()
#print(hold)
return hold
return self.to_be_downloaded.pop()
except IndexError:
print("POPPING RANDOM SHIT BRO")
self.data_mutex.release()
return None
def add_url(self, url):
url = normalize(url)
urlhash = get_urlhash(url)
##CRITICAL SECTION
if urlhash not in self.save:
self.save[urlhash] = (url, False)
self.save.sync()
self.to_be_downloaded[self.get_domain_index(url)].add(url)
###CRITICAL SECTION
self.to_be_downloaded.append(url)
def mark_url_complete(self, url):
urlhash = get_urlhash(url)
##CRITICAL SECTION
if urlhash not in self.save:
# This should not happen.
self.logger.error(
f"Completed url {url}, but have not seen it before.")
self.save[urlhash] = (url, True)
self.save.sync()
##CRITICAL SECTION
def get_domain_index(self,url):
#note: if you put ics.uci.edu first, it will add all informatics links into that bucket instead
if "informatics.uci.edu" in url:
return 0
elif "ics.uci.edu" in url:
return 1
elif "cs.uci.edu" in url:
return 2
elif "stat.uci.edu" in url:
return 3
elif "today.uci.edu/department/information_computer_sciences/" in url:
return 4
else:
print(url)
print("ERROR")
def acquire_polite(self,url):
return self.domain_semaphores[self.get_domain_index(url)].acquire()
def release_polite(self,url):
return self.domain_semaphores[self.get_domain_index(url)].release()
def acquire_data_mutex(self):
return self.data_mutex.acquire()
def release_data_mutex(self):
return self.data_mutex.release()
def acquire_234_mutex(self):
return self.file_2_3_4_mutex.acquire()
def release_234_mutex(self):
return self.file_2_3_4_mutex.release()
def q1(self, url):
# rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
# this saves to the local directory, so I can constantly access the right file and check if it exists or not
path_to_script = os.path.dirname(os.path.abspath(__file__))
my_filename = os.path.join(path_to_script, "q1.txt")
# Creates a file of all the unique links; read the file, do lines = f.readlines(), then len(lines) to get the number of unique links
#Locking and releasing each file
self.file_1_mutex.acquire()
if (os.path.exists(my_filename)):
f = open(my_filename, 'a')
f.write(str(removeFragment(url)) + "\n")
f.close()
else:
f = open(my_filename, 'w')
f.write(str(removeFragment(url)) + "\n")
f.close()
self.file_1_mutex.release()
def q234(self, url, resp):
# rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
# this saves to the local directory, so I can constantly access the right file and check if it exists or not
if resp.status != 200:
return
tic = time.perf_counter()
path_to_script = os.path.dirname(os.path.abspath(__file__))
my_filename = os.path.join(path_to_script, "q2.txt")
try:
tempTok = tokenize(resp)
self.file_2_mutex.acquire()
if len(tempTok) > self.max:
self.max = len(tempTok)
self.longest = url
f = open(my_filename, 'w')
f.write("Longest Page: {url}, length: {length}".format(url = self.longest, length = self.max))
f.close()
except:
print("resp dying for some reason ?")
self.file_2_mutex.release()
toc = time.perf_counter()
#print(f"Took {toc - tic:0.4f} seconds to save file 2 !")
tic = time.perf_counter()
tempTok = removeStopWords(tempTok)
self.file_3_mutex.acquire()
computeFrequencies(tempTok, self.grand_dict)
# rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
# this saves to the local directory, so I can constantly access the right file and check if it exists or not
path_to_script = os.path.dirname(os.path.abspath(__file__))
my_filename = os.path.join(path_to_script, "q3.txt")
f = open(my_filename, "w")
sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)}
i = 0
for k, v in sortedGrandDict.items():
if i == 50:
break
else:
f.write("{}: {}\n".format(k, v))
i += 1
f.close()
self.file_3_mutex.release()
toc = time.perf_counter()
#print(f"Took {toc - tic:0.4f} seconds to save file 3 !")
tic = time.perf_counter()
fragless = removeFragment(url)
domain = findDomains(fragless.netloc)
self.file_4_mutex.acquire()
if domain[1] == 'ics':
if domain[0] not in self.ics:
self.ics[domain[0]] = urlData(url, domain[0], domain[1])
else:
if fragless not in self.ics[domain[0]].getUniques():
self.ics[domain[0]].appendUnique(fragless)
# rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
# this saves to the local directory, so I can constantly access the right file and check if it exists or not
path_to_script = os.path.dirname(os.path.abspath(__file__))
my_filename = os.path.join(path_to_script, "q4.txt")
# creating text file for question 4
sortedDictKeys = sorted(self.ics.keys())
f = open(my_filename, "w")
for i in sortedDictKeys:
f.write("{url}, {num} + \n".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
f.close()
self.file_4_mutex.release()
toc = time.perf_counter()
#print(f"Took {toc - tic:0.4f} seconds to save file 4 !")

File diff suppressed because it is too large.

View File

@@ -1 +0,0 @@
Longest Page: http://www.ics.uci.edu/~cs224, length: 83259

View File

@@ -1,50 +0,0 @@
research: 71407
computer: 44358
science: 35764
ics: 31878
students: 31271
uci: 30946
events: 28911
news: 28680
student: 28244
information: 28159
informatics: 27680
graduate: 27322
0: 26001
school: 25154
2021: 24609
bren: 24296
data: 23332
us: 22961
undergraduate: 22912
faculty: 22357
2020: 22133
software: 22105
learning: 21218
policies: 20976
1: 19559
contact: 18653
2018: 17102
alumni: 17032
2: 16758
donald: 16690
projects: 16319
2019: 15778
computing: 15414
people: 15237
irvine: 15146
academic: 15127
support: 14680
2017: 14599
view: 14582
2016: 14330
ramesh: 14140
engineering: 13971
university: 13744
may: 13308
sciences: 13175
systems: 13164
course: 12868
statistics: 12582
media: 12577
new: 12501

View File

@@ -1,86 +0,0 @@
http://Transformativeplay.ics.uci.edu, 1 +
http://accessibility.ics.uci.edu, 1 +
http://acoi.ics.uci.edu, 52 +
http://aiclub.ics.uci.edu, 1 +
http://archive.ics.uci.edu, 6 +
http://asterix.ics.uci.edu, 7 +
http://cbcl.ics.uci.edu, 23 +
http://cert.ics.uci.edu, 5 +
http://checkmate.ics.uci.edu, 1 +
http://chenli.ics.uci.edu, 9 +
http://cloudberry.ics.uci.edu, 45 +
http://cml.ics.uci.edu, 172 +
http://code.ics.uci.edu, 12 +
http://computableplant.ics.uci.edu, 33 +
http://cradl.ics.uci.edu, 20 +
http://create.ics.uci.edu, 6 +
http://cwicsocal18.ics.uci.edu, 12 +
http://cyberclub.ics.uci.edu, 14 +
http://dgillen.ics.uci.edu, 19 +
http://duttgroup.ics.uci.edu, 85 +
http://elms.ics.uci.edu, 1 +
http://emj.ics.uci.edu, 45 +
http://evoke.ics.uci.edu, 62 +
http://flamingo.ics.uci.edu, 11 +
http://fr.ics.uci.edu, 3 +
http://frost.ics.uci.edu, 1 +
http://futurehealth.ics.uci.edu, 72 +
http://graphics.ics.uci.edu, 4 +
http://hack.ics.uci.edu, 1 +
http://hai.ics.uci.edu, 3 +
http://helpdesk.ics.uci.edu, 3 +
http://hobbes.ics.uci.edu, 1 +
http://i-sensorium.ics.uci.edu, 1 +
http://iasl.ics.uci.edu, 17 +
http://industryshowcase.ics.uci.edu, 23 +
http://www.informatics.ics.uci.edu, 1 +
http://intranet.ics.uci.edu, 2 +
http://ipf.ics.uci.edu, 2 +
http://ipubmed.ics.uci.edu, 1 +
http://isg.ics.uci.edu, 104 +
http://jgarcia.ics.uci.edu, 23 +
http://luci.ics.uci.edu, 4 +
http://malek.ics.uci.edu, 1 +
http://mcs.ics.uci.edu, 31 +
http://mdogucu.ics.uci.edu, 1 +
http://mds.ics.uci.edu, 11 +
http://mhcid.ics.uci.edu, 16 +
http://mondego.ics.uci.edu, 3 +
http://motifmap.ics.uci.edu, 2 +
http://mse.ics.uci.edu, 2 +
http://mswe.ics.uci.edu, 16 +
http://mt-live.ics.uci.edu, 1634 +
http://nalini.ics.uci.edu, 7 +
http://ngs.ics.uci.edu, 2000 +
http://perennialpolycultures.ics.uci.edu, 1 +
http://plrg.ics.uci.edu, 14 +
http://psearch.ics.uci.edu, 1 +
http://radicle.ics.uci.edu, 1 +
http://redmiles.ics.uci.edu, 4 +
http://riscit.ics.uci.edu, 1 +
http://sconce.ics.uci.edu, 2 +
http://sdcl.ics.uci.edu, 205 +
http://seal.ics.uci.edu, 6 +
http://sherlock.ics.uci.edu, 7 +
http://sli.ics.uci.edu, 338 +
http://sourcerer.ics.uci.edu, 1 +
http://sprout.ics.uci.edu, 2 +
http://stairs.ics.uci.edu, 4 +
http://statconsulting.ics.uci.edu, 5 +
http://student-council.ics.uci.edu, 1 +
http://studentcouncil.ics.uci.edu, 3 +
http://support.ics.uci.edu, 4 +
http://swiki.ics.uci.edu, 42 +
http://tad.ics.uci.edu, 3 +
http://tastier.ics.uci.edu, 1 +
http://tippers.ics.uci.edu, 1 +
http://tippersweb.ics.uci.edu, 5 +
http://transformativeplay.ics.uci.edu, 58 +
http://tutors.ics.uci.edu, 44 +
http://ugradforms.ics.uci.edu, 3 +
http://unite.ics.uci.edu, 10 +
http://vision.ics.uci.edu, 200 +
http://wearablegames.ics.uci.edu, 11 +
http://wics.ics.uci.edu, 970 +
http://www-db.ics.uci.edu, 10 +
http://xtune.ics.uci.edu, 6 +

View File

@@ -18,53 +18,16 @@ class Worker(Thread):
def run(self):
while True:
tic = time.perf_counter()
tbd_url = self.frontier.get_tbd_url()
toc = time.perf_counter()
#print(f"Took {toc - tic:0.4f} seconds to get_tbd_url")
if not tbd_url:
self.logger.info("Frontier is empty. Stopping Crawler.")
break
self.frontier.acquire_polite(tbd_url)
tic = time.perf_counter()
resp = download(tbd_url, self.config, self.logger)
start = time.perf_counter()
toc = time.perf_counter()
#print(f"Took {toc - tic:0.4f} seconds to do download url")
self.logger.info(
f"Downloaded {tbd_url}, status <{resp.status}>, "
f"using cache {self.config.cache_server}.")
tic = time.perf_counter()
scraped_urls = scraper.scraper(tbd_url, resp)
toc = time.perf_counter()
#print(f"Took {toc - tic:0.4f} seconds to do scrape url")
tic = time.perf_counter()
self.frontier.acquire_data_mutex()
for scraped_url in scraped_urls:
self.frontier.add_url(scraped_url)
self.frontier.mark_url_complete(tbd_url)
self.frontier.release_data_mutex()
toc = time.perf_counter()
#print(f"Took {toc - tic:0.4f} seconds to do add_url stuffs")
tic = time.perf_counter()
self.frontier.q1(tbd_url)
toc = time.perf_counter()
#print(f"Took {toc - tic:0.4f} seconds to do log q1 url")
tic = time.perf_counter()
self.frontier.q234(tbd_url, resp)
toc = time.perf_counter()
#print(f"Took {toc - tic:0.4f} seconds to do log q234 url")
while start + self.config.time_delay > time.perf_counter():
#print("Sleeping")
time.sleep(self.config.time_delay/5)
self.frontier.release_polite(tbd_url)
time.sleep(self.config.time_delay)
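
The worker loop above wraps each download in acquire_polite/release_polite and spins until the politeness delay has elapsed before releasing the domain. A hedged sketch of that pattern follows; frontier, download, config and logger are treated as stand-ins for the project's own objects.

import time

def polite_fetch(frontier, url, download, config, logger):
    # hold the domain's politeness lock from before the download
    # until config.time_delay seconds have passed, as in Worker.run above
    frontier.acquire_polite(url)
    try:
        start = time.perf_counter()
        resp = download(url, config, logger)
        # ... scrape resp and add the extracted urls to the frontier here ...
        while start + config.time_delay > time.perf_counter():
            time.sleep(config.time_delay / 5)
        return resp
    finally:
        frontier.release_polite(url)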

View File

@@ -1,123 +0,0 @@
import re
import os
import urllib.request
from urllib.parse import urlparse
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
import re
import html2text
import nltk
nltk.download('stopwords')
nltk.download('words')
nltk.download('punkt')
english_words = words.words()
english_stop_words = stopwords.words('english')
# there is another nltk.download() requirement, but I removed it so I forgot what it was
# it'll show in the console/terminal if you run the code, I believe
# it showed in mine
# To explain this class I have to start by explaining the container I decided on using to keep track of subdomains of ics.uci.edu
# I decided to use a dict. Long story short, I was trying to figure out what to make my key so it would uniquely identify what I needed it to do.
# I was going to use the parsed.netloc; however, we're taking into account that a link that looks like https://somename.vision.ics.uci.edu
# is a unique link of the subdomain vision.
# And so I made the key the subdomain that is before ics.uci.edu in the link, and the value of the dict is this class
# It's a very simple class, so I'm not going to comment on what it does
class urlData:
def __init__(self, url, subdomain, domain):
self.url = url
self.nicelink = "http://" + removeFragment(url).netloc
self.domain = domain
self.subdomain = subdomain
self.uniques = set()
self.uniques.add(removeFragment(url))
def getDomain(self):
return self.domain
def getURL(self):
return self.url
def getNiceLink(self):
return self.nicelink
def getSub(self):
return self.subdomain
def getUniques(self):
return self.uniques
def appendUnique(self, parse):
self.uniques.add(parse)
# Tried to find a library that would do this for me, but couldn't
# It parses the url and uses the netloc to separate the domain and subdomain
def findDomains(url):
urlsplit = url.split('.')
if urlsplit[0].lower() == 'www':
urlsplit.remove('www')
for i in range(len(urlsplit)):
if urlsplit[i] == 'ics':
if i == 0:
return 0, 0
elif i == 1:
return urlsplit[0], urlsplit[1]
else:
return urlsplit[i-1], urlsplit[i] #something like random.vision.ics.uci.edu will be considered a unique page of vision
return None, None
else:
for i in range(len(urlsplit)):
if urlsplit[i] == 'ics':
if i == 0:
return 0, 0
elif i == 1:
return urlsplit[0], urlsplit[1]
else:
return urlsplit[i-1], urlsplit[i] #something like random.vision.ics.uci.edu will be considered a unique page of vision
return None, None
def tokenize(resp):
# regex used to filter out tokens that start with a non-alphanumeric character
valid = re.compile(r'[^a-zA-Z0-9]+')
# named it tSoup for merge convenience
# need the 'lxml' parser for this.
# When extract_next_links is called it returns a list full of links with no resp, and I had to find a way to get text from just the link.
# Therefore, I decided to get the plain text this way.
tSoup = BeautifulSoup(resp.raw_response.content, 'lxml')
# Floyd (1 March 2021) Stackoverflow. https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
# compared this with tSoup.get_text(); clean_text just provided content that was easier to tokenize and more in line with my intentions
clean_text = ' '.join(tSoup.stripped_strings)
token = word_tokenize(clean_text)
clean_token = list()
# This used the nltk corpus to remove the tokens that aren't English words (now commented out below)
#token = [i for i in token if i.lower() in english_words]
for word in token:
if not valid.match(word):
clean_token.append(word.lower())
return clean_token
#added this so the scraper code is not too redundant
def computeFrequencies(tokens, d):
for t in tokens:
if t not in d:
d[t] = 1
else:
d[t] += 1
def removeStopWords(toks):
return [t for t in toks if t.lower() if not t.lower() in english_stop_words]
def removeFragment(u):
# turn into a urlparse object
# removed fragment in order to have "unique" links
removefrag = urlparse(u)
removefrag = removefrag._replace(fragment = '')
return removefrag
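
A hedged usage sketch for the two helpers above: removeFragment returns a ParseResult with the fragment cleared, and findDomains pulls out the subdomain sitting directly in front of "ics" in the hostname. The example URL is made up for illustration.

from urllib.parse import urlparse, urlunparse

url = "https://vision.ics.uci.edu/people.html#faculty"

# what removeFragment(url) does: parse, then clear the fragment
fragless = urlparse(url)._replace(fragment='')
print(urlunparse(fragless))   # https://vision.ics.uci.edu/people.html

# what findDomains(fragless.netloc) boils down to for this hostname
parts = "vision.ics.uci.edu".split('.')   # ['vision', 'ics', 'uci', 'edu']
i = parts.index('ics')
subdomain, domain = parts[i - 1], parts[i]
print(subdomain, domain)      # vision ics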

File diff suppressed because it is too large.

View File

@@ -1,5 +1,6 @@
import re
from urllib import robotparser
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from collections import defaultdict
import requests
@@ -9,6 +10,7 @@ import requests
# https://docs.python.org/3/library/urllib.robotparser.html#urllib.robotparser.RobotFileParser
# http://pymotw.com/2/robotparser/
# https://stackoverflow.com/questions/43085744/parsing-robots-txt-in-python
'''This is ver 1.0
robots_seen = dict() # all robots go here (global so we can store across all sites)
def robots_ok(parsed)->bool:
global robots_seen # global dict for files
@@ -32,4 +34,30 @@ def robots_are_ok(parsed):
return robots_ok(parsed)
else:
return robots_seen[parsed.netloc] # if it has been read return its value
'''
# Ver 1.1 maybe if I am understanding this correctly
robots_seen = dict() # dict of all seen robots files, storing what is not allowed
def robots_ok(url)->bool:
try:
parsed = urlparse(url) # parse url
except:
print("Error in parse for: " + url)
robotstxt = "" # string for location of file
try:
robotstxt = parsed.scheme + "://" + parsed.hostname + "/robots.txt" # location of file
except:
print("Error in parse for robots.txt: " + url)
if robotstxt not in robots_seen: # if url not in dict add to dict
robots_seen[robotstxt] = robotparser.RobotFileParser(robotstxt)
try:
robots_seen[robotstxt].read() # fetch and parse the robots.txt file
except:
del robots_seen[robotstxt]
return True
try:
return robots_seen[robotstxt].can_fetch('*', url)
except:
print("There was an error with: " + url)
return True
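
For reference, a hedged sketch of the standard urllib.robotparser flow that ver 1.1 is aiming for: cache one RobotFileParser per robots.txt URL, load it with set_url()/read(), and answer with can_fetch(). The function name and cache are illustrative, not part of the project.

from urllib import robotparser
from urllib.parse import urlparse

_robots_cache = dict()  # robots.txt url -> RobotFileParser

def allowed_by_robots(url, user_agent='*'):
    parsed = urlparse(url)
    robots_url = parsed.scheme + "://" + parsed.netloc + "/robots.txt"
    if robots_url not in _robots_cache:
        rp = robotparser.RobotFileParser()
        rp.set_url(robots_url)
        try:
            rp.read()          # fetch and parse the robots.txt file
        except Exception:
            return True        # unreadable robots.txt: allow, mirroring the code above
        _robots_cache[robots_url] = rp
    return _robots_cache[robots_url].can_fetch(user_agent, url)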

View File

@@ -1,37 +1,23 @@
from distutils.filelist import findall
from operator import truediv
import re
import time
import urllib.request
from urllib import robotparser
from urllib.parse import urlparse
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from robotsokay import *
def scraper(url, resp):
links = extract_next_links(url, resp)
links_valid = set()
links_valid = list()
valid_links = open("valid_links.txt",'a')
invalid_links = open("invalid_links.txt",'a')
tic = time.perf_counter()
for link in links:
if is_valid(link):
links_valid.add(link)
links_valid.append(link)
valid_links.write(link + "\n")
else:
invalid_links.write("From: " + url + "\n")
invalid_links.write(link + "\n")
pass
toc = time.perf_counter()
#print(f"Took {toc - tic:0.4f} seconds to validate !!!")
return links_valid
def extract_next_links(url, resp):
@@ -44,11 +30,11 @@ def extract_next_links(url, resp):
# resp.raw_response.url: the url, again
# resp.raw_response.content: the content of the page!
# Return a list with the hyperlinks (as strings) scraped from resp.raw_response.content
pages = set()
pages = list()
if resp.status == 200:
#do stuff
soup = BeautifulSoup(resp.raw_response.content,'lxml')
#tempFile = open("test.txt", 'a')
soup = BeautifulSoup(resp.raw_response.content)
tempFile = open("test6.txt", 'a')
#Getting all the links, href=True means there's at least a href value, don't know what it is yet
for link in soup.find_all('a', href=True):
#There is a lot of relative path stuff here, gotta join them with the base url
@@ -64,30 +50,13 @@ def extract_next_links(url, resp):
if(href_link.startswith("/")):
href_link = urljoin(url,href_link)
if(href_link.startswith("www.")):
href_link = "https://" + href_link
#skipping queries with specific actions which mutate the website and cause a trap
if "do=" in href_link:
continue
# don't know if this is too expensive
# takes the parsed url; if robots.txt is not ok with it, skip to the next link, else we can write the file
"""
#For now robots checking is too time-expensive and incorrectly implemented, so it is disabled
parsed = urlparse(href_link)
tic = time.perf_counter()
print(parsed)
if not robots_are_ok(parsed):
continue
toc = time.perf_counter()
print(f"Took {toc - tic:0.4f} seconds to robots_are_ok !!!")
"""
#tempFile.write(href_link + "\n")
tempFile.write(href_link + "\n")
#Adding to the boi wonder pages
pages.add(href_link)
pages.append(href_link)
else:
print("Page error !")
return pages
@@ -106,6 +75,7 @@ def is_valid(url):
# There are already some conditions that return False.
try:
#Gotta check if they are in the domain
parsed = urlparse(url)
url_parsed_path = parsed.path.lower() # this may help speed things up a little bit (fewer calls to parsed.path)
if parsed.scheme not in set(["http", "https"]):
@@ -118,72 +88,36 @@ def is_valid(url):
+ r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
+ r"|epub|dll|cnf|tgz|sha1"
+ r"|thmx|mso|arff|rtf|jar|csv"
+ r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",parsed.path.lower()):
return False
elif re.match(
#turns out some queries also try to download files, which filled things with random stuff that's unnecessary
r".*\.(css|js|bmp|gif|jpe?g|ico"
+ r"|png|tiff?|mid|mp2|mp3|mp4"
+ r"|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf"
+ r"|ps|eps|tex|ppt|pptx|doc|docx|xls|xlsx|names"
+ r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
+ r"|epub|dll|cnf|tgz|sha1"
+ r"|thmx|mso|arff|rtf|jar|csv"
+ r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",parsed.query.lower()):
+ r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",url_parsed_path):
return False
elif not re.match(
#Making sure domains are respected
r".*[./]ics.uci.edu/.*"
+ r"|.*[./]cs.uci.edu/.*"
+ r"|.*[./]informatics.uci.edu/.*"
+ r"|.*[./]stat.uci.edu/.*"
r".*ics.uci.edu/.*"
+ r"|.*cs.uci.edu/.*"
+ r"|.*informatics.uci.edu/.*"
+ r"|.*stat.uci.edu/.*"
+ r"|today.uci.edu/department/information_computer_sciences/.*",url):
return False
#Queries with dates usually return bad information
#anything that ends with a date also usually returns junk and nothing useful most of the time
#/events/category/ pages all give random pages with no information
elif re.match(
r".*\d{4}-\d{2}-\d{2}",url):
return False
elif re.match(
r".*\d{4}-\d{2}-\d{2}/",url):
return False
elif re.match(
r".*\d{4}-\d{2}-\d{2}",parsed.query):
return False
elif re.match(
r".*\/events/category/.*",url):
return False
elif parsed.fragment:
return False
# https://support.archive-it.org/hc/en-us/articles/208332963-Modify-crawl-scope-with-a-Regular-Expression
# length check for looping filters and queries (could add hash check for similarity or regex, but don't know if we want to as this works well enough)
# we can adjust it based on what the crawler does as well
if len(url) > 250:
if len(url) > 169:
return False
# this fixes any search box that keeps going page to page, currently allows a depth of 0 filters
# any filter just gives you unnecessary information since the original page already has all the information for all people
if re.match(r".*(&filter%.*){1,}",url):
if robots_ok(url) == False: # if robots returns false then no go
return False
if re.match(r".*(&filter%.*){3,}" # this fixes any search box that keeps going page to page, currenty allow a depth of 2 filters
+ r"|^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$" # looping directory check
+ r"|^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$" # extra directories check (we can add as we find)
+ r"|^.*calendar.*$",url_parsed_path): # calendar checks plus adding or downloading calendar (ical)
return False
# this is for urls which, when opened, download a file (do we want to download these files and tokenize them?)
# elif re.match(r"^.*\&format=(\D{3,4})\Z$",url_parsed_path):
# return False
# another looping directory check but more advanced than the one contained in is_a_trap
if re.match(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$",url):
return False
# extra directories check (we can add as we find)
if re.match(r"^.*(/misc|/sites|/all|/themes|/modules|/profiles|/css|/field|/node|/theme){3}.*$", url):
return False
# calendar checks plus adding or downloading calendar (ical)
if re.match(r"^.*calendar.*$",url):
return False
if parsed.query.find('ical') != -1:
return False
else:
return True
except TypeError:
print ("TypeError for ", parsed)
raise
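
The looping-directory pattern in the merged regex above, ^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$, rejects paths where the same /segment/ shows up twice, a common crawler-trap shape. A hedged demo on made-up paths:

import re

LOOP = re.compile(r"^.*?(/.+?/).*?\1.*$|^.*?/(.+?/)\2.*$")

samples = [
    "/wiki/doku.php/projects/projects/overview",  # "projects/" repeated back to back -> reject
    "/a/b/a/b/a/b/index.html",                    # "/a/b/" loop -> reject
    "/research/areas/machine-learning.html",      # no repeated segment -> keep
]
for path in samples:
    print(path, "->", "reject" if LOOP.match(path) else "keep")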

File diff suppressed because it is too large.