diff --git a/spacetime-crawler4py-master/crawler/frontier.py b/spacetime-crawler4py-master/crawler/frontier.py
index a1fff94..2106d21 100644
--- a/spacetime-crawler4py-master/crawler/frontier.py
+++ b/spacetime-crawler4py-master/crawler/frontier.py
@@ -108,6 +108,7 @@ class Frontier(object):
         self.save.sync()
         data_mutex.release() ##CRITICAL SECTION
+
@@ -196,4 +197,76 @@ class Frontier(object):
         elif "today.uci.edu/department/information_computer_sciences/" in url:
             return 4
         else:
-            println("ERROR")
\ No newline at end of file
+            print("ERROR")
+
+    def q1(self, url):
+        # rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
+        # Saves to the local directory so I can always reach the right file and check whether it exists.
+        path_to_script = os.path.dirname(os.path.abspath(__file__))
+        my_filename = os.path.join(path_to_script, "q1.txt")
+
+        # Builds a file of all the unique links, one per line; read it back with lines = f.readlines()
+        # and len(lines) to get the number of unique links.
+        if os.path.exists(my_filename):
+            f = open(my_filename, 'a')
+            f.write("{}\n".format(removeFragment(url)))
+            f.close()
+        else:
+            f = open(my_filename, 'w')
+            f.write("{}\n".format(removeFragment(url)))
+            f.close()
+
+    def q234(self, url, resp):
+        # rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
+        # Saves to the local directory so I can always reach the right file and check whether it exists.
+        path_to_script = os.path.dirname(os.path.abspath(__file__))
+        my_filename = os.path.join(path_to_script, "q2.txt")
+
+        tempTok = tokenize(resp)
+        if len(tempTok) > self.max:
+            self.max = len(tempTok)
+            self.longest = url
+            f = open(my_filename, 'w')
+            f.write("Longest Page: {url}, length: {length}".format(url = self.longest, length = self.max))
+            f.close()
+
+        tempTok = removeStopWords(tempTok)
+        computeFrequencies(tempTok, self.grand_dict)
+
+        # rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
+        # Saves to the local directory so I can always reach the right file and check whether it exists.
+        path_to_script = os.path.dirname(os.path.abspath(__file__))
+        my_filename = os.path.join(path_to_script, "q3.txt")
+
+        f = open(my_filename, "w")
+        sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)}
+        i = 0
+        for k, v in sortedGrandDict.items():
+            if i == 50:
+                break
+            else:
+                f.write("{}: {}\n".format(k, v))
+                i += 1
+        f.close()
+
+        fragless = removeFragment(url)
+        domain = findDomains(fragless.netloc)
+        if domain[1] == 'ics':
+            if domain[0] not in self.ics:
+                self.ics[domain[0]] = urlData(url, domain[0], domain[1])
+            else:
+                if fragless not in self.ics[domain[0]].getUniques():
+                    self.ics[domain[0]].appendUnique(fragless)
+
+        # rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
+        # Saves to the local directory so I can always reach the right file and check whether it exists.
+        path_to_script = os.path.dirname(os.path.abspath(__file__))
+        my_filename = os.path.join(path_to_script, "q4.txt")
+
+        # creating the text file for question 4
+        sortedDictKeys = sorted(self.ics.keys())
+        f = open(my_filename, "w")
+        for i in sortedDictKeys:
+            f.write("{url}, {num}\n".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
+        f.close()
+
+
\ No newline at end of file
diff --git a/spacetime-crawler4py-master/crawler/worker.py b/spacetime-crawler4py-master/crawler/worker.py
index 42818a5..a64120a 100644
--- a/spacetime-crawler4py-master/crawler/worker.py
+++ b/spacetime-crawler4py-master/crawler/worker.py
@@ -32,6 +32,16 @@ class Worker(Thread):
             toc = time.perf_counter()
             print(f"Took {toc - tic:0.4f} seconds to do download url")
 
+            tic = time.perf_counter()
+            self.frontier.q1(tbd_url)
+            toc = time.perf_counter()
+            print(f"Took {toc - tic:0.4f} seconds to run q1")
+
+            tic = time.perf_counter()
+            self.frontier.q234(tbd_url, resp)
+            toc = time.perf_counter()
+            print(f"Took {toc - tic:0.4f} seconds to run q234")
+
             self.logger.info(
                 f"Downloaded {tbd_url}, status <{resp.status}>, "
                 f"using cache {self.config.cache_server}.")
diff --git a/spacetime-crawler4py-master/datacollection.py b/spacetime-crawler4py-master/datacollection.py
index 2e5f87f..b6204ae 100644
--- a/spacetime-crawler4py-master/datacollection.py
+++ b/spacetime-crawler4py-master/datacollection.py
@@ -1,5 +1,5 @@
 import re
-
+import os
 import urllib.request
 from urllib.parse import urlparse
 from urllib.parse import urljoin
@@ -80,16 +80,14 @@ def findDomains(url):
             return urlsplit[i-1], urlsplit[i] #something like random.vision.ics.uci.edu will be consider a unique page of vision
     return None, None
 
-def tokenize(url):
+def tokenize(resp):
     # getting connection from url
-    page = urllib.request.urlopen(url)
-    data = page.read()
     valid = re.compile(r'[^a-zA-Z0-9]+')
     # named it tSoup for merge convience
     # need the 'lxml' parser for this.
     # When extract_next_links is called it returns a list full of links with no resp, and I had to find a way to get text from just link.
     # Therefore, I decided to get the plain text this way.
-    tSoup = BeautifulSoup(data, 'lxml')
+    tSoup = BeautifulSoup(resp.raw_response.content, 'lxml')
     # Floyd (1 March 2021) Stackoverflow. https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
     # compared this with tSoup.get_text() and clean_text just provided content easier to tokenize and more inline with my intentions
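Note on reading the q1 report back: the comment in Frontier.q1 says the unique-page count is obtained by reading q1.txt and counting its lines. A minimal read-back sketch under that assumption (one defragmented URL per line; the helper name count_unique_pages is hypothetical and not part of the patch):

import os

def count_unique_pages(report_dir):
    # Counts the unique pages recorded by Frontier.q1, assuming one URL per line in q1.txt.
    path = os.path.join(report_dir, "q1.txt")
    if not os.path.exists(path):
        return 0
    with open(path) as f:
        # A set guards against any URL that was accidentally appended twice.
        return len({line.strip() for line in f if line.strip()})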
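Design note on the q3 report: q234 sorts all of grand_dict into a new dict and breaks out of the loop after 50 entries. An equivalent, shorter sketch using collections.Counter.most_common, assuming grand_dict maps token to count (not part of the patch):

from collections import Counter

def write_top_50(grand_dict, path):
    # Writes the 50 most frequent tokens, highest count first, one "token: count" per line.
    with open(path, "w") as f:
        for token, count in Counter(grand_dict).most_common(50):
            f.write("{}: {}\n".format(token, count))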
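Style note on the worker timing: the tic/toc pattern is now repeated for the download, q1, and q234 steps. A small sketch of a reusable timer that mirrors it (standard library only; the name timed is hypothetical):

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # Prints how long the wrapped block took, matching the worker's existing message format.
    tic = time.perf_counter()
    try:
        yield
    finally:
        toc = time.perf_counter()
        print(f"Took {toc - tic:0.4f} seconds to {label}")

# Usage inside Worker.run, e.g.:
#     with timed("run q1"):
#         self.frontier.q1(tbd_url)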