Merge branch 'data_collection'
commit 58c923f075
@@ -108,6 +108,7 @@ class Frontier(object):
             self.save.sync()
         data_mutex.release()
         ##CRITICAL SECTION
+
@@ -196,4 +197,76 @@ class Frontier(object):
         elif "today.uci.edu/department/information_computer_sciences/" in url:
             return 4
         else:
             print("ERROR")
+
+    def q1(self, url):
+        # rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
+        # this saves to the local directory, so I can constantly access the right file and check if it exists or not
+        path_to_script = os.path.dirname(os.path.abspath(__file__))
+        my_filename = os.path.join(path_to_script, "q1.txt")
+
+        # Will create a file of all the unique links; read the file, do lines = f.readlines(), then len(lines) to get the number of unique links (see the read-back sketch after this hunk)
+        if (os.path.exists(my_filename)):
+            f = open(my_filename, 'a')
+            f.write(removeFragment(url))
+            f.close()
+        else:
+            f = open(my_filename, 'w')
+            f.write(removeFragment(url))
+            f.close()
+
+    def q234(self, url, resp):
+        # rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
+        # this saves to the local directory, so I can constantly access the right file and check if it exists or not
+        path_to_script = os.path.dirname(os.path.abspath(__file__))
+        my_filename = os.path.join(path_to_script, "q2.txt")
+
+        tempTok = tokenize(resp)
+        if len(tempTok) > self.max:
+            self.max = len(tempTok)
+            self.longest = url
+            f = open(my_filename, 'w')
+            f.write("Longest Page: {url}, length: {length}".format(url = self.longest, length = self.max))
+            f.close()
+
+        tempTok = removeStopWords(tempTok)
+        computeFrequencies(tempTok, self.grand_dict)
+
+        # rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
+        # this saves to the local directory, so I can constantly access the right file and check if it exists or not
+        path_to_script = os.path.dirname(os.path.abspath(__file__))
+        my_filename = os.path.join(path_to_script, "q3.txt")
+
+        f = open(my_filename, "w")
+        sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)}
+        i = 0
+        for k, v in sortedGrandDict.items():
+            if i == 50:
+                break
+            else:
+                f.write("{}: {}\n".format(k, v))
+                i += 1
+        f.close()
+
+        fragless = removeFragment(url)
+        domain = findDomains(fragless.netloc)
+        if domain[1] == 'ics':
+            if domain[0] not in self.ics:
+                self.ics[domain[0]] = urlData(url, domain[0], domain[1])
+            else:
+                if fragless not in self.ics[domain[0]].getUniques():
+                    self.ics[domain[0]].appendUnique(fragless)
+
+        # rakslice (8 May 2013) Stackoverflow. https://stackoverflow.com/questions/16430258/creating-a-python-file-in-a-local-directory
+        # this saves to the local directory, so I can constantly access the right file and check if it exists or not
+        path_to_script = os.path.dirname(os.path.abspath(__file__))
+        my_filename = os.path.join(path_to_script, "q4.txt")
+
+        # creating text file for question 4
+        sortedDictKeys = sorted(self.ics.keys())
+        f = open(my_filename, "w")
+        for i in sortedDictKeys:
+            f.write("{url}, {num}".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
+        f.close()
+
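The comment in q1 above describes recovering the unique-page count by reading q1.txt back and counting lines. Below is a minimal read-back sketch along those lines; it assumes one defragmented URL per line (the diff itself writes with no newline and no de-duplication, so that format, and the hypothetical count_unique_pages helper, are assumptions rather than the commit's behavior):

import os

def count_unique_pages(report_dir):
    # Hypothetical helper, not part of the commit: counts distinct
    # defragmented URLs in q1.txt, assuming one URL per line.
    my_filename = os.path.join(report_dir, "q1.txt")
    if not os.path.exists(my_filename):
        return 0
    with open(my_filename) as f:
        lines = f.readlines()  # the readlines()/len() approach named in the q1 comment
    return len(set(line.strip() for line in lines if line.strip()))

Counting over a set rather than len(lines) guards against duplicate appends, since q1 appends on every call.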
@@ -32,6 +32,16 @@ class Worker(Thread):
            toc = time.perf_counter()
            print(f"Took {toc - tic:0.4f} seconds to do download url")

+            tic = time.perf_counter()
+            self.frontier.q1(tbd_url)
+            toc = time.perf_counter()
+            print(f"Took {toc - tic:0.4f} seconds to run q1")
+
+            tic = time.perf_counter()
+            self.frontier.q234(tbd_url, resp)
+            toc = time.perf_counter()
+            print(f"Took {toc - tic:0.4f} seconds to run q234")
+
            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")
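The tic/toc/print pattern in Worker above is repeated for every timed step. One possible way to keep the timing in one place (a sketch, not part of this commit) is a small context manager around time.perf_counter(); the call sites in the usage comment are taken from the diff, the timed helper itself is an assumption:

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # Sketch of a reusable timer: prints how long the wrapped block took.
    tic = time.perf_counter()
    try:
        yield
    finally:
        toc = time.perf_counter()
        print(f"Took {toc - tic:0.4f} seconds to {label}")

# usage, mirroring the calls added in this hunk:
# with timed("run q1"):
#     self.frontier.q1(tbd_url)
# with timed("run q234"):
#     self.frontier.q234(tbd_url, resp)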
@@ -1,5 +1,5 @@
 import re
-
+import os
 import urllib.request
 from urllib.parse import urlparse
 from urllib.parse import urljoin
@@ -80,16 +80,14 @@ def findDomains(url):
             return urlsplit[i-1], urlsplit[i]  # something like random.vision.ics.uci.edu will be considered a unique page of vision
     return None, None
 
-def tokenize(url):
+def tokenize(resp):
     # getting connection from url
-    page = urllib.request.urlopen(url)
-    data = page.read()
     valid = re.compile(r'[^a-zA-Z0-9]+')
     # named it tSoup for merge convenience
     # need the 'lxml' parser for this.
     # When extract_next_links is called it returns a list full of links with no resp, and I had to find a way to get text from just a link.
     # Therefore, I decided to get the plain text this way.
-    tSoup = BeautifulSoup(data, 'lxml')
+    tSoup = BeautifulSoup(resp.raw_response.content, 'lxml')
 
     # Floyd (1 March 2021) Stackoverflow. https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
     # compared this with tSoup.get_text() and clean_text just provided content easier to tokenize and more in line with my intentions
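The tokenize change above stops re-fetching the page with urllib and instead parses the already-downloaded resp.raw_response.content. The overall shape of that path looks roughly like the sketch below; the get_text() extraction and the lower-casing are illustrative assumptions, since the diff's own version keeps the clean-text approach from the Floyd Stack Overflow answer:

import re
from bs4 import BeautifulSoup

def tokenize_sketch(resp):
    # Sketch only: tokenize the cached response body instead of opening the URL again.
    valid = re.compile(r'[^a-zA-Z0-9]+')
    soup = BeautifulSoup(resp.raw_response.content, 'lxml')
    text = soup.get_text(separator=' ')
    # split on runs of non-alphanumeric characters and drop empty pieces
    return [tok.lower() for tok in valid.split(text) if tok]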