Change more syntax to get data collection working; change extracted URLs and validated links into sets instead of lists to significantly reduce duplicate URL extractions

Hieuhuy Pham 2022-04-20 04:03:58 -07:00
parent d0dde4a4db
commit 58d15918d5
4 changed files with 19 additions and 9 deletions

View File

@@ -26,3 +26,7 @@
 2022-04-20 03:45:40,958 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
 2022-04-20 03:49:54,245 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
 2022-04-20 03:50:02,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 03:59:18,104 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 04:00:14,031 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 04:01:31,386 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 04:02:16,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.

View File

@@ -28,3 +28,9 @@
 2022-04-20 03:46:32,554 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/from-theory-to-practice-ucis-machine-learning-hackathon-delivers, status <200>, using cache ('styx.ics.uci.edu', 9009).
 2022-04-20 03:49:54,351 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
 2022-04-20 03:50:02,145 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 03:52:31,224 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 03:59:18,220 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:00:14,134 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:01:31,499 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:02:16,153 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:02:35,264 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/stephan-mandt-and-collaborators-receive-3-5-million-to-study-machine-learning-for-climate-science, status <200>, using cache ('styx.ics.uci.edu', 9009).

View File

@@ -111,17 +111,17 @@ class Frontier(object):
         f = open("q1.txt", "w")
-        f.write("Number of unique pages: {length}\n".format(length = len(uniques)))
+        f.write("Number of unique pages: {length}\n".format(length = len(self.uniques)))
         f.close()
         # creating text file for question 2
         f = open("q2.txt", "w")
-        f.write("Largest page url: {url} \nLength of page: {length}".format(url = longest, length = max))
+        f.write("Largest page url: {url} \nLength of page: {length}".format(url = self.longest, length = self.max))
         f.close()
         # creating text file for question 3
         f = open("q3.txt", "w")
-        sortedGrandDict = {k: v for k, v in sorted(grand_dict.items(), key=lambda item: item[1], reverse = True)}
+        sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)}
         i = 0
         for k, v in sortedGrandDict.items():
             if i == 50:
@@ -132,10 +132,10 @@ class Frontier(object):
         f.close()
         # creating text file for question 4
-        sortedDictKeys = sorted(ics.keys())
+        sortedDictKeys = sorted(self.ics.keys())
         f = open("q4.txt", "w")
         for i in sortedDictKeys:
-            f.write("{url}, {num}".format(url = ics[i].getNiceLink(), num = len(ics[i].getUniques())))
+            f.write("{url}, {num}".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
         f.close()
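
The report-writing code above reads the crawl statistics from attributes on the Frontier instance rather than from local names, which is what the added self. prefixes fix. A minimal sketch of how those attributes might be initialized; the attribute names come from the diff, but the types, starting values, and constructor signature are assumptions, not the repo's actual code:

class Frontier(object):
    def __init__(self):
        self.uniques = set()      # unique page URLs seen so far (q1)
        self.longest = ""         # URL of the longest page found (q2)
        self.max = 0              # length of that longest page (q2)
        self.grand_dict = dict()  # token -> frequency across all crawled pages (q3)
        self.ics = dict()         # subdomain -> per-subdomain stats object (q4)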

View File

@@ -24,7 +24,7 @@ from datacollection import *
 def scraper(url, resp):
     links = extract_next_links(url, resp)
-    links_valid = list()
+    links_valid = set()
     #valid_links = open("valid_links.txt",'a')
     #invalid_links = open("invalid_links.txt",'a')
@@ -32,7 +32,7 @@ def scraper(url, resp):
     for link in links:
         tic = time.perf_counter()
         if is_valid(link):
-            links_valid.append(link)
+            links_valid.add(link)
         toc = time.perf_counter()
         print(f"Took {toc - tic:0.4f} seconds to do validate url")
         #valid_links.write(link + "\n")
@@ -83,7 +83,7 @@ def extract_next_links(url, resp):
     # resp.raw_response.url: the url, again
     # resp.raw_response.content: the content of the page!
     # Return a list with the hyperlinks (as strings) scrapped from resp.raw_response.content
-    pages = list()
+    pages = set()
     if resp.status == 200:
         #do stuff
         soup = BeautifulSoup(resp.raw_response.content)
@@ -115,7 +115,7 @@ def extract_next_links(url, resp):
             #tempFile.write(href_link + "\n")
             #Adding to the boi wonder pages
-            pages.append(href_link)
+            pages.add(href_link)
     else:
         print("Page error !")
     return pages
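
For context on the list -> set change in both scraper and extract_next_links: a set silently drops repeated hrefs as they are added, so each URL is validated and queued at most once per page instead of once per occurrence. A minimal standalone sketch (not code from the repo) showing the difference:

links_as_list = []
links_as_set = set()
for href_link in ["https://www.stat.uci.edu", "https://www.stat.uci.edu", "https://www.ics.uci.edu"]:
    links_as_list.append(href_link)  # list keeps both copies of the duplicate
    links_as_set.add(href_link)      # set stores the duplicate only once
print(len(links_as_list))  # 3
print(len(links_as_set))   # 2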