Change more syntax to get data collection working; collect extracted URLs and validated links in sets instead of lists to significantly reduce duplicate URL extractions
This commit is contained in:
parent d0dde4a4db
commit 58d15918d5
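
The theme of this commit is swapping list containers for sets wherever links accumulate, so duplicate hyperlinks collapse to a single entry instead of being revisited. A minimal sketch of why that cuts work (illustrative data, not the repo's code):

    # Sketch: the same three extracted links, stored both ways.
    extracted = ["https://www.stat.uci.edu", "https://www.stat.uci.edu", "https://www.stat.uci.edu/about"]

    as_list = list()
    for link in extracted:
        as_list.append(link)   # keeps every duplicate: 3 entries

    as_set = set()
    for link in extracted:
        as_set.add(link)       # deduplicates on insert: 2 entries

    print(len(as_list), len(as_set))   # prints: 3 2

Beyond deduplication, set membership tests are O(1) on average versus O(n) for a list, which matters when the same navigation links appear on every crawled page.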
@@ -26,3 +26,7 @@
 2022-04-20 03:45:40,958 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
 2022-04-20 03:49:54,245 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
 2022-04-20 03:50:02,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 03:59:18,104 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 04:00:14,031 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 04:01:31,386 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 04:02:16,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
@@ -28,3 +28,9 @@
 2022-04-20 03:46:32,554 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/from-theory-to-practice-ucis-machine-learning-hackathon-delivers, status <200>, using cache ('styx.ics.uci.edu', 9009).
 2022-04-20 03:49:54,351 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
 2022-04-20 03:50:02,145 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 03:52:31,224 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 03:59:18,220 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:00:14,134 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:01:31,499 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:02:16,153 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:02:35,264 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/stephan-mandt-and-collaborators-receive-3-5-million-to-study-machine-learning-for-climate-science, status <200>, using cache ('styx.ics.uci.edu', 9009).
@@ -111,17 +111,17 @@ class Frontier(object):
 
 
         f = open("q1.txt", "w")
-        f.write("Number of unique pages: {length}\n".format(length = len(uniques)))
+        f.write("Number of unique pages: {length}\n".format(length = len(self.uniques)))
         f.close()
 
         # creating text file for question 2
         f = open("q2.txt", "w")
-        f.write("Largest page url: {url} \nLength of page: {length}".format(url = longest, length = max))
+        f.write("Largest page url: {url} \nLength of page: {length}".format(url = self.longest, length = self.max))
         f.close()
 
         # creating text file for question 3
         f = open("q3.txt", "w")
-        sortedGrandDict = {k: v for k, v in sorted(grand_dict.items(), key=lambda item: item[1], reverse = True)}
+        sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)}
        i = 0
        for k, v in sortedGrandDict.items():
            if i == 50:
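
The q3 hunk sorts a frequency dictionary by value, descending, then walks the first 50 entries; since Python 3.7 a dict comprehension preserves its insertion order, so iterating sortedGrandDict yields words from most to least common. A standalone sketch of the same pattern (the contents of grand_dict are assumed here):

    # Sketch of the q3 report logic; grand_dict maps word -> count (assumed).
    grand_dict = {"machine": 42, "learning": 37, "uci": 55}
    sortedGrandDict = {k: v for k, v in sorted(grand_dict.items(), key=lambda item: item[1], reverse=True)}

    with open("q3.txt", "w") as f:
        for i, (word, count) in enumerate(sortedGrandDict.items()):
            if i == 50:        # report only the top 50 words
                break
            f.write(f"{word}, {count}\n")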
@@ -132,10 +132,10 @@ class Frontier(object):
         f.close()
 
         # creating text file for question 4
-        sortedDictKeys = sorted(ics.keys())
+        sortedDictKeys = sorted(self.ics.keys())
         f = open("q4.txt", "w")
         for i in sortedDictKeys:
-            f.write("{url}, {num}".format(url = ics[i].getNiceLink(), num = len(ics[i].getUniques())))
+            f.write("{url}, {num}".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
         f.close()
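
All four frontier hunks make the same repair: the report state (uniques, longest, max, grand_dict, ics) was referenced as bare locals, which raises NameError inside a method, so each reference gains a self. prefix to read the counters the crawler accumulated on the instance. A stripped-down sketch of that pattern, with attribute names taken from the diff and everything else assumed:

    # Sketch only; the real Frontier tracks far more state than this.
    class Frontier(object):
        def __init__(self):
            self.uniques = set()   # unique page URLs seen so far
            self.longest = ""      # URL of the longest page
            self.max = 0           # word count of that page

        def write_report(self):
            # Without the self. prefix these names would be unbound locals.
            f = open("q1.txt", "w")
            f.write("Number of unique pages: {length}\n".format(length = len(self.uniques)))
            f.close()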
@@ -24,7 +24,7 @@ from datacollection import *
 def scraper(url, resp):
     links = extract_next_links(url, resp)
 
-    links_valid = list()
+    links_valid = set()
     #valid_links = open("valid_links.txt",'a')
     #invalid_links = open("invalid_links.txt",'a')
 
@@ -32,7 +32,7 @@ def scraper(url, resp):
     for link in links:
         tic = time.perf_counter()
         if is_valid(link):
-            links_valid.append(link)
+            links_valid.add(link)
         toc = time.perf_counter()
         print(f"Took {toc - tic:0.4f} seconds to do validate url")
         #valid_links.write(link + "\n")
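
Switching links_valid to a set forces .append to become .add, but nothing else in the loop changes. If the per-link timing prints were dropped, the whole filter could also be written as a set comprehension, shown here as an equivalent sketch:

    # Equivalent to the loop above, minus the timing output.
    links_valid = {link for link in links if is_valid(link)}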
@@ -83,7 +83,7 @@ def extract_next_links(url, resp):
     # resp.raw_response.url: the url, again
     # resp.raw_response.content: the content of the page!
     # Return a list with the hyperlinks (as strings) scrapped from resp.raw_response.content
-    pages = list()
+    pages = set()
     if resp.status == 200:
         #do stuff
         soup = BeautifulSoup(resp.raw_response.content)
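
One caveat in the context lines above: calling BeautifulSoup(resp.raw_response.content) without naming a parser makes bs4 guess one and emit a GuessedAtParserWarning. A sketch of the extraction step with an explicit parser, using a hypothetical helper rather than the repo's extract_next_links:

    from urllib.parse import urljoin
    from bs4 import BeautifulSoup

    def extract_hrefs(base_url, html):
        # Parse once with an explicit parser; collect absolute links in a set.
        soup = BeautifulSoup(html, "html.parser")
        pages = set()
        for anchor in soup.find_all("a", href=True):
            pages.add(urljoin(base_url, anchor["href"]))   # resolve relative hrefs
        return pages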
@@ -115,7 +115,7 @@ def extract_next_links(url, resp):
 
                 #tempFile.write(href_link + "\n")
                 #Adding to the boi wonder pages
-                pages.append(href_link)
+                pages.add(href_link)
     else:
         print("Page error !")
     return pages