Change more syntax to get data collection working; store extracted URLs and validated links in sets instead of lists to significantly reduce duplicate URL extractions
parent d0dde4a4db
commit 58d15918d5
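
The substantive change is that scraper() and extract_next_links() now collect links in set() instead of list(), so duplicate hrefs on a page (and duplicate links entering validation) collapse at insertion time instead of being validated and extracted again downstream; set insertion and membership are O(1) on average, so repeats cost almost nothing. A minimal sketch of the pattern, assuming a resp object shaped like the crawler's response wrapper; the parsing loop and the is_valid stub are illustrative placeholders, not the project's exact code:

from bs4 import BeautifulSoup

def is_valid(link):
    # placeholder: the real crawler applies scheme/domain/extension checks here
    return link.startswith("http")

def extract_next_links(url, resp):
    pages = set()  # was list(); a set silently drops duplicate hrefs
    if resp.status == 200:
        soup = BeautifulSoup(resp.raw_response.content, "html.parser")
        for tag in soup.find_all("a", href=True):
            pages.add(tag["href"])  # was pages.append(...)
    else:
        print("Page error !")
    return pages

def scraper(url, resp):
    links = extract_next_links(url, resp)
    links_valid = set()  # was list(); each unique link is validated only once
    for link in links:
        if is_valid(link):
            links_valid.add(link)  # was links_valid.append(...)
    return links_valid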
@@ -26,3 +26,7 @@
 2022-04-20 03:45:40,958 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
 2022-04-20 03:49:54,245 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
 2022-04-20 03:50:02,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 03:59:18,104 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 04:00:14,031 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 04:01:31,386 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
+2022-04-20 04:02:16,043 - FRONTIER - INFO - Found save file frontier.shelve, deleting it.
@@ -28,3 +28,9 @@
 2022-04-20 03:46:32,554 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/from-theory-to-practice-ucis-machine-learning-hackathon-delivers, status <200>, using cache ('styx.ics.uci.edu', 9009).
 2022-04-20 03:49:54,351 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
 2022-04-20 03:50:02,145 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 03:52:31,224 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 03:59:18,220 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:00:14,134 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:01:31,499 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:02:16,153 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu, status <200>, using cache ('styx.ics.uci.edu', 9009).
+2022-04-20 04:02:35,264 - Worker-0 - INFO - Downloaded https://www.stat.uci.edu/stephan-mandt-and-collaborators-receive-3-5-million-to-study-machine-learning-for-climate-science, status <200>, using cache ('styx.ics.uci.edu', 9009).
@@ -111,17 +111,17 @@ class Frontier(object):
 
 
         f = open("q1.txt", "w")
-        f.write("Number of unique pages: {length}\n".format(length = len(uniques)))
+        f.write("Number of unique pages: {length}\n".format(length = len(self.uniques)))
         f.close()
 
         # creating text file for question 2
         f = open("q2.txt", "w")
-        f.write("Largest page url: {url} \nLength of page: {length}".format(url = longest, length = max))
+        f.write("Largest page url: {url} \nLength of page: {length}".format(url = self.longest, length = self.max))
         f.close()
 
         # creating text file for question 3
         f = open("q3.txt", "w")
-        sortedGrandDict = {k: v for k, v in sorted(grand_dict.items(), key=lambda item: item[1], reverse = True)}
+        sortedGrandDict = {k: v for k, v in sorted(self.grand_dict.items(), key=lambda item: item[1], reverse = True)}
        i = 0
        for k, v in sortedGrandDict.items():
            if i == 50:
@@ -132,10 +132,10 @@ class Frontier(object):
         f.close()
 
         # creating text file for question 4
-        sortedDictKeys = sorted(ics.keys())
+        sortedDictKeys = sorted(self.ics.keys())
         f = open("q4.txt", "w")
         for i in sortedDictKeys:
-            f.write("{url}, {num}".format(url = ics[i].getNiceLink(), num = len(ics[i].getUniques())))
+            f.write("{url}, {num}".format(url = self.ics[i].getNiceLink(), num = len(self.ics[i].getUniques())))
         f.close()
 
 
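
The Frontier hunks above all apply the same fix: the q1-q4 report writers now read the crawl statistics as instance attributes (self.uniques, self.longest, self.max, self.grand_dict, self.ics) instead of undefined module-level names. A minimal sketch of what that assumes about the class, with placeholder defaults in __init__ (how these attributes are filled during the crawl is not part of this commit):

class Frontier(object):
    def __init__(self):
        self.uniques = set()    # unique page URLs seen so far
        self.longest = ""       # URL of the longest page encountered
        self.max = 0            # length of that longest page
        self.grand_dict = {}    # token -> frequency across all pages
        self.ics = {}           # subdomain -> stats object exposing getNiceLink()/getUniques()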
@@ -24,7 +24,7 @@ from datacollection import *
 def scraper(url, resp):
     links = extract_next_links(url, resp)
 
-    links_valid = list()
+    links_valid = set()
     #valid_links = open("valid_links.txt",'a')
     #invalid_links = open("invalid_links.txt",'a')
 
@@ -32,7 +32,7 @@ def scraper(url, resp):
     for link in links:
         tic = time.perf_counter()
         if is_valid(link):
-            links_valid.append(link)
+            links_valid.add(link)
         toc = time.perf_counter()
         print(f"Took {toc - tic:0.4f} seconds to do validate url")
         #valid_links.write(link + "\n")
@@ -83,7 +83,7 @@ def extract_next_links(url, resp):
     # resp.raw_response.url: the url, again
     # resp.raw_response.content: the content of the page!
     # Return a list with the hyperlinks (as strings) scrapped from resp.raw_response.content
-    pages = list()
+    pages = set()
     if resp.status == 200:
         #do stuff
         soup = BeautifulSoup(resp.raw_response.content)
@@ -115,7 +115,7 @@ def extract_next_links(url, resp):
 
             #tempFile.write(href_link + "\n")
             #Adding to the boi wonder pages
-            pages.append(href_link)
+            pages.add(href_link)
     else:
         print("Page error !")
     return pages