added functionality for unique links
parent 3e8f57bd34
commit f2cdf66de1
@@ -4,6 +4,10 @@ from urllib.parse import urljoin
 from bs4 import BeautifulSoup
 
 def scraper(url, resp):
+    #initialize set for unique links
+    #used a set for eliminating duplicates
+    uniques = set()
+
     links = extract_next_links(url, resp)
     links_valid = list()
     valid_links = open("valid_links.txt", 'a')
@@ -12,9 +16,21 @@ def scraper(url, resp):
         if is_valid(link):
             links_valid.append(link)
             valid_links.write(link + "\n")
+
+            #turn into a urlparse object
+            #removed fragment in order to have "unique" links
+            remove_frag = urlparse(link)
+            remove_frag = remove_frag._replace(fragment = '')
+            uniques.add(remove_frag)
         else:
             invalid_links.write("From: " + url + "\n")
             invalid_links.write(link + "\n")
+
+    #creating a text file that records the number of unique links
+    f = open("numUniqueLinks.txt", "w")
+    f.write("{length}".format(length = len(uniques)))
+    f.close()
+
     return links_valid
 
 def extract_next_links(url, resp):
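The new lines parse each valid link and blank out its URL fragment before adding it to the uniques set, so two links that differ only after the # count as one page. A minimal, self-contained sketch of that defragment-and-dedup step (standard library only; the defragment helper name is illustrative, not from the commit):

from urllib.parse import urlparse

def defragment(link):
    # Parse the link, blank the fragment, and rebuild the URL string;
    # links that differ only in their #fragment become equal.
    return urlparse(link)._replace(fragment='').geturl()

uniques = set()
for link in ["http://a.com/x#top", "http://a.com/x#bottom", "http://a.com/y"]:
    uniques.add(defragment(link))

print(len(uniques))  # 2: the two #-variants of /x collapse into one

Note that the commit stores the ParseResult itself rather than a URL string; both are hashable, so either deduplicates, but calling .geturl() (or using urllib.parse.urldefrag) keeps the set contents readable as plain URLs.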
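Because uniques is local to scraper, it is rebuilt on every call, so numUniqueLinks.txt ends up holding only the count from the most recently scraped page. If the goal is a crawl-wide count, one option is a module-level set; a sketch under the assumption of a single-threaded crawl (the record_unique helper is illustrative, not from the commit):

from urllib.parse import urldefrag

uniques = set()  # module level: survives across scraper() calls

def record_unique(link):
    # urldefrag() strips the #fragment in one call
    uniques.add(urldefrag(link).url)
    # rewrite the running count, mirroring the commit's approach
    with open("numUniqueLinks.txt", "w") as f:
        f.write(str(len(uniques)))

For a multi-threaded or multi-process crawler this would need a lock or a shared store instead of module state.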