finished datacollection

This commit is contained in:
unknown 2022-04-19 22:59:14 -07:00
parent f2cdf66de1
commit 44c86eb51a
2 changed files with 205 additions and 10 deletions

View File

@@ -0,0 +1,114 @@
import re
import urllib.request
from urllib.parse import urlparse
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
import html2text
import nltk
# nltk.download('stopwords')
# nltk.download('words')
# There is one more nltk.download() requirement that I removed, so I forget which one it was;
# it will show up in the console/terminal when you run the code (it did in mine).
# To explain this class, I have to start with the container I chose for keeping track of subdomains of ics.uci.edu.
# I decided on a dict. I needed a key that would uniquely identify each subdomain, and parsed.netloc alone does not
# work, because a link like https://somename.vision.ics.uci.edu should count as a unique page of the subdomain "vision".
# So the key is the subdomain that appears immediately before ics.uci.edu in the link, and the value of the dict is
# an instance of this class.
# It's a very simple class, so I'm not going to comment on what each method does.
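# For illustration only (hypothetical links), the container ends up shaped like:
#   ics = {
#       'vision': urlData("https://vision.ics.uci.edu/about", "vision", "ics"),
#       'cs':     urlData("https://cs.ics.uci.edu/research", "cs", "ics"),
#   }
# where each urlData also tracks the set of unique (fragment-free) URLs seen under its subdomain.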
class urlData:
    def __init__(self, url, subdomain, domain):
        self.url = url
        self.nicelink = "http://" + removeFragment(url).netloc
        self.domain = domain
        self.subdomain = subdomain
        self.uniques = set()
        self.uniques.add(removeFragment(url))

    def getDomain(self):
        return self.domain

    def getURL(self):
        return self.url

    def getNiceLink(self):
        return self.nicelink

    def getSub(self):
        return self.subdomain

    def getUniques(self):
        return self.uniques

    def appendUnique(self, parse):
        self.uniques.add(parse)
# Tried to find a library that would do this for me, but couldn't.
# It takes the netloc of a parsed url and splits it to separate the domain and subdomain.
def findDomains(url):
    urlsplit = url.split('.')
    if urlsplit[0].lower() == 'www':
        urlsplit.remove('www')
    for i in range(len(urlsplit)):
        if urlsplit[i] == 'ics':
            if i == 0:
                return 0, 0
            elif i == 1:
                return urlsplit[0], urlsplit[1]
            else:
                # something like random.vision.ics.uci.edu will be considered a unique page of vision
                return urlsplit[i-1], urlsplit[i]
    return None, None
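# Worked examples, derived from the logic above and shown only for illustration:
#   findDomains("www.vision.ics.uci.edu")      -> ("vision", "ics")
#   findDomains("somename.vision.ics.uci.edu") -> ("vision", "ics")   # counted as a page of "vision"
#   findDomains("ics.uci.edu")                 -> (0, 0)
#   findDomains("www.stat.uci.edu")            -> (None, None)        # not under ics.uci.edu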
def tokenize(url):
    # open a connection to the url
    page = urllib.request.urlopen(url)
    data = page.read()
    # named it tSoup for merge convenience; needs the 'lxml' parser.
    # When extract_next_links is called it returns a list of links with no resp,
    # and I had to find a way to get text from just the link, so I fetch the plain text this way.
    tSoup = BeautifulSoup(data, 'lxml')
    # Floyd (1 March 2021). Stack Overflow. https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
    # Compared this with tSoup.get_text(); clean_text gives content that is easier to tokenize and more in line with my intentions.
    clean_text = ' '.join(tSoup.stripped_strings)
    token = word_tokenize(clean_text)
    # This uses nltk.corpus.words to remove the tokens that aren't English words.
    token = [i for i in token if i.lower() in words.words()]
    return token
# added this so the scraper code is not too redundant
def computeFrequencies(tokens, d):
    for t in tokens:
        if t not in d:
            d[t] = 1
        else:
            d[t] += 1

def removeStopWords(toks):
    stopWords = set(stopwords.words('english'))
    return [t for t in toks if t.lower() not in stopWords]
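# A quick illustration with made-up tokens:
#   d = {}
#   computeFrequencies(removeStopWords(["the", "Crawler", "crawls", "the", "web"]), d)
#   # d is now {"Crawler": 1, "crawls": 1, "web": 1}  ("the" is a stop word; counts are case-sensitive)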
def removeFragment(u):
    # turn into a urlparse object
    # removed fragment in order to have "unique" links
    removefrag = urlparse(u)
    removefrag = removefrag._replace(fragment = '')
    return removefrag
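For reference, a minimal sketch of how these helpers fit together when exercised by hand. The URL is only an illustrative placeholder, and the nltk corpora used above ('stopwords', 'words', and the tokenizer data) are assumed to be downloaded:

if __name__ == "__main__":
    # hypothetical page; any reachable URL under ics.uci.edu would do
    sample = "https://vision.ics.uci.edu/about.html#team"
    sub, dom = findDomains(removeFragment(sample).netloc)  # ("vision", "ics")
    toks = removeStopWords(tokenize(sample))                # fetches the page, so this needs network access
    freqs = dict()
    computeFrequencies(toks, freqs)
    print(sub, dom, len(freqs))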

View File

@@ -1,34 +1,115 @@
import re
import urllib.request
from urllib.parse import urlparse
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
import html2text
import nltk
# moved all my code to a separate py file and imported it here
from datacollection import *
# nltk.download('stopwords')
# nltk.download('words')
# There is one more nltk.download() requirement that I removed, so I forget which one it was;
# it will show up in the console/terminal when you run the code (it appeared in mine).

def scraper(url, resp):
    # initialize set for unique links
    # used a set for eliminating duplicates
    uniques = set()
    # have to add the original url to the unique set
    copyoriginal = url
    uniques.add(removeFragment(copyoriginal))

    # initializing longest for finding the longest page
    max = -9999
    longest = None
    # have to do this for the original url
    tok = tokenize(url)
    if len(tok) > max:
        max = len(tok)
        longest = url

    # grand_dict is a dictionary that contains every word over the entire set of pages (excluding stop words)
    # key: word, value: frequency
    grand_dict = dict()
    tok = removeStopWords(tok)
    computeFrequencies(tok, grand_dict)

    # ics is a dict of subdomains of ics.uci.edu
    ics = dict()

    links = extract_next_links(url, resp)
    links_valid = list()
    valid_links = open("valid_links.txt", 'a')
    invalid_links = open("invalid_links.txt", 'a')
    for link in links:
        if is_valid(link):
            links_valid.append(link)
            valid_links.write(link + "\n")

            # Answering q1 for the report
            uniques.add(removeFragment(link))

            # Answering q2
            tempTok = tokenize(link)
            if len(tempTok) > max:
                max = len(tempTok)
                longest = link

            # Answering q3
            tempTok = removeStopWords(tempTok)
            computeFrequencies(tempTok, grand_dict)

            # Answering q4
            fragless = removeFragment(link)
            domain = findDomains(fragless.netloc)
            if domain[1] == 'ics':
                if domain[0] not in ics:
                    ics[domain[0]] = urlData(link, domain[0], domain[1])
                else:
                    if fragless not in ics[domain[0]].getUniques():
                        ics[domain[0]].appendUnique(fragless)
        else:
            invalid_links.write("From: " + url + "\n")
            invalid_links.write(link + "\n")

    # creating a text file that includes the number of unique links (question 1)
    f = open("q1.txt", "w")
    f.write("Number of unique pages: {length}".format(length = len(uniques)))
    f.close()

    # creating a text file for question 2
    f = open("q2.txt", "w")
    f.write("Largest page url: {url} \nLength of page: {length}".format(url = longest, length = max))
    f.close()

    # creating a text file for question 3 (50 most common words, by frequency)
    f = open("q3.txt", "w")
    sortedGrandDict = {k: v for k, v in sorted(grand_dict.items(), key=lambda item: item[1], reverse = True)}
    i = 0
    for k, v in sortedGrandDict.items():
        if i == 50:
            break
        f.write("{word}: {freq}\n".format(word = k, freq = v))
        i += 1
    f.close()

    # creating a text file for question 4 (ics.uci.edu subdomains and their unique page counts)
    sortedDictKeys = sorted(ics.keys())
    f = open("q4.txt", "w")
    for i in sortedDictKeys:
        f.write("{url}, {num}\n".format(url = ics[i].getNiceLink(), num = len(ics[i].getUniques())))
    f.close()

    return links_valid
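For reference, a small standalone sketch of the q3 ranking step with made-up frequencies, showing how sorting the word-frequency dict by value in descending order produces the top-N list written to q3.txt:

grand_dict = {"research": 7, "students": 12, "data": 9, "course": 3}  # hypothetical counts
top_n = 2
ranked = sorted(grand_dict.items(), key=lambda item: item[1], reverse=True)
for word, freq in ranked[:top_n]:
    print("{word}: {freq}".format(word=word, freq=freq))
# prints:
# students: 12
# data: 9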