diff --git a/spacetime-crawler4py-master/datacollection.py b/spacetime-crawler4py-master/datacollection.py
new file mode 100644
index 0000000..2aabb80
--- /dev/null
+++ b/spacetime-crawler4py-master/datacollection.py
@@ -0,0 +1,114 @@
+import re
+import urllib.request
+from urllib.parse import urlparse
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.corpus import words
+import html2text
+import nltk
+# nltk.download('stopwords')
+# nltk.download('words')
+# nltk.download('punkt')  # word_tokenize needs this one as well; nltk prints a download hint in the console if it is missing
+
+# To explain this class I have to start with the container I use to keep track of subdomains of ics.uci.edu.
+# I decided on a dict. I was going to key it on parsed.netloc; however, a link like https://somename.vision.ics.uci.edu
+# should still count as a page of the subdomain vision, so the key is the subdomain that sits right before
+# ics.uci.edu in the link, and the value is an instance of this class.
+# It's a very simple class, so the methods below are not individually commented.
+class urlData:
+    def __init__(self, url, subdomain, domain):
+        self.url = url
+        self.nicelink = "http://" + removeFragment(url).netloc
+        self.domain = domain
+        self.subdomain = subdomain
+        self.uniques = set()
+        self.uniques.add(removeFragment(url))
+
+    def getDomain(self):
+        return self.domain
+
+    def getURL(self):
+        return self.url
+
+    def getNiceLink(self):
+        return self.nicelink
+
+    def getSub(self):
+        return self.subdomain
+
+    def getUniques(self):
+        return self.uniques
+
+    def appendUnique(self, parse):
+        self.uniques.add(parse)
+
+# Tried to find a library that would do this for me, but couldn't.
+# It takes the netloc of a parsed url and splits it to separate the domain and subdomain.
+def findDomains(url):
+    urlsplit = url.split('.')
+    if urlsplit[0].lower() == 'www':
+        urlsplit.pop(0)
+    for i in range(len(urlsplit)):
+        if urlsplit[i] == 'ics':
+            if i == 0:
+                return 0, 0
+            else:
+                # something like random.vision.ics.uci.edu is considered a unique page of vision
+                return urlsplit[i-1], urlsplit[i]
+    return None, None
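+
+# Rough sanity check of findDomains (illustrative only; the hostnames are made-up
+# examples and nothing in the crawler calls this helper):
+def _findDomainsExamples():
+    assert findDomains("vision.ics.uci.edu") == ("vision", "ics")
+    assert findDomains("www.random.vision.ics.uci.edu") == ("vision", "ics")
+    assert findDomains("ics.uci.edu") == (0, 0)          # no subdomain in front of ics
+    assert findDomains("stat.uci.edu") == (None, None)   # not an ics host
+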
+def tokenize(url):
+    # getting connection from url
+    page = urllib.request.urlopen(url)
+    data = page.read()
+
+    # named it tSoup for merge convenience
+    # need the 'lxml' parser for this.
+    # When extract_next_links is called it returns a list full of links with no resp, and I had to find a way to get text from just the link.
+    # Therefore, I decided to get the plain text this way.
+    tSoup = BeautifulSoup(data, 'lxml')
+
+    # Floyd (1 March 2021) Stackoverflow.
+    # https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
+    # compared this with tSoup.get_text(); clean_text just provided content that was easier to tokenize and more in line with my intentions
+    clean_text = ' '.join(tSoup.stripped_strings)
+    token = word_tokenize(clean_text)
+
+    # This uses nltk.corpus.words and just removes the tokens that aren't English words.
+    # Building the set once keeps the membership check from rescanning the whole corpus list for every token.
+    wordSet = set(words.words())
+    token = [i for i in token if i.lower() in wordSet]
+
+    return token
+
+# added this so the scraper code is not too redundant
+def computeFrequencies(tokens, d):
+    for t in tokens:
+        if t not in d:
+            d[t] = 1
+        else:
+            d[t] += 1
+
+def removeStopWords(toks):
+    stopWords = set(stopwords.words('english'))
+    return [t for t in toks if t.lower() not in stopWords]
+
+def removeFragment(u):
+    # turn into a urlparse object
+    # removed the fragment in order to have "unique" links
+    removefrag = urlparse(u)
+    removefrag = removefrag._replace(fragment = '')
+    return removefrag
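+
+# A small offline sanity check, kept out of the crawler's normal flow; it only runs
+# when this file is executed directly. The sample URL and token list are made up for
+# illustration, and removeStopWords assumes the NLTK 'stopwords' corpus is downloaded.
+if __name__ == "__main__":
+    fragless = removeFragment("https://vision.ics.uci.edu/projects/index.html#top")
+    print(fragless.geturl())                    # fragment should be gone
+    print(findDomains(fragless.netloc))         # expected: ('vision', 'ics')
+
+    freqs = dict()
+    toks = removeStopWords(["the", "crawler", "collects", "the", "data"])
+    computeFrequencies(toks, freqs)
+    print(freqs)                                # {'crawler': 1, 'collects': 1, 'data': 1}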
diff --git a/spacetime-crawler4py-master/scraper.py b/spacetime-crawler4py-master/scraper.py
index 36fcee1..52730f7 100644
--- a/spacetime-crawler4py-master/scraper.py
+++ b/spacetime-crawler4py-master/scraper.py
@@ -1,34 +1,115 @@
 import re
+import urllib.request
 from urllib.parse import urlparse
 from urllib.parse import urljoin
 from bs4 import BeautifulSoup
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.corpus import words
+import html2text
+import nltk
+# moved all my helper code to a separate py file (datacollection.py) and imported it here
+from datacollection import *
+
+# nltk.download('stopwords')
+# nltk.download('words')
+# nltk.download('punkt')  # word_tokenize needs this one as well; nltk prints a download hint in the console if it is missing
 
 def scraper(url, resp):
-    #initialize set for unique links
-    #used a set for elimatining duplicates
-    uniques = set()
+    # initialize set for unique links
+    # used a set for eliminating duplicates
+    uniques = set()
+    # have to add the original url to the unique set
+    uniques.add(removeFragment(url))
+
+    # initializing maxLen/longest for finding the longest page
+    maxLen = -1
+    longest = None
+
+    # have to do this for the original url
+    tok = tokenize(url)
+    if len(tok) > maxLen:
+        maxLen = len(tok)
+        longest = url
+
+    # grand_dict is a dictionary that contains every word over the entire set of pages (excluding stop words)
+    # key: word , value: frequency
+    grand_dict = dict()
+    tok = removeStopWords(tok)
+    computeFrequencies(tok, grand_dict)
+
+    # ics is a dict keyed by ics.uci.edu subdomain
+    ics = dict()
+
     links = extract_next_links(url, resp)
     links_valid = list()
     valid_links = open("valid_links.txt",'a')
     invalid_links = open("invalid_links.txt",'a')
+
     for link in links:
         if is_valid(link):
             links_valid.append(link)
             valid_links.write(link + "\n")
-            #turn into a urlparse object
-            #removed fragment in order to have "unique" links
-            remove_frag = urlparse(url)
-            remove_frag = remove_frag._replace(fragment = '')
-            uniques.add(remove_frag)
+            # Answering q1 for report
+            uniques.add(removeFragment(link))
+
+            # Answering q2
+            tempTok = tokenize(link)
+            if len(tempTok) > maxLen:
+                maxLen = len(tempTok)
+                longest = link
+
+            # Answering q3
+            tempTok = removeStopWords(tempTok)
+            computeFrequencies(tempTok, grand_dict)
+
+            # Answering q4
+            fragless = removeFragment(link)
+            domain = findDomains(fragless.netloc)
+            if domain[1] == 'ics':
+                if domain[0] not in ics:
+                    ics[domain[0]] = urlData(link, domain[0], domain[1])
+                elif fragless not in ics[domain[0]].getUniques():
+                    ics[domain[0]].appendUnique(fragless)
+
         else:
             invalid_links.write("From: " + url + "\n")
             invalid_links.write(link + "\n")
 
-    #creating text file that includes the number of unique links
-    f = open("numUniqueLinks.txt", "w")
-    f.write("{length}".format(length = len(uniques)))
+    valid_links.close()
+    invalid_links.close()
+
+    # creating text file that includes the number of unique links
+    f = open("q1.txt", "w")
+    f.write("Number of unique pages: {length}".format(length = len(uniques)))
+    f.close()
+
+    # creating text file for question 2
+    f = open("q2.txt", "w")
+    f.write("Longest page url: {url} \nLength of page: {length}".format(url = longest, length = maxLen))
+    f.close()
+
+    # creating text file for question 3: the 50 most frequent words over all pages
+    f = open("q3.txt", "w")
+    sortedGrandDict = {k: v for k, v in sorted(grand_dict.items(), key=lambda item: item[1], reverse = True)}
+    i = 0
+    for k, v in sortedGrandDict.items():
+        if i == 50:
+            break
+        f.write("{word}: {freq}\n".format(word = k, freq = v))
+        i += 1
+    f.close()
+
+    # creating text file for question 4
+    sortedDictKeys = sorted(ics.keys())
+    f = open("q4.txt", "w")
+    for i in sortedDictKeys:
+        f.write("{url}, {num}\n".format(url = ics[i].getNiceLink(), num = len(ics[i].getUniques())))
     f.close()
 
     return links_valid
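
Note: a minimal standalone sketch of the q3 step above (sorting a word-frequency dict and keeping the 50 most frequent entries). The sample counts and the q3_example.txt file name are made up for illustration; scraper.py itself writes q3.txt inside scraper().

    # toy frequency dict standing in for grand_dict
    grand_dict = {"crawler": 12, "data": 7, "page": 31, "link": 5}

    # sort by frequency, highest first, and keep at most the top 50 entries
    top50 = sorted(grand_dict.items(), key=lambda item: item[1], reverse=True)[:50]

    with open("q3_example.txt", "w") as f:   # hypothetical output file
        for word, freq in top50:
            f.write("{}: {}\n".format(word, freq))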