Added way to save ngrams to index
This commit is contained in:
		
							
								
								
									
										74
									
								
								indexer.py
									
									
									
									
									
								
							
							
						
						
									
										74
									
								
								indexer.py
									
									
									
									
									
								
							| @@ -17,6 +17,7 @@ from bs4 import BeautifulSoup | ||||
| from time import perf_counter | ||||
| import time | ||||
| import threading | ||||
| import pickle | ||||
|  | ||||
|  | ||||
| #Data process | ||||
| @@ -36,10 +37,25 @@ from worker import Worker | ||||
| class Indexer(): | ||||
| 	def __init__(self,restart,trimming): | ||||
| 		#Config stuffs | ||||
| 		self.path = "data/DEV/" | ||||
| 		self.path = "D:/Visual Studio Workspace/CS121/assignment3/data/DEV/" | ||||
| 		self.restart = restart | ||||
| 		self.trimming = trimming | ||||
| 		self.stemmer = PorterStemmer() | ||||
| 		self.id = list() | ||||
|  | ||||
| 		# Creates a pickle file that is a list of urls where the index of the url is the id that the posting refers to. | ||||
| 		p = os.path.dirname(os.path.abspath(__file__)) | ||||
| 		my_filename = os.path.join(p, "urlID.pkl") | ||||
| 		if os.path.exists(my_filename): | ||||
| 			os.remove(my_filename) | ||||
| 		 | ||||
| 		# Creates file and closes it | ||||
| 		self.f = open(my_filename, "wb") | ||||
| 		pickle.dump(id, self.f) | ||||
| 		self.f.close() | ||||
|  | ||||
| 		# Opens for reading for the entire duration of indexer for worker to use  | ||||
| 		self.f = open(my_filename, "rb+") | ||||
| 		 | ||||
| 		#Shelves for index | ||||
| 		#https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html | ||||
| @@ -79,6 +95,9 @@ class Indexer(): | ||||
| 		print(len(list(self.save_4.keys()))) | ||||
| 		print(len(list(self.save_5.keys()))) | ||||
|  | ||||
| 	def get_url_id(self, url): | ||||
| 		return self.id.index(url) | ||||
|  | ||||
| 	def save_index(self,word,posting): | ||||
| 		cur_save = self.get_save_file(word) | ||||
| 		lock = self.get_save_lock(word) | ||||
| @@ -88,7 +107,9 @@ class Indexer(): | ||||
| 			shelve_list = cur_save[word] | ||||
| 			shelve_list.append(posting) | ||||
| 			tic = perf_counter() | ||||
| 			shelve_list.sort(key=lambda x: x.tf_idf, reverse = True) | ||||
| 			# Sort by url id to help with query search | ||||
| 			shelve_list.sort(key=lambda x: x.url) | ||||
| 			# shelve_list.sort(key=lambda x: x.tf_idf, reverse = True) | ||||
| 			toc = perf_counter() | ||||
| 			if toc - tic > 1 : | ||||
| 				print("Took " + str(toc - tic) + "seconds to sort shelve list !") | ||||
| @@ -137,33 +158,22 @@ class Indexer(): | ||||
| 	# 4am | ||||
| 	# https://stackoverflow.com/questions/34449127/sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-documen | ||||
|  | ||||
| 	# Andy: added parameter important_words in order to do multiplication of score | ||||
| 	def get_tf_idf(self,words,word, important_words): | ||||
| 		#tf_idf | ||||
| 		#words = whole text | ||||
| 		#word the word we finding the score for | ||||
| 		#return the score | ||||
| 	# removed parameter "word" since it wasn't used | ||||
| 	# TODO: Add important words scaling | ||||
| 	def get_tf_idf(self, words): | ||||
| 		# words = [whole text] one element list | ||||
| 		# return the score | ||||
| 		try: | ||||
| 			tfidf = TfidfVectorizer() | ||||
| 			tfidf_matrix = tfidf.fit_transform(words) | ||||
| 			df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out()) | ||||
| 			score = df.iloc[0][''.join(word)] | ||||
| 			for k,v in important_words.items(): | ||||
| 				if k == 'b' and word in v: | ||||
| 					score = score * 1.2 | ||||
| 				elif k == 'h1' and word in v: | ||||
| 					score = score * 1.75 | ||||
| 				elif k == 'h2' and word in v: | ||||
| 					score = score * 1.5 | ||||
| 				elif k == 'h3' and word in v: | ||||
| 					score = score * 1.2 | ||||
| 				elif k == 'title' and word in v: | ||||
| 					score = score * 2 | ||||
| 			return(score) | ||||
| 			#print(df) | ||||
| 		except KeyError:  | ||||
| 			return -1 | ||||
|  | ||||
| 			tfidf = TfidfVectorizer(ngram_range=(1,3)) # ngram_range is range of n-values for different n-grams to be extracted (1,3) gets unigrams, bigrams, trigrams | ||||
| 			tfidf_matrix = tfidf.fit_transform(words)  # fit trains the model, transform creates matrix | ||||
| 			df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out()) # store value of matrix to associated word/n-gram | ||||
| 			#return(df.iloc[0][''.join(word)]) #used for finding single word in dataset | ||||
| 			data = df.to_dict() # transform dataframe to dict *could be expensive the larger the data gets, tested on ~1000 word doc and took 0.002 secs to run | ||||
| 			return data			# returns the dict of words/n-grams with tf-idf | ||||
| 			#print(df)			# debugging  | ||||
| 		except: 		 | ||||
| 			print("Error in tf_idf!") | ||||
| 			return | ||||
|  | ||||
| 	def get_data(self): | ||||
|  | ||||
| @@ -179,6 +189,11 @@ class Indexer(): | ||||
| 				index = 0 | ||||
| 				while True: | ||||
| 					file_path = self.path + "" + directory + "/"+file | ||||
| 					# Add url to id here so that there aren't any problems when the worker is multi-threaded | ||||
| 					load = open(file_path) | ||||
| 					data = json.load(load) | ||||
| 					if data["url"] not in self.id: | ||||
| 						self.id.append(data["url"]) | ||||
| 					if len(threads) < num_threads: | ||||
| 						thread = Worker(self,file_path) | ||||
| 						threads.append(thread) | ||||
| @@ -194,7 +209,8 @@ class Indexer(): | ||||
| 							if(index >= num_threads): | ||||
| 								index = 0 | ||||
| 							time.sleep(.1) | ||||
| 	 | ||||
| 		pickle.dump(self.id, self.f) | ||||
| 		# should I self.f.close() here? | ||||
| 	#Found 55770 documents | ||||
| 	# | ||||
|  | ||||
|   | ||||
							
								
								
									
										0
									
								
								save_1.shelve.bak
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								save_1.shelve.bak
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								save_1.shelve.dat
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								save_1.shelve.dat
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								save_1.shelve.dir
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								save_1.shelve.dir
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								save_2.shelve.bak
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								save_2.shelve.bak
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								save_2.shelve.dat
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								save_2.shelve.dat
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								save_2.shelve.dir
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								save_2.shelve.dir
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								save_3.shelve.bak
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								save_3.shelve.bak
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								save_3.shelve.dat
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								save_3.shelve.dat
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								save_3.shelve.dir
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								save_3.shelve.dir
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								save_4.shelve.bak
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								save_4.shelve.bak
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								save_4.shelve.dat
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								save_4.shelve.dat
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								save_4.shelve.dir
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								save_4.shelve.dir
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								save_5.shelve.bak
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								save_5.shelve.bak
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								save_5.shelve.dat
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								save_5.shelve.dat
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								save_5.shelve.dir
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								save_5.shelve.dir
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										63
									
								
								search.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										63
									
								
								search.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,63 @@ | ||||
| #Data input | ||||
| import json | ||||
| import os | ||||
| import shelve | ||||
| from bs4 import BeautifulSoup | ||||
| from time import perf_counter | ||||
| import time | ||||
| import threading | ||||
|  | ||||
|  | ||||
| #Data process | ||||
| from nltk.tokenize import word_tokenize | ||||
| from nltk.stem import PorterStemmer | ||||
| from sklearn.feature_extraction.text import TfidfVectorizer | ||||
| import pandas as pd | ||||
| import numpy as np | ||||
|  | ||||
| import re | ||||
|  | ||||
| #Logging postings | ||||
| from posting import Posting | ||||
| from worker import Worker | ||||
|  | ||||
| class Search(): | ||||
|  | ||||
|     def __init__(self): | ||||
|         self.save_1 = shelve.open("save_1.shelve") | ||||
|         self.save_2 = shelve.open("save_2.shelve") | ||||
|         self.save_3 = shelve.open("save_3.shelve") | ||||
|         self.save_4 = shelve.open("save_4.shelve") | ||||
|         self.save_5 = shelve.open("save_5.shelve") | ||||
|  | ||||
|     def get_save_file(self, word): | ||||
|         word_lower = word.lower() | ||||
|  | ||||
|         if re.match(r"^[a-d0-1].*", word_lower): | ||||
|             return self.save_1 | ||||
|         elif re.match(r"^[e-k2-3].*", word_lower): | ||||
|             return self.save_2 | ||||
|         elif re.match(r"^[l-q4-7].*", word_lower): | ||||
|             return self.save_3 | ||||
|         elif re.match(r"^[r-z8-9].*", word_lower): | ||||
|             return self.save_4 | ||||
|         else: | ||||
|             return self.save_5 | ||||
|      | ||||
|     def get_userinput(): | ||||
|         return | ||||
|  | ||||
|     def get_tf_idf(self, words): | ||||
|         try: | ||||
|             tfidf = TfidfVectorizer(ngram_range=(1,3)) | ||||
|  | ||||
|     def search(query): | ||||
|         x = [query] | ||||
|          | ||||
|         file = self.get_save_file() | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
							
								
								
									
										28
									
								
								test1.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										28
									
								
								test1.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,28 @@ | ||||
| import json | ||||
| import os | ||||
| import shelve | ||||
| from bs4 import BeautifulSoup | ||||
| from time import perf_counter | ||||
| import time | ||||
| import threading | ||||
| import pickle | ||||
|  | ||||
|  | ||||
| #Data process | ||||
| from nltk.tokenize import word_tokenize | ||||
| from nltk.stem import PorterStemmer | ||||
| from sklearn.feature_extraction.text import TfidfVectorizer | ||||
| import pandas as pd | ||||
| import numpy as np | ||||
| from porter2stemmer import Porter2Stemmer | ||||
|  | ||||
| import re | ||||
|  | ||||
| save_1 = shelve.open("save_1.shelve") | ||||
| save_2 = shelve.open("save_2.shelve") | ||||
| save_3 = shelve.open("save_3.shelve") | ||||
| save_4 = shelve.open("save_4.shelve") | ||||
| save_5 = shelve.open("save_5.shelve") | ||||
|  | ||||
| key = list(save_1.keys()) | ||||
| print(key) | ||||
							
								
								
									
										91
									
								
								worker.py
									
									
									
									
									
								
							
							
						
						
									
										91
									
								
								worker.py
									
									
									
									
									
								
							| @@ -5,6 +5,7 @@ import shelve | ||||
| from bs4 import BeautifulSoup | ||||
| from time import perf_counter | ||||
| import time | ||||
| import pickle | ||||
|  | ||||
| import re | ||||
|  | ||||
| @@ -30,80 +31,26 @@ class Worker(Thread): | ||||
|  | ||||
| 	def run(self): | ||||
| 		print("Target: " + str(self.file)) | ||||
| 		ticker = perf_counter() | ||||
| 		tic = perf_counter() | ||||
|  | ||||
| 		file_load = open(self.file) | ||||
| 		data = json.load(file_load) | ||||
| 		soup = BeautifulSoup(data["content"],features="lxml") | ||||
| 		words = word_tokenize(soup.get_text()) | ||||
| 		toc = perf_counter() | ||||
| 		if toc - tic > 1 : | ||||
| 			print("Took " + str(toc - tic) + "seconds to tokenize text !") | ||||
| 		# Gets a cleaner version of the text compared to soup.get_text() | ||||
| 		clean_text = ' '.join(soup.stripped_strings) | ||||
| 		# Looks for large white space, tabbed space, and other forms of spacing and removes it | ||||
| 		# Regex expression matches for space characters excluding a single space or words | ||||
| 		clean_text = re.sub(r'\s[^ \w]', '', clean_text) | ||||
| 		# Tokenizes text and joins it back into an entire string. Making sure it is an entire string is essential for get_tf_idf to work as intended | ||||
| 		clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)]) | ||||
| 		# Stems tokenized text | ||||
| 		clean_text = " ".join([self.indexer.stemmer.stem(i) for i in clean_text.split()]) | ||||
| 		# Put clean_text as an element in a list because get_tf_idf works properly with single-element lists | ||||
| 		x = [clean_text] | ||||
| 		# ngrams is a dict | ||||
| 		# structure looks like {ngram : {0: tf-idf score}} | ||||
| 		ngrams = self.indexer.get_tf_idf(x) | ||||
|  | ||||
| 		tokenized_words = list() | ||||
| 		stemmed_words = list() | ||||
| 		for ngram, tfidf in ngrams.items(): | ||||
| 			posting = Posting(self.indexer.get_url_id(data["url"]), tfidf[0]) | ||||
| 			self.indexer.save_index(ngram,posting) | ||||
|  | ||||
| 		important = {'b' : [], 'h1' : [], 'h2' : [], 'h3' : [], 'title' : []} | ||||
| 		for key_words in important.keys(): | ||||
| 			for i in soup.findAll(key_words): | ||||
| 				for word in word_tokenize(i.text): | ||||
| 					important[key_words].append(self.indexer.stemmer.stem(word)) | ||||
|  | ||||
| 		tic = perf_counter() | ||||
| 		for word in words: | ||||
| 			if word != "" and re.fullmatch('[A-Za-z0-9]+',word): | ||||
| 				#So all the tokenized words are here, | ||||
| 				tokenized_words.append(word) | ||||
| 		toc = perf_counter() | ||||
| 		if toc - tic > 1 : | ||||
| 			print("Took " + str(toc - tic) + "seconds to isalnum text !") | ||||
| 		#YOUR CODE HERE | ||||
|  | ||||
| 		tic = perf_counter() | ||||
| 		for word in tokenized_words: | ||||
| 			stemmed_words.append(self.indexer.stemmer.stem(word)) | ||||
| 			#stemming, | ||||
| 			#tf_idf | ||||
| 			#get_tf_idf(stemmed_words,word) | ||||
| 			#post = Posting() | ||||
| 		toc = perf_counter() | ||||
| 		if toc - tic > 1 : | ||||
| 			print("Took " + str(toc - tic) + "seconds to stemmed text !") | ||||
|  | ||||
| 		counts = Counter(stemmed_words) | ||||
| 		size = len(stemmed_words) | ||||
| 		for word in counts: | ||||
| 			#posting = Posting(data["url"],self.get_tf_idf(list(' '.join(stemmed_words)),word)) | ||||
| 			tic = perf_counter() | ||||
| 			weight = 1.0 | ||||
| 			index = 0 | ||||
| 			""" | ||||
| 			for group in important: | ||||
| 				for word_important in group: | ||||
| 					if word_important.lower() == word.lower(): | ||||
| 						if index == 0: | ||||
| 							weight = 1.2 | ||||
| 						elif index == 1: | ||||
| 							weight = 1.8 | ||||
| 						elif index == 2: | ||||
| 							weight = 1.5 | ||||
| 						elif index == 3: | ||||
| 							weight = 1.3 | ||||
| 						elif index == 4: | ||||
| 							weight = 2.0 | ||||
| 				index = index + 1 | ||||
| 			""" | ||||
| 			 | ||||
| 			posting = Posting(data["url"],counts[word]/size*weight) | ||||
| 			toc = perf_counter() | ||||
| 			if toc - tic > 1 : | ||||
| 				print("Took " + str(toc - tic) + "seconds to tf_idf text !") | ||||
|  | ||||
| 			tic = perf_counter() | ||||
| 			self.indexer.save_index(word,posting) | ||||
| 			toc = perf_counter() | ||||
| 			if toc - tic > 1 : | ||||
| 				print("Took " + str(toc - tic) + "seconds to save text !") | ||||
|  | ||||
| 		tocker = perf_counter() | ||||
| 		print("Finished " + data['url'] + "\n" + str(tocker-ticker)) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 unknown
					unknown