Update indexer.py

had incorrect implementation
Changed tf_idf model into the new one, try it on the current dataset
2022-05-12 17:58:31 -07:00 · 2022-05-12 15:00:09 -07:00 · 2022-05-12 14:30:22 -07:00 · 2022-05-11 14:46:32 -07:00
12 changed files with 291 additions and 700 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,3 @@
 /data/
 *.shelve
 /__pycache__/
-/test/
-merged*
--- a/README.txt
+++ b/README.txt
@@ -1,8 +0,0 @@
-### To create index:
- 1. Make sure that all requirements are installed, check `requirements.txt` and install using `pip install reqirements.txt`. 
- 2. Run `python indexer.py` to build index, this may take some time to run.
- 3. Index is now created.
-### Start search interface:
-Run `python launcher.py` to start the search interface.
-### Perform query:
-To perfrom a search simply enter a query in the textbox and click search. The top results will be displayed.  
--- a/TEST.txt
+++ b/TEST.txt
@@ -1,52 +0,0 @@
-
-### Bad:
- computer science - common
- university of california irvine -common
- donald bren - common
- uci - common
- informatics - common
- The Donald Bren School of Information and Computer Sciences - long and common
- toilet - not likely to be found easily 
- perfume - not likely to be found
- SPY×FAMILY - should not exist in data
- undergraduate - likely to be on tons of pages
-### Good to Meh:
- liquids in labs - uncommon word with common
- Alberto Krone-Martins - should have a good amount of results but not absurd 
- Advising & Planning - should be specific but not too common
- Honors Program - ^
- Papaefthymiou - similar to the martins query 
- General information - there should be quite a few pages with this but not tons
- Prerequisite Clearing System - has some common and uncommon terms
- Recruiting - not stupid common
- counseling - ^ and should only be on a subset of pages
- social justice - specific terms that should appear without being costly
-### Others tested:
- masters of computer science - not super common but will have a good amount of pages
- thornton ics46 notes - name + class + common
- Theory of Computation - two terms which have high count in papers
- facility distribution  - two terms which don't really make sense together
- artificial intelligence history - two common terms with semi-common
- prospective alumni - should have very few instances of both terms but should be found together
- enrollment window - should be on only a couple of pages
- available capstone sponsorship - ^
- spring seminars - common with term that may be somewhat restricted
- hackuci - two terms into one that exists in dataset
- ucinetid help  - specific term with common 
- course restrictions - specific pages
- project management - a course name
- yelan research - term should not exist + common
- hybrid-learning - common phrase 
- genshin is a computer game - contains terms that exist and others that don't 
- computable AI machine learning big data - sentence of CS buzz words (really really common)
- Publications & Technical Reports - in json file
- Tutor coordinators - in many json (bold, title, and body)
- Death Image Service - in some weird areas
- send anonymous email - only in some
-### Things done for improvement
- 1. Create index of index for substantial gain in efficiency and speed.
- 2.  Split TF-IDF into TF and IDF for more specific calculations when needed without the whole computation. This also removes the relevance on external library for TF-IDF.
- 3. Switched from using IDF & weight, to TF & weight for helping with the overall weight.
- 4. Dropped indexing and searching of unigram, bigram, and trigrams.
- 5. Add length of document during indexing for improved speed via normalization calculation.
-
--- a/docs.weight
+++ b/docs.weight
--- a/indexer.py
+++ b/indexer.py
@@ -17,8 +17,6 @@ from bs4 import BeautifulSoup
 from time import perf_counter
 import time
 import threading
-from threading import Lock
-import math


 #Data process
@@ -34,196 +32,204 @@ import re
 from posting import Posting
 from worker import Worker

-class Node():
-	index_value = ''
-	postings = list()
-
-class Index():
-	length = 0
-	index = list()

 class Indexer():
-	def __init__(self,list_partials,weight,data_paths,worker_factory=Worker):
+	def __init__(self,restart,trimming):
 		#Config stuffs
-		self.path = "data/DEV"
-		self.num_doc = 0
-		self.list_partials = list_partials
-		self.weight = weight
-		self.data_paths = data_paths
+		self.path = "data/DEV/"
+		self.restart = restart
+		self.trimming = trimming
 		self.stemmer = PorterStemmer()
-		self.data_paths_lock = Lock()
-		self.list_partials_lock = Lock()

-		self.workers = list()
-		self.worker_factory = worker_factory
+		#Shelves for index
+		#https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html
+		#https://www.irishtimes.com/news/science/how-many-numbers-begin-with-a-1-more-than-30-per-cent-1.4162466
+		#Based on the frequencies above, this is how we split the index across shelves
+		#Save #1 = ABCD + (1) ~ 18.3% of words
+		#Save #2 = EFGHIJK + (2-3)~ 27.1% of words
+		#Save #3 = LMNOPQ + (4-7) ~ 25.4% of words
+		#Save #4 = RSTUVWXYZ + (8-9)~ 29.2% of words
+		#Save #5 = Special characters
+		if os.path.exists("save_1.shelve") and restart:
+			os.remove("save_1.shelve")
+		if os.path.exists("save_2.shelve") and restart:
+			os.remove("save_2.shelve")
+		if os.path.exists("save_3.shelve") and restart:
+			os.remove("save_3.shelve")
+		if os.path.exists("save_4.shelve") and restart:
+			os.remove("save_4.shelve")
+		if os.path.exists("save_5.shelve") and restart:
+			os.remove("save_5.shelve")


-	def start_async(self):
-		self.workers = [
-			self.worker_factory(worker_id,self)
-			for worker_id in range(8)]
-		for worker in self.workers:
-			worker.start()
+		self.save_1 = shelve.open("save_1.shelve")
+		self.save_1_lock = threading.Lock()
+		self.save_2 = shelve.open("save_2.shelve")
+		self.save_2_lock = threading.Lock()
+		self.save_3 = shelve.open("save_3.shelve")
+		self.save_3_lock = threading.Lock()
+		self.save_4 = shelve.open("save_4.shelve")
+		self.save_4_lock = threading.Lock()
+		self.save_5 = shelve.open("save_5.shelve")
+		self.save_5_lock = threading.Lock()

-	def start(self):
-		self.start_async()
-		self.join()
+		print(len(list(self.save_1.keys())))
+		print(len(list(self.save_2.keys())))
+		print(len(list(self.save_3.keys())))
+		print(len(list(self.save_4.keys())))
+		print(len(list(self.save_5.keys())))

-	def join(self):
-		for worker in self.workers:
-			worker.join()
+	def save_index(self,word,posting):
+		cur_save = self.get_save_file(word)
+		lock = self.get_save_lock(word)
+		lock.acquire()
+		shelve_list = list()
+		try:
+			shelve_list = cur_save[word]
+			shelve_list.append(posting)
+			tic = perf_counter()
+			shelve_list.sort(key=lambda x: x.tf_idf, reverse = True)
+			toc = perf_counter()
+			if toc - tic > 1 :
+				print("Took " + str(toc - tic) + "seconds to sort shelve list !")
+			cur_save.sync()
+			lock.release()
+		except:
+			shelve_list.append(posting)
+			cur_save[word] = shelve_list
+			cur_save.sync()
+			lock.release()
+
+	def get_save_file(self,word):
+		#return the correct save depending on the starting letter of word
+		word_lower = word.lower()
+
+		if re.match(r"^[a-d0-1].*",word_lower):
+			return self.save_1
+		elif re.match(r"^[e-k2-3].*",word_lower):
+			return self.save_2
+		elif re.match(r"^[l-q4-7].*",word_lower):
+			return self.save_3
+		elif re.match(r"^[r-z8-9].*",word_lower):
+			return self.save_4
+		else:
+			print(word)
+			print("You have somehow went beyond the magic")
+			return self.save_5
+
+	def get_save_lock(self,word):
+		word_lower = word.lower()
+		if re.match(r"^[a-d0-1].*",word_lower):
+			return self.save_1_lock
+		elif re.match(r"^[e-k2-3].*",word_lower):
+			return self.save_2_lock
+		elif re.match(r"^[l-q4-7].*",word_lower):
+			return self.save_3_lock
+		elif re.match(r"^[r-z8-9].*",word_lower):
+			return self.save_4_lock
+		else:
+			print(word)
+			print("You have somehow went beyond the magic")
+			return self.save_5_lock.acquire()
+
+	# I have a test file (mytest.py) with pandas but couldn't figure out how to grab just a single cell,
+	# so I came up with this. If anyone knows how to get a single cell and can explain it to
+	# me I would love to know, as I think that method might be quicker, maybe, idk, it's like
+	# 4am
+
+	# returns a dict of words/n-grams with their associated tf-idf score *can also return just a single score or a pandas dataframe
+	# https://stackoverflow.com/questions/34449127/sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-documen
+
+	# Andy: added parameter important_words in order to do multiplication of score
+	def get_tf_idf(self,words,word, important_words):
+		#tf_idf
+		#words = whole text
+		#word the word we finding the score for
+		#return the score
+		try:
+			'''
+			tfidf = TfidfVectorizer()
+			tfidf_matrix = tfidf.fit_transform(words)
+			df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out())
+			score = df.iloc[0][''.join(word)]
+			for k,v in important_words.items():
+				if k == 'b' and word in v:
+					score = score * 1.2
+				elif k == 'h1' and word in v:
+					score = score * 1.75
+				elif k == 'h2' and word in v:
+					score = score * 1.5
+				elif k == 'h3' and word in v:
+					score = score * 1.2
+				elif k == 'title' and word in v:
+					score = score * 2
+			return(score)
+			#print(df)
+		except KeyError: 
+			return -1
+			'''
+		try:	
+			tfidf = TfidfVectorizer(ngram_range=(1,3)) # ngram_range is range of n-values for different n-grams to be extracted (1,3) gets unigrams, bigrams, trigrams
+			tfidf_matrix = tfidf.fit_transform(words)  # fit trains the model, transform creates matrix
+			df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out()) # store value of matrix to associated word/n-gram
+			#return(df.iloc[0][''.join(word)]) #used for finding single word in dataset
+			tfidf_dict = df.to_dict() # transform dataframe to dict *could be expensive the larger the data gets, tested on ~1000 word doc and took 0.002 secs to run
+			return tfidf_dict			# returns the dict of words/n-grams with tf-idf as value
+			#print(df)			# debugging 
+		except: 		
+			print("Error in tf_idf!")
+			return


-	def get_postings(self,index):
-		merged_index_index = open("merged_index.index" ,'r')
-		merged_index = open("merged_index.full",'r')
-		merged_index_index.seek(0,0)
-		json_value = merged_index_index.readline()
-		data = json.loads(json_value)
-		index_index = dict(data['index'])
-		to_seek = index_index[index]
-		merged_index.seek(to_seek,0)
-		json_value = merged_index.readline()
-		data = json.loads(json_value)
-		return data['postings']
+	def get_data(self):

-	def set_weight(self):
-		weight_file = open('docs.weight','w')
-		jsonStr =json.dumps(self.weight, default=lambda o: o.__dict__,sort_keys=False)
-		weight_file.write(jsonStr)
-		weight_file.close()
+		num_threads = 1
+		threads = list()

-	def get_weight(self,doc_id):
-		weight = open('docs.weight','r')
-		weight.seek(0,0)
-		json_value = weight.readline()
-		data = json.loads(json_value)
-		return data[doc_id]
-
-	def get_data_path(self):
 		for directory in os.listdir(self.path):
 			for file in os.listdir(self.path + "/" + directory + "/"):
-				self.data_paths.append("data/DEV/" + directory + "/"+file)
-		self.num_doc = len(self.data_paths)
-
-	def get_next_file(self):
-		self.data_paths_lock.acquire()
-		try:
-			holder = self.data_paths.pop()
-			self.data_paths_lock.release()
-			return holder
-		except IndexError:
-			self.data_paths_lock.release()
-			return None
-	
-	def add_partial_index(self,partial_index):
-		self.list_partials_lock.acquire()
-		self.list_partials.append(partial_index)
-		self.list_partials_lock.release()
+				#Actual files here
+				#JSON["url"] = url of crawled page, ignore fragments
+				#JSON["content"] = actual HTML
+				#JSON["encoding"] = ENCODING
+				index = 0
+				while True:
+					file_path = self.path + "" + directory + "/"+file
+					if len(threads) < num_threads:
+						thread = Worker(self,file_path)
+						threads.append(thread)
+						thread.start()
+						break
+					else:
+						if not threads[index].is_alive():
+							threads[index] = Worker(self,file_path)
+							threads[index].start()
+							break
+						else:
+							index = index + 1
+							if(index >= num_threads):
+								index = 0
+							time.sleep(.1)
 	
 	#Found 55770 documents
 	#
+
 				#getting important tokens
 				
-	def merge(self):
-		partial_files = list()
-		partial_index_files = list()
-		parital_index_indices = list()
 						
-		num_indices = len(self.list_partials)

-		#Full Index.Index and Length
-		full_index = Index()
-		full_index.index = list()
-		full_index.length = 0
 		
-		for partial_index in self.list_partials:
-			file = open("temp/" + partial_index+'.partial','r')
-			partial_files.append(file)
-			index = open("temp/" + partial_index+'.index','r')
-			partial_index_files.append(index)

-		for partial_index_file in partial_index_files:
-			partial_index_file.seek(0,0)
-			parital_index_indices.append(json.loads(partial_index_file.readline()))

-		#Start all indexes at 0
-		for partial_file in partial_files:
-			partial_file.seek(0,0)

-		pointers = [0]*num_indices
-		merged_index = open("merged_index.full",'w')
-		merged_index_index = open("merged_index.index" ,'w')
 				
-		while(True):

-			#Get all values from all indices to find min
-			value = None
-			values = list()
-			for i in range(num_indices):
-				if pointers[i] < parital_index_indices[i]['length']:
-					values.append(parital_index_indices[i]['index'][pointers[i]][0])

-			if(len(values) == 0):
-				break
-			value = min(values)
-
-			#Get data from the min value of all indices if exists then save to mergedIndex
-			if value == None:
-				print("I have crashed some how by not getting min value")
-				break
-
-			node = Node()
-			node.index_value = value
-			for i in range(num_indices):
-				if pointers[i] < parital_index_indices[i]['length'] and parital_index_indices[i]['index'][pointers[i]][0] == value:
-					to_seek = parital_index_indices[i]['index'][pointers[i]][1]
-					partial_files[i].seek(to_seek,0)
-					json_value = partial_files[i].readline()
-					temp_node = json.loads(json_value)
-					node.postings = node.postings + temp_node['postings']
-					pointers[i] = pointers[i] + 1
-			#Change postings here with tf*idf idf = log (n/df(t)) 
-			node.postings.sort(key=lambda y:y['doc_id'])
-			for posting in node.postings:
-				posting['tf_idf'] = posting['tf_raw']*math.log(self.num_doc/len(node.postings))
-			full_index.index.append((value,merged_index.tell()))
-			full_index.length = full_index.length + 1
-			jsonStr = json.dumps(node,default=lambda o: o.__dict__,sort_keys=False)
-			merged_index.write(jsonStr + '\n')
-
-		full_index.index.sort(key=lambda y:y[0])
-		jsonStr =json.dumps(full_index, default=lambda o: o.__dict__,sort_keys=False)
-		merged_index_index.write(jsonStr)
-
-		for partial_index in self.list_partials:
-			os.remove("temp/" + partial_index+'.partial')
-			os.remove("temp/" + partial_index+'.index')
-
-		merged_index_index.close()
-		merged_index.close()


 def main():
-	indexer = Indexer(list(),dict(),list())
-	indexer.get_data_path()
-	print("We have " + str(len(indexer.data_paths)) + " documents to go through !" )
-	indexer.start()
-	indexer.merge()
-	print("Finished merging into 1 big happy family")
-	indexer.set_weight()
-
-	tic = time.perf_counter()
-	indexer.get_postings('artifici')
-	toc = time.perf_counter()
-	print(f"Took {toc - tic:0.4f} seconds to get postings for artifici")
-	tic = time.perf_counter()
-	indexer.get_weight('00ba3af6a00b7cfb4928e5d234342c5dc46b4e31714d4a8f315a2dd4d8e49860')
-	print(f"Took {toc - tic:0.4f} seconds to get weight for some random page ")
-	toc = time.perf_counter()
-
-	
-
+	indexer = Indexer(True,0)
+	indexer.get_data()

 if __name__ == "__main__":
 	main()
--- a/posting.py
+++ b/posting.py
@@ -1,16 +1,9 @@
 #Posting class for indexer, will probably be more complex as we keep adding crap to it

 class Posting():
-	def __init__(self,doc_id,url,tf_raw,tf_idf,positionals):
-		self.doc_id = doc_id
+	def __init__(self,url,tf_idf):
 		self.url = url
-		self.tf_raw = tf_raw
 		self.tf_idf = tf_idf
-		self.positionals = positionals
-	def __repr__(self):
-		return "Doc_id:" + str(self.doc_id) + " URL:" + self.url + " tf_raw:" + str(self.tf_raw) + " tf_idf:" + str(self.tf_idf) + " positionals:" + str(self.positionals)
-	def __str__(self):
-		return "Doc_id:" + str(self.doc_id) + " URL:" + self.url + " tf_raw:" + str(self.tf_raw) + " tf_idf:" + str(self.tf_idf) + " positionals:" + str(self.positionals)
 		
 	def comparator(self):
 		#Some custom comparator for sorting postings later
--- a/search.py
+++ b/search.py
@@ -1,111 +0,0 @@
-#Data input
-import json
-import os
-import shelve
-from bs4 import BeautifulSoup
-from time import perf_counter
-import time
-import threading
-import pickle
-
-
-#Data process
-from nltk.tokenize import word_tokenize
-from nltk.stem import PorterStemmer
-from sklearn.feature_extraction.text import TfidfVectorizer
-import pandas as pd
-import numpy as np
-
-import re
-
-#Logging postings
-from posting import Posting
-from worker import Worker
-
-class Search():
-    # wrote the code for testing in the file searchtesting.py so many of the variables and function calls are wrong.
-    def __init__(self):
-        self.stemmer = PorterStemmer()
-        p = os.path.dirname(os.path.abspath(__file__))
-        my_filename = os.path.join(p, "urlID.pkl")
-        self.f = open(my_filename, "rb+")
-        self.id = pickle.load(self.f)
-
-    # takes a list of posting lists returns a list of indexes that correspond to search temp list
-    def two_shortest(self, l_posting):
-        short = []
-        location = []
-        for postings in l_posting:
-            short.append(len(postings))
-        
-        for i in range(2):
-            x = short.index(min(short))
-            location.append(x)
-            short[x] = float('inf')
-        
-        return location
-
-    # len(list1) <= len(list2) So the code in this function works with that in mind
-    def merge(self, list1, list2):
-        merged = []
-        i = 0
-        j = 0
-        # TODO: optimize by having a pointer to the current index+4
-        while i < len(list1) or j < len(list2):
-            if j == len(list2):
-                break
-            if i == len(list1):
-                break
-            # Since list1 is shorter it will hit its max index sooner, 
-            #   so in the cases were it does we still need to go through list2 to see if the last element of list1 appears anywhere in the rest of list2
-            if i == len(list1)-1:
-                if list1[i].url == list2[j].url:
-                    merged.append(list1[i])
-                    j += 1
-                    i += 1
-                elif list1[i].url < list2[j].url:
-                    break
-                else:
-                    j += 1
-            else:
-                if list1[i].url == list2[j].url:
-                    merged.append(list1[i])
-                    i += 1
-                    j += 1
-                elif list1[i].url < list2[j].url:
-                    break
-                else:
-                    i += 1
-                    j += 1
-        return merged
-
-    # query is a list of stemmed tokens, returns a list of postings (which we'll directly ignore except for the doc id)
-    def search(self, query):
-        temp = []
-        for token in query:
-            temp.append(get_index(token))
-        
-        l = two_shortest(temp)
-        m = merge(temp[l[0]], temp[l[1]])
-
-        while len(temp) > 1:
-            # delete from temp the already merged lists
-            del temp[l[0]]
-            del temp[l[1]]
-            temp.append(m)
-
-            l = two_shortest(temp)
-            m = merge(temp[l[0]], temp[l[1]])
-
-        for p in m:
-            print(p.url)
-        
-        # For now going to do a loop through each query's index and match it with the merged list (can be faster if i implement something during merge/search in order to keep track of the postings)
-
-
-
-
-
-
-
-
--- a/searchtesting.py
+++ b/searchtesting.py
@@ -1,117 +0,0 @@
-import math
-import json
-import os
-import shelve
-from bs4 import BeautifulSoup
-from time import perf_counter
-import time
-import threading
-import pickle
-
-
-#Data process
-from nltk.tokenize import word_tokenize
-from nltk.stem import PorterStemmer
-from sklearn.feature_extraction.text import TfidfVectorizer
-import pandas as pd
-import numpy as np
-
-import re
-
-class Posting():
-	def __init__(self, url, rtf, position):
-		self.url = url
-		self.rtf = rtf
-		self.tf = 1
-		self.tfidf = 0
-		self.positions = [position]
-
-
-d = {
-    'a' : [Posting(0, 1, 1), Posting(2, 1, 1), Posting(3, 1, 1), Posting(8, 1, 1)], 
-    'b' :[Posting(0, 1, 1), Posting(8, 1, 1)],
-    'c' : [Posting(0, 1, 1), Posting(1, 1, 1), Posting(2, 1, 1), Posting(8, 1, 1)]
-    }
-
-def get_index(word):
-    for k, v in d.items():
-        if k == word:
-            return v
-
-# takes a list of posting lists returns a list of indexes that correspond to search temp list
-def two_shortest(l_posting):
-    short = []
-    location = []
-    for postings in l_posting:
-        short.append(len(postings))
-    
-    for i in range(2):
-        x = short.index(min(short))
-        location.append(x)
-        short[x] = float('inf')
-    
-    return location
-
-# len(list1) <= len(list2) So the code in this function works with that in mind
-def merge(list1, list2):
-    merged = []
-    i = 0
-    j = 0
-    # TODO: optimize by having a pointer to the current index+4
-    while i < len(list1) or j < len(list2):
-        if j == len(list2):
-            break
-        if i == len(list1):
-            break
-        # Since list1 is shorter it will hit its max index sooner, 
-        #   so in the cases were it does we still need to go through list2 to see if the last element of list1 appears anywhere in the rest of list2
-        if i == len(list1)-1:
-            if list1[i].url == list2[j].url:
-                merged.append(list1[i])
-                j += 1
-                i += 1
-            elif list1[i].url < list2[j].url:
-                break
-            else:
-                j += 1
-        else:
-            if list1[i].url == list2[j].url:
-                merged.append(list1[i])
-                i += 1
-                j += 1
-            elif list1[i].url < list2[j].url:
-                break
-            else:
-                i += 1
-                j += 1
-    return merged, 
-
-# query is a list of stemmed tokens, returns a list of postings (which we'll directly ignore except for the doc id)
-def search(query):
-    temp = []
-    for token in query:
-        temp.append(get_index(token))
-    
-    l = two_shortest(temp)
-    m = merge(temp[l[0]], temp[l[1]])
-
-    while len(temp) > 1:
-        # delete from temp the already merged lists
-        del temp[l[0]]
-        del temp[l[1]]
-        temp.append(m)
-
-        l = two_shortest(temp)
-        m = merge(temp[l[0]], temp[l[1]])
-
-    for p in m:
-        print(p.url)
-    
-    # For now going to do a loop through each query's index and match it with the merged list (can be faster if i implement something during merge/search in order to keep track of the postings)
-    
-    
-
-
-
-search(["a", "b", "c"])
-            
--- a/stemmer.py
+++ b/stemmer.py
@@ -0,0 +1,18 @@
+#Multiple implementations of stemming here please
+class Stemmer():
+
+	def __init__(self,mode, data):
+		#Different type of stemmer = different modes
+		self.mode = mode
+		self.data = data
+
+	def stem(self):
+		#Do stuff here
+		if(self.mode == 0):
+			#Do stemmer 1
+			return #stemmed data
+		#....
+
+	def #name of stemmer 1
+
+	def #name of stemmer 2
--- a/test.py
+++ b/test.py
@@ -1,13 +1,17 @@
-from threading import Thread
-import json
-import os
-import shelve
-import sys
-from bs4 import BeautifulSoup
-from time import perf_counter
-from nltk.stem import PorterStemmer
-import nltk
-import time
-from posting import Posting
-
 import re
+import os
+
+for i in range(99):
+	word_lower = chr(i % 26 + 97) + chr(i % 26 + 97 + 1)
+	print(word_lower)
+	if re.match(r"^[a-d1-1].*",word_lower):
+		print("SAVE 1")
+	elif re.match(r"^[e-k2-3].*",word_lower):
+		print("SAVE 2")
+	elif re.match(r"^[l-q4-7].*",word_lower):
+		print("SAVE 3")
+	elif re.match(r"^[r-z8-9].*",word_lower):
+		print("SAVE 4")
+
+path = "data/DEV/"
+print(os.listdir(path))
--- a/test_merge.py
+++ b/test_merge.py
@@ -1,116 +0,0 @@
-import json
-from posting import Posting
-import math
-import sys
-import random
-from nltk.corpus import words
-random_list = [1,2,3,4,5,6,7,8,9,10]
-
-
-test_data = words.words()
-random.shuffle(test_data)
-
-
-def random_posting(id):
-	return Posting(id,random.choice(random_list),random.choice(random_list),[random.choice(random_list),random.choice(random_list),random.choice(random_list),random.choice(random_list),
-	random.choice(random_list),random.choice(random_list),random.choice(random_list),random.choice(random_list)])
-
-class Node():
-	index_value = 'Something'
-	postings = list()
-
-class Index():
-	length = 0
-	index = list()
-
-def random_partial_index(name):
-	part_index = Index()
-	part_index.index = list()
-	part_index.length = 0
-	with open(name +'.partial', 'w') as f:
-		for i in range(1000):
-
-			node1 = Node()
-			node1.index_value = random.choice(test_data).lower()
-			node1.postings = list()
-			for i in range(10):
-				node1.postings.append(random_posting(i))
-
-			jsonStr = json.dumps(node1, default=lambda o: o.__dict__,sort_keys=False)
-			
-			part_index.index.append((node1.index_value,f.tell()))
-			f.write(jsonStr + '\n')
-			part_index.length = part_index.length + 1
-
-	part_index.index.sort(key=lambda y:y[0])
-	jsonStr =json.dumps(part_index, default=lambda o: o.__dict__,sort_keys=False)
-	with open(name + '.index','w') as f:
-		f.write(jsonStr)
-
-def merge(partial_indices):
-	partial_files = list()
-	partial_index_files = list()
-	parital_index_indices = list()
-	merged_index = open("merged_index.full",'w')
-	num_indices = len(partial_indices)
-
-	#Full Index.Index and Length
-	full_index = Index()
-	full_index.index = list()
-	full_index.length = 0
-
-	for partial_index in partial_indices:
-		file = open(partial_index+'.partial','r')
-		partial_files.append(file)
-		index = open(partial_index+'.index','r')
-		partial_index_files.append(index)
-
-	for partial_index_file in partial_index_files:
-		partial_index_file.seek(0,0)
-		parital_index_indices.append(json.loads(partial_index_file.readline()))
-
-	#Start all indexes at 0
-	for partial_file in partial_files:
-		partial_file.seek(0,0)
-
-	pointers = [0]*num_indices
-
-	while(True):
-
-		#Get all values from all indices to find min
-		value = None
-		values = list()
-		for i in range(num_indices):
-			if pointers[i] < parital_index_indices[i]['length']:
-				values.append(parital_index_indices[i]['index'][pointers[i]][0])
-			
-		if(len(values) == 0):
-			break
-		value = min(values)
-
-		#Get data from the min value of all indices if exists then save to mergedIndex
-		if value == None:
-			print("I have crashed some how by not getting min value")
-			break
-
-		node = Node()
-		node.index_value = value
-		for i in range(num_indices):
-			if pointers[i] < parital_index_indices[i]['length'] and parital_index_indices[i]['index'][pointers[i]][0] == value:
-				to_seek = parital_index_indices[i]['index'][pointers[i]][1]
-				partial_files[i].seek(to_seek,0)
-				json_value = partial_files[i].readline()
-				temp_node = json.loads(json_value)
-				node.postings = node.postings + temp_node['postings']
-				pointers[i] = pointers[i] + 1
-		
-		node.postings.sort(key=lambda y:y['doc_id'])
-		full_index.index.append((value,merged_index.tell()))
-		full_index.length = full_index.length + 1
-		jsonStr = json.dumps(node,default=lambda o: o.__dict__,sort_keys=False)
-		merged_index.write(jsonStr + '\n')
-
-	full_index.index.sort(key=lambda y:y[0])
-	jsonStr =json.dumps(full_index, default=lambda o: o.__dict__,sort_keys=False)
-	with open("merged_index.index" ,'w') as f:
-		f.write(jsonStr)
--- a/worker.py
+++ b/worker.py
@@ -1,137 +1,114 @@
 from threading import Thread
 import json
 import os
-
+import shelve
 from bs4 import BeautifulSoup
+from time import perf_counter
+import time
+
 import re


 #Data process
 from nltk.tokenize import word_tokenize
 from nltk.stem import PorterStemmer
+from sklearn.feature_extraction.text import TfidfVectorizer
+import pandas as pd
+import numpy as np
+from collections import Counter

 from posting import Posting

-import math

 import sys

-class Node():
-	index_value = ''
-	postings = list()
-
-class Index():
-	length = 0
-	index = list()
-
 class Worker(Thread):
-	def __init__(self,worker_id,indexer):
+	def __init__(self,indexer,target):
+		self.file = target
 		self.indexer = indexer
-		self.stemmer = PorterStemmer()
-		self.worker_id = worker_id
-		self.num_partial = 0
-		self.index = dict()
 		super().__init__(daemon=True)

-	def dump(self):
-		part_index = Index()
-		part_index.length = 0
-		part_index.index = list()
-
-		cur_partial_index_str = "temp/" + str(self.worker_id) + "_" + str(self.num_partial) + '.partial'
-		cur_partial_index_index_str = "temp/" +  str(self.worker_id) + "_" + str(self.num_partial) + '.index'
-
-
-		cur_partial_index = open(cur_partial_index_str,'w')
-		cur_partial_index_index = open(cur_partial_index_index_str,'w')
-
-		for key in self.index:
-			node = Node()
-			node.index_value = key
-			node.postings = self.index[key]
-
-			jsonStr = json.dumps(node, default=lambda o: o.__dict__,sort_keys=False)
-
-			part_index.index.append((node.index_value,cur_partial_index.tell()))
-			cur_partial_index.write(jsonStr + '\n')
-			part_index.length = part_index.length + 1
-
-		part_index.index.sort(key=lambda y:y[0])
-		jsonStr =json.dumps(part_index, default=lambda o: o.__dict__,sort_keys=False)
-		cur_partial_index_index.write(jsonStr)
-
-		self.indexer.add_partial_index(str(self.worker_id) + "_" + str(self.num_partial))
-		self.num_partial = self.num_partial + 1
-		self.index.clear()
-
-
 	def run(self):
-		while True:
-			target = self.indexer.get_next_file()
-			if not target:
-				self.dump()
-				print("Worker " + str(self.worker_id) + " died")
-				break
-			file_load = open(target)
+		print("Target: " + str(self.file))
+		ticker = perf_counter()
+		tic = perf_counter()
+		file_load = open(self.file)
 		data = json.load(file_load)
 		soup = BeautifulSoup(data["content"],features="lxml")
-			doc_id = target[target.rfind('/')+1:-5]
-			url = data['url']
-			print("Worker " + str(self.worker_id) + " working on " + url)
+		words = word_tokenize(soup.get_text())
+		toc = perf_counter()
+		if toc - tic > 1 :
+			print("Took " + str(toc - tic) + "seconds to tokenize text !")
+
+		tokenized_words = list()
+		stemmed_words = list()
+
 		important = {'b' : [], 'h1' : [], 'h2' : [], 'h3' : [], 'title' : []}
 		for key_words in important.keys():
 			for i in soup.findAll(key_words):
 				for word in word_tokenize(i.text):
-						important[key_words].append(self.stemmer.stem(word))
+					important[key_words].append(self.indexer.stemmer.stem(word))

-			# Gets a cleaner version text comparative to soup.get_text()
-			clean_text = ' '.join(soup.stripped_strings)
-			# Looks for large white space, tabbed space, and other forms of spacing and removes it
-			# Regex expression matches for space characters excluding a single space or words
-			clean_text = re.sub(r'\s[^ \w]', '', clean_text)
-			# Tokenizes text and joins it back into an entire string. Make sure it is an entire string is essential for get_tf_idf to work as intended
-			clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
-			# Stems tokenized text
-			clean_text = " ".join([self.stemmer.stem(i) for i in clean_text.split()])
-			# Put clean_text as an element in a list because get_tf_idf workers properly with single element lists
+		tic = perf_counter()
+		for word in words:
+			if word != "" and re.fullmatch('[A-Za-z0-9]+',word):
+				tokenized_words.append(word)
+		toc = perf_counter()
+		if toc - tic > 1 :
+			print("Took " + str(toc - tic) + "seconds to isalnum text !")

-			tokens = word_tokenize(clean_text)
+		tic = perf_counter()
+		for word in tokenized_words:
+			stemmed_words.append(self.indexer.stemmer.stem(word))

-			#counter(count,positionals)
+		toc = perf_counter()
+		if toc - tic > 1 :
+			print("Took " + str(toc - tic) + "seconds to stemmed text !")

-			counter = dict()
-			#We calculating tf_raw, and positionals here
-			for i in range(len(tokens)):
-				word = tokens[i]
-				if word in counter:
-					counter[word][0] = counter[word][0] + 1
-					counter[word][1].append(i)
-				else:
-					counter[word] = [1,list()]
-					counter[word][1].append(i)
+			"""
+		tfidf = TfidfVectorizer(ngram_range=(1,3)) # ngram_range is range of n-values for different n-grams to be extracted (1,3) gets unigrams, bigrams, trigrams
+		tfidf_matrix = tfidf.fit_transform(stemmed_words)  # fit trains the model, transform creates matrix
+		#df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out()) # store value of matrix to associated word/n-gram
+		tfidf.sget_feature_names_out()
+		#tf_idf_dict = df.to_dict() # transform dataframe to dict *could be expensive the larger the data gets, tested on ~1000 word doc and took 0.002 secs to run
 		
-			doc_length = len(tokens)
-			total = 0
-			for index in counter:
-				tf = counter[index][0]/doc_length
-				log_tf = 1 + math.log(tf)
-				total = total + log_tf * log_tf
-				if index in self.index:
-					postings = self.index[index]
-					postings.append(Posting(doc_id,url,tf,0,counter[index][1]))
-				else:
-					self.index[index] = list()
-					self.index[index].append(Posting(doc_id,url,tf,0,counter[index][1]))
-					self.index[index].sort(key=lambda y:y.doc_id)
+		print(tfidf_matrix)
+		"""

-			self.indexer.weight[doc_id] = math.sqrt(total)
-
-			#10 Megabytes index (in Ram approx)
-			if sys.getsizeof(self.index) > 1000000:
-				self.dump()
+		tfIdfVectorizer=TfidfVectorizer(use_idf=True)
+		tfIdf = tfIdfVectorizer.fit_transform(stemmed_words)
+		df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names_out(), columns=["TF-IDF"])
+		df = df.sort_values('TF-IDF', ascending=False)

+		print(df.head(25))

+		for word in tf_idf_dict.keys():
+			tic = perf_counter()
+			print(tf_idf_dict)
+			weight = 1.0
+			for k,v in important.items():
+				if k == 'b' and word in v:
+					weight = 1.2
+				elif k == 'h1' and word in v:
+					weight = 1.75
+				elif k == 'h2' and word in v:
+					weight = 1.5
+				elif k == 'h3' and word in v:
+					weight = 1.2
+				elif k == 'title' and word in v:
+					weight = 2
 			
+			posting = Posting(data["url"],tf_idf_dict[word]*weight)
 			
+			toc = perf_counter()
+			if toc - tic > 1 :
+				print("Took " + str(toc - tic) + "seconds to tf_idf text !")

+			tic = perf_counter()
+			self.indexer.save_index(word,posting)
+			toc = perf_counter()
+			if toc - tic > 1 :
+				print("Took " + str(toc - tic) + "seconds to save text !")

+		tocker = perf_counter()
+		print("Finished " + data['url'] + "\n" + str(tocker-ticker))
Author	SHA1	Message	Date
Aaron	e7c4170cc2	Update indexer.py had incorrect implementation	2022-05-12 17:58:31 -07:00
inocturnis	c4b3512df7	Changed tf_idf model into the new one, try it on the current dataset	2022-05-12 15:00:09 -07:00
iNocturnis	c8640001c7	Merge branch 'tf_idf'	2022-05-12 14:30:22 -07:00
Lacerum	f5610eaa62	tf-idf ngrams and now returns dict rather than score	2022-05-11 14:46:32 -07:00