changing the way to calculate weight

This commit is contained in:
inocturnis 2022-05-27 09:53:25 -07:00
parent fb88efd510
commit cf81cf5c6a
5 changed files with 147 additions and 77 deletions

.gitignore vendored

@@ -1,3 +1,5 @@
/data/
*.shelve
/__pycache__/
/test/
merged*

File diff suppressed because one or more lines are too long

indexer.py

@@ -33,6 +33,7 @@ import re
#Logging postings
from posting import Posting
from worker import Worker
from worker_weight import Worker_Weight
class Node():
index_value = ''
@@ -43,9 +44,9 @@ class Index():
index = list()
class Indexer():
def __init__(self,list_partials,weight,data_paths,worker_factory=Worker):
def __init__(self,list_partials,weight,data_paths,worker_factory=Worker,worker_weight_factory=Worker_Weight):
#Config stuffs
self.path = "test/"
self.path = "data/DEV/"
self.num_doc = 0
self.list_partials = list_partials
self.weight = weight
@@ -54,10 +55,12 @@ class Indexer():
self.data_paths_lock = Lock()
self.list_partials_lock = Lock()
self.workers = list()
self.merged_index = open("merged_index.full",'w')
self.merged_index_index = open("merged_index.index" ,'w')
self.worker_factory = worker_factory
self.weight_workers = list()
self.worker_weight_factory = worker_weight_factory
def start_async(self):
self.workers = [
self.worker_factory(worker_id,self)
@@ -73,6 +76,21 @@ class Indexer():
for worker in self.workers:
worker.join()
def join_weight(self):
for worker in self.weight_workers:
worker.join()
def start_async_weight(self):
self.weight_workers = [
self.worker_weight_factory(worker_id,self)
for worker_id in range(1)]
for worker in self.weight_workers:
worker.start()
def start_weight(self):
self.start_async_weight()
self.join_weight()
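The weight workers pull documents through the indexer's get_next_file() dispatch (see worker_weight.py below), which this diff does not show. Given the data_paths list and data_paths_lock fields above, a minimal sketch of what it presumably does, not the committed code:

def get_next_file(self):
    # Hypothetical: hand out one document path at a time under the lock;
    # return None once the list is drained so workers know to exit.
    with self.data_paths_lock:
        if not self.data_paths:
            return None
        return self.data_paths.pop()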
def get_postings(self,index):
merged_index_index = open("merged_index.index" ,'r')
merged_index = open("merged_index.full",'r')
@@ -88,50 +106,8 @@ class Indexer():
def set_total_weight(self):
self.get_data_path()
merged_index_index = open("merged_index.index" ,'r')
merged_index = open("merged_index.full",'r')
merged_index_index.seek(0,0)
json_value = merged_index_index.readline()
data = json.loads(json_value)
index_index = dict(data['index'])
self.start_weight()
for doc in self.data_paths:
file_load = open(doc)
data = json.load(file_load)
soup = BeautifulSoup(data["content"],features="lxml")
url = data['url']
doc_id = doc[doc.rfind('/')+1:-5]
# Gets cleaner text than soup.get_text() would
clean_text = ' '.join(soup.stripped_strings)
# Removes large runs of whitespace, tabs, and other spacing
# The regex matches a whitespace character followed by anything that is not a space or word character
clean_text = re.sub(r'\s[^ \w]', '', clean_text)
# Tokenizes text and joins it back into a single string; keeping it as one string is essential for get_tf_idf to work as intended
clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
# Stems tokenized text
clean_text = " ".join([self.stemmer.stem(i) for i in clean_text.split()])
# Tokenize the cleaned, stemmed text with NLTK
tokens = word_tokenize(clean_text)
tokens = set(tokens)
total = 0
for token in tokens:
to_seek = index_index[token]
merged_index.seek(to_seek,0)
json_value = merged_index.readline()
data = json.loads(json_value)
for posting in data['postings']:
if posting['doc_id'] == doc_id:
total = total + posting['tf_idf']* posting['tf_idf']
break
self.weight[doc_id] = math.sqrt(total)
with open('docs.weight','w') as f:
f.write(json.dumps(self.weight))
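For context, the value stored per doc_id in docs.weight is the Euclidean length of the document's tf-idf vector: weight(d) = sqrt(sum of tf_idf^2 over the document's unique terms), the normalization denominator used for cosine scoring at query time. A tiny self-contained illustration with made-up scores:

import math

tf_idf_scores = [0.12, 0.40, 0.05]  # one document's unique-term scores (invented)
weight = math.sqrt(sum(s * s for s in tf_idf_scores))
print(round(weight, 4))  # 0.4206, stored under the doc_id in docs.weight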
def get_weight(self,doc_id):
@@ -193,6 +169,8 @@ class Indexer():
partial_file.seek(0,0)
pointers = [0]*num_indices
merged_index = open("merged_index.full",'w')
merged_index_index = open("merged_index.index" ,'w')
while(True):
@@ -226,30 +204,30 @@
node.postings.sort(key=lambda y:y['doc_id'])
for posting in node.postings:
posting['tf_idf'] = posting['tf_raw']*math.log(self.num_doc/len(node.postings))
full_index.index.append((value,self.merged_index.tell()))
full_index.index.append((value,merged_index.tell()))
full_index.length = full_index.length + 1
jsonStr = json.dumps(node,default=lambda o: o.__dict__,sort_keys=False)
self.merged_index.write(jsonStr + '\n')
merged_index.write(jsonStr + '\n')
full_index.index.sort(key=lambda y:y[0])
jsonStr = json.dumps(full_index, default=lambda o: o.__dict__,sort_keys=False)
self.merged_index_index.write(jsonStr)
merged_index_index.write(jsonStr)
for partial_index in self.list_partials:
os.remove("temp/" + partial_index+'.partial')
os.remove("temp/" + partial_index+'.index')
self.merged_index_index.close()
self.merged_index.close()
merged_index_index.close()
merged_index.close()
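In the tf_idf line above, len(node.postings) is the term's document frequency, so the merge computes tf_raw * ln(num_doc / df). A worked example with assumed numbers:

import math

num_doc = 1000  # corpus size (assumed)
tf_raw = 3      # raw count of the term in one document (assumed)
df = 50         # documents containing the term, i.e. len(node.postings)
print(round(tf_raw * math.log(num_doc / df), 3))  # 8.987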
def main():
indexer = Indexer(list(),dict(),list())
indexer.get_data_path()
print("We have " + str(len(indexer.data_paths)) + " documents to go through !" )
indexer.start()
indexer.merge()
#indexer.get_data_path()
#print("We have " + str(len(indexer.data_paths)) + " documents to go through !" )
#indexer.start()
#indexer.merge()
print("Finished merging into 1 big happy family")
indexer.set_total_weight()
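With the indexing calls commented out, main() now only recomputes weights against an already-merged index. For a fresh corpus the full pipeline would presumably run both phases; a sketch under that assumption:

indexer = Indexer(list(), dict(), list())
indexer.get_data_path()
indexer.start()             # phase 1: Worker threads build partial indexes
indexer.merge()             # merge partials into merged_index.full/.index
indexer.set_total_weight()  # phase 2: Worker_Weight threads compute doc weights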

stemmer.py

@@ -1,18 +0,0 @@
#Multiple implementations of stemming go here please
class Stemmer():
def __init__(self,mode, data):
#Different type of stemmer = different modes
self.mode = mode
self.data = data
def stem(self):
#Do stuff here
if(self.mode == 0):
#Do stemmer 1
return #stemmed data
#....
# def <name of stemmer 1>
# def <name of stemmer 2>

worker_weight.py Normal file

@@ -0,0 +1,109 @@
from threading import Thread
import json
import os
from bs4 import BeautifulSoup
import re
import math
import time
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from posting import Posting
import sys
class Node():
index_value = ''
postings = list()
class Index():
length = 0
index = list()
class Worker_Weight(Thread):
def __init__(self,worker_id,indexer):
self.indexer = indexer
self.stemmer = PorterStemmer()
self.worker_id = worker_id
self.num_partial = 0
self.weight = dict()
merged_index_index = open("merged_index.index" ,'r')
self.merged_index = open("merged_index.full",'r')
merged_index_index.seek(0,0)
json_value = merged_index_index.readline()
data = json.loads(json_value)
self.index_index = dict(data['index'])
merged_index_index.close()
super().__init__(daemon=True)
def dump(self):
with open("docs"+str(self.worker_id)+".weight",'w') as f:
f.write(json.dumps(self.weight))
def run(self):
while True:
target = self.indexer.get_next_file()
if not target:
self.dump()
print("Worker " + str(self.worker_id) + " died")
break
print("Worker " + str(self.worker_id) + " weighting " + target)
file_load = open(target)
data = json.load(file_load)
soup = BeautifulSoup(data["content"],features="lxml")
url = data['url']
doc_id = target[target.rfind('/')+1:-5]
# Gets cleaner text than soup.get_text() would
clean_text = ' '.join(soup.stripped_strings)
# Removes large runs of whitespace, tabs, and other spacing
# The regex matches a whitespace character followed by anything that is not a space or word character
clean_text = re.sub(r'\s[^ \w]', '', clean_text)
# Tokenizes text and joins it back into a single string; keeping it as one string is essential for get_tf_idf to work as intended
clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
# Stems tokenized text
clean_text = " ".join([self.stemmer.stem(i) for i in clean_text.split()])
# Tokenize the cleaned, stemmed text with NLTK
tokens = word_tokenize(clean_text)
total = 0
counter = dict()
#Count raw term frequencies for each token
for word in tokens:
    counter[word] = counter.get(word, 0) + 1
doc_length = len(tokens)
for index in counter: # iterate unique terms; iterating all tokens would add tf_idf^2 once per occurrence
to_seek = self.index_index[index]
self.merged_index.seek(to_seek,0)
json_value = self.merged_index.readline()
data = json.loads(json_value)
df = len(data['postings'])
tf = counter[index]/doc_length
idf = math.log(self.indexer.num_doc/df)
tf_idf = tf*idf
total = total + tf_idf*tf_idf
self.weight[doc_id] = math.sqrt(total)
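Note that dump() writes one shard per worker (docs0.weight, docs1.weight, ...), while the indexer's docs.weight is a single file; the step that combines the shards is not part of this diff. A hypothetical sketch of that merge:

import glob
import json

merged = {}
for shard in glob.glob('docs[0-9]*.weight'):  # per-worker shards from dump()
    with open(shard) as f:
        merged.update(json.load(f))
with open('docs.weight', 'w') as f:
    json.dump(merged, f)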