We are looking for TF_WEIGHT, not IDF_WEIGHT; this makes things A LOT CHEAPER

2022-05-27 10:39:13 -07:00
parent cf81cf5c6a
commit 8d2dcea172
4 changed files with 32 additions and 137 deletions
--- a/docs.weight
+++ b/docs.weight
--- a/indexer.py
+++ b/indexer.py
@@ -46,7 +46,7 @@ class Index():
 class Indexer():
 	def __init__(self,list_partials,weight,data_paths,worker_factory=Worker,worker_weight_factory=Worker_Weight):
 		#Config stuffs
-		self.path = "data/DEV/"
+		self.path = "test/"
 		self.num_doc = 0
 		self.list_partials = list_partials
 		self.weight = weight
@@ -76,20 +76,6 @@ class Indexer():
 		for worker in self.workers:
 			worker.join()

-	def join_weight(self):
-		for worker in self.weight_workers:
-			worker.join()
-
-	def start_async_weight(self):
-		self.weight_workers = [
-			self.worker_weight_factory(worker_id,self)
-			for worker_id in range(1)]
-		for worker in self.weight_workers:
-			worker.start()
-
-	def start_weight(self):
-		self.start_async_weight()
-		self.join_weight()

 	def get_postings(self,index):
 		merged_index_index = open("merged_index.index" ,'r')
@@ -104,11 +90,11 @@ class Indexer():
 		data = json.loads(json_value)
 		return data['postings']

-	def set_total_weight(self):
-		self.get_data_path()
-		self.start_weight()
-		
-			
+	def set_weight(self):
+		weight_file = open('docs.weight','w')
+		jsonStr =json.dumps(self.weight, default=lambda o: o.__dict__,sort_keys=False)
+		weight_file.write(jsonStr)
+		weight_file.close()

 	def get_weight(self,doc_id):
 		weight = open('docs.weight','r')
@@ -223,12 +209,22 @@ class Indexer():

 def main():
 	indexer = Indexer(list(),dict(),list())
-	#indexer.get_data_path()
-	#print("We have " + str(len(indexer.data_paths)) + " documents to go through !" )
-	#indexer.start()
-	#indexer.merge()
+	indexer.get_data_path()
+	print("We have " + str(len(indexer.data_paths)) + " documents to go through !" )
+	indexer.start()
+	indexer.merge()
 	print("Finished merging into 1 big happy family")
-	indexer.set_total_weight()
+	indexer.set_weight()
+
+	tic = time.perf_counter()
+	indexer.get_postings('artifici')
+	toc = time.perf_counter()
+	print(f"Took {toc - tic:0.4f} seconds to get postings for artifici")
+	tic = time.perf_counter()
+	indexer.get_weight('00ba3af6a00b7cfb4928e5d234342c5dc46b4e31714d4a8f315a2dd4d8e49860')
+	print(f"Took {toc - tic:0.4f} seconds to get weight for some random page ")
+	toc = time.perf_counter()
+
 	


--- a/worker.py
+++ b/worker.py
@@ -12,6 +12,7 @@ from nltk.stem import PorterStemmer

 from posting import Posting

+import math

 import sys

@@ -110,17 +111,23 @@ class Worker(Thread):
 					counter[word][1].append(i)

 			doc_length = len(tokens)
+			total = 0
 			for index in counter:
+				tf = counter[index][0]/doc_length
+				log_tf = 1 + math.log(tf)
+				total = total + log_tf * log_tf
 				if index in self.index:
 					postings = self.index[index]
-					postings.append(Posting(doc_id,url,counter[index][0]/doc_length,0,counter[index][1]))
+					postings.append(Posting(doc_id,url,tf,0,counter[index][1]))
 				else:
 					self.index[index] = list()
-					self.index[index].append(Posting(doc_id,url,counter[index][0]/doc_length,0,counter[index][1]))
+					self.index[index].append(Posting(doc_id,url,tf,0,counter[index][1]))
 					self.index[index].sort(key=lambda y:y.doc_id)

+			self.indexer.weight[doc_id] = math.sqrt(total)
+
 			#10 Megabytes index (in Ram approx)
-			if sys.getsizeof(self.index) > 10000000:
+			if sys.getsizeof(self.index) > 1000000:
 				self.dump()


--- a/worker_weight.py
+++ b/worker_weight.py
@@ -1,109 +0,0 @@
-from threading import Thread
-import json
-import os
-
-from bs4 import BeautifulSoup
-import re
-import math
-import time
-#Data process
-from nltk.tokenize import word_tokenize
-from nltk.stem import PorterStemmer
-
-from posting import Posting
-
-
-import sys
-
-class Node():
-	index_value = ''
-	postings = list()
-
-class Index():
-	length = 0
-	index = list()
-
-class Worker_Weight(Thread):
-	def __init__(self,worker_id,indexer):
-		self.indexer = indexer
-		self.stemmer = PorterStemmer()
-		self.worker_id = worker_id
-		self.num_partial = 0
-		self.weight = dict()
-		merged_index_index = open("merged_index.index" ,'r')
-		self.merged_index = open("merged_index.full",'r')
-		merged_index_index.seek(0,0)
-		json_value = merged_index_index.readline()
-		data = json.loads(json_value)
-		self.index_index = dict(data['index'])
-		
-		super().__init__(daemon=True)
-
-
-	def dump(self):
-		with open("docs"+str(self.worker_id)+".weight",'w') as f:
-			f.write(json.dumps(self.weight))
-
-	def run(self):
-		while True:
-			target = self.indexer.get_next_file()
-			if not target:
-				self.dump()
-				print("Worker " + str(self.worker_id) + " died")
-				break
-			
-			
-			print("Worker " + str(self.worker_id) + " weighting " + target)
-			file_load = open(target)
-			data = json.load(file_load)
-			soup = BeautifulSoup(data["content"],features="lxml")
-			url = data['url']
-			doc_id = target[target.rfind('/')+1:-5]
-			# Gets a cleaner version text comparative to soup.get_text()
-			clean_text = ' '.join(soup.stripped_strings)
-			# Looks for large white space, tabbed space, and other forms of spacing and removes it
-			# Regex expression matches for space characters excluding a single space or words
-			clean_text = re.sub(r'\s[^ \w]', '', clean_text)
-			# Tokenizes text and joins it back into an entire string. Make sure it is an entire string is essential for get_tf_idf to work as intended
-			clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
-			# Stems tokenized text
-			clean_text = " ".join([self.stemmer.stem(i) for i in clean_text.split()])
-			# Put clean_text as an element in a list because get_tf_idf workers properly with single element lists
-
-			tokens = word_tokenize(clean_text)
-
-			total = 0
-
-
-			counter = dict()
-			#We calculating tf_raw, and positionals here
-			for i in range(len(tokens)):
-				word = tokens[i]
-				if word in counter:
-					counter[word]= counter[word] + 1
-				else:
-					counter[word] = 1
-
-			doc_length = len(tokens)
-
-			for index in tokens:
-				to_seek = self.index_index[index]
-				self.merged_index.seek(to_seek,0)
-				json_value = self.merged_index.readline()
-
-				data = json.loads(json_value)
-				df = len(data['postings'])
-				tf = counter[index]/doc_length
-				idf = math.log(self.indexer.num_doc/df)
-				tf_idf = tf*idf
-				total = total + tf_idf*tf_idf
-				
-			self.weight[doc_id] = math.sqrt(total)
-
-
-
-
-
-
-
-