We are looking for TF_WEIGHT, not IDF_WEIGHT, which makes things A LOT CHEAPER
This commit is contained in:
		
							
								
								
									
										1
									
								
								docs.weight
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								docs.weight
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										46
									
								
								indexer.py
									
									
									
									
									
								
							
							
						
						
									
										46
									
								
								indexer.py
									
									
									
									
									
								
							| @@ -46,7 +46,7 @@ class Index(): | ||||
| class Indexer(): | ||||
| 	def __init__(self,list_partials,weight,data_paths,worker_factory=Worker,worker_weight_factory=Worker_Weight): | ||||
| 		#Config stuff | ||||
| 		self.path = "data/DEV/" | ||||
| 		self.path = "test/" | ||||
| 		self.num_doc = 0 | ||||
| 		self.list_partials = list_partials | ||||
| 		self.weight = weight | ||||
| @@ -76,20 +76,6 @@ class Indexer(): | ||||
| 		for worker in self.workers: | ||||
| 			worker.join() | ||||
|  | ||||
| 	def join_weight(self): | ||||
| 		for worker in self.weight_workers: | ||||
| 			worker.join() | ||||
|  | ||||
| 	def start_async_weight(self): | ||||
| 		self.weight_workers = [ | ||||
| 			self.worker_weight_factory(worker_id,self) | ||||
| 			for worker_id in range(1)] | ||||
| 		for worker in self.weight_workers: | ||||
| 			worker.start() | ||||
|  | ||||
| 	def start_weight(self): | ||||
| 		self.start_async_weight() | ||||
| 		self.join_weight() | ||||
|  | ||||
| 	def get_postings(self,index): | ||||
| 		merged_index_index = open("merged_index.index" ,'r') | ||||
| @@ -104,11 +90,11 @@ class Indexer(): | ||||
| 		data = json.loads(json_value) | ||||
| 		return data['postings'] | ||||
|  | ||||
| 	def set_total_weight(self): | ||||
| 		self.get_data_path() | ||||
| 		self.start_weight() | ||||
| 		 | ||||
| 			 | ||||
| 	def set_weight(self): | ||||
| 		weight_file = open('docs.weight','w') | ||||
| 		jsonStr =json.dumps(self.weight, default=lambda o: o.__dict__,sort_keys=False) | ||||
| 		weight_file.write(jsonStr) | ||||
| 		weight_file.close() | ||||
|  | ||||
| 	def get_weight(self,doc_id): | ||||
| 		weight = open('docs.weight','r') | ||||
| @@ -223,12 +209,22 @@ class Indexer(): | ||||
|  | ||||
| def main(): | ||||
| 	indexer = Indexer(list(),dict(),list()) | ||||
| 	#indexer.get_data_path() | ||||
| 	#print("We have " + str(len(indexer.data_paths)) + " documents to go through !" ) | ||||
| 	#indexer.start() | ||||
| 	#indexer.merge() | ||||
| 	indexer.get_data_path() | ||||
| 	print("We have " + str(len(indexer.data_paths)) + " documents to go through !" ) | ||||
| 	indexer.start() | ||||
| 	indexer.merge() | ||||
| 	print("Finished merging into 1 big happy family") | ||||
| 	indexer.set_total_weight() | ||||
| 	indexer.set_weight() | ||||
|  | ||||
| 	tic = time.perf_counter() | ||||
| 	indexer.get_postings('artifici') | ||||
| 	toc = time.perf_counter() | ||||
| 	print(f"Took {toc - tic:0.4f} seconds to get postings for artifici") | ||||
| 	tic = time.perf_counter() | ||||
| 	indexer.get_weight('00ba3af6a00b7cfb4928e5d234342c5dc46b4e31714d4a8f315a2dd4d8e49860') | ||||
| 	print(f"Took {toc - tic:0.4f} seconds to get weight for some random page ") | ||||
| 	toc = time.perf_counter() | ||||
|  | ||||
| 	 | ||||
|  | ||||
|  | ||||
|   | ||||
							
								
								
									
										13
									
								
								worker.py
									
									
									
									
									
								
							
							
						
						
									
										13
									
								
								worker.py
									
									
									
									
									
								
							| @@ -12,6 +12,7 @@ from nltk.stem import PorterStemmer | ||||
|  | ||||
| from posting import Posting | ||||
|  | ||||
| import math | ||||
|  | ||||
| import sys | ||||
|  | ||||
| @@ -110,17 +111,23 @@ class Worker(Thread): | ||||
| 					counter[word][1].append(i) | ||||
|  | ||||
| 			doc_length = len(tokens) | ||||
| 			total = 0 | ||||
| 			for index in counter: | ||||
| 				tf = counter[index][0]/doc_length | ||||
| 				log_tf = 1 + math.log(tf) | ||||
| 				total = total + log_tf * log_tf | ||||
| 				if index in self.index: | ||||
| 					postings = self.index[index] | ||||
| 					postings.append(Posting(doc_id,url,counter[index][0]/doc_length,0,counter[index][1])) | ||||
| 					postings.append(Posting(doc_id,url,tf,0,counter[index][1])) | ||||
| 				else: | ||||
| 					self.index[index] = list() | ||||
| 					self.index[index].append(Posting(doc_id,url,counter[index][0]/doc_length,0,counter[index][1])) | ||||
| 					self.index[index].append(Posting(doc_id,url,tf,0,counter[index][1])) | ||||
| 					self.index[index].sort(key=lambda y:y.doc_id) | ||||
|  | ||||
| 			self.indexer.weight[doc_id] = math.sqrt(total) | ||||
|  | ||||
| 			#10 Megabytes index (in Ram approx) | ||||
| 			if sys.getsizeof(self.index) > 10000000: | ||||
| 			if sys.getsizeof(self.index) > 1000000: | ||||
| 				self.dump() | ||||
|  | ||||
|  | ||||
|   | ||||
							
								
								
									
										109
									
								
								worker_weight.py
									
									
									
									
									
								
							
							
						
						
									
										109
									
								
								worker_weight.py
									
									
									
									
									
								
							| @@ -1,109 +0,0 @@ | ||||
| from threading import Thread | ||||
| import json | ||||
| import os | ||||
|  | ||||
| from bs4 import BeautifulSoup | ||||
| import re | ||||
| import math | ||||
| import time | ||||
| #Data process | ||||
| from nltk.tokenize import word_tokenize | ||||
| from nltk.stem import PorterStemmer | ||||
|  | ||||
| from posting import Posting | ||||
|  | ||||
|  | ||||
| import sys | ||||
|  | ||||
| class Node(): | ||||
| 	index_value = '' | ||||
| 	postings = list() | ||||
|  | ||||
| class Index(): | ||||
| 	length = 0 | ||||
| 	index = list() | ||||
|  | ||||
| class Worker_Weight(Thread): | ||||
| 	def __init__(self,worker_id,indexer): | ||||
| 		self.indexer = indexer | ||||
| 		self.stemmer = PorterStemmer() | ||||
| 		self.worker_id = worker_id | ||||
| 		self.num_partial = 0 | ||||
| 		self.weight = dict() | ||||
| 		merged_index_index = open("merged_index.index" ,'r') | ||||
| 		self.merged_index = open("merged_index.full",'r') | ||||
| 		merged_index_index.seek(0,0) | ||||
| 		json_value = merged_index_index.readline() | ||||
| 		data = json.loads(json_value) | ||||
| 		self.index_index = dict(data['index']) | ||||
| 		 | ||||
| 		super().__init__(daemon=True) | ||||
|  | ||||
|  | ||||
| 	def dump(self): | ||||
| 		with open("docs"+str(self.worker_id)+".weight",'w') as f: | ||||
| 			f.write(json.dumps(self.weight)) | ||||
|  | ||||
| 	def run(self): | ||||
| 		while True: | ||||
| 			target = self.indexer.get_next_file() | ||||
| 			if not target: | ||||
| 				self.dump() | ||||
| 				print("Worker " + str(self.worker_id) + " died") | ||||
| 				break | ||||
| 			 | ||||
| 			 | ||||
| 			print("Worker " + str(self.worker_id) + " weighting " + target) | ||||
| 			file_load = open(target) | ||||
| 			data = json.load(file_load) | ||||
| 			soup = BeautifulSoup(data["content"],features="lxml") | ||||
| 			url = data['url'] | ||||
| 			doc_id = target[target.rfind('/')+1:-5] | ||||
| 			# Gets a cleaner version of the text compared to soup.get_text() | ||||
| 			clean_text = ' '.join(soup.stripped_strings) | ||||
| 			# Looks for large white space, tabbed space, and other forms of spacing and removes them | ||||
| 			# The regex matches a whitespace character followed by a character that is neither a space nor a word character | ||||
| 			clean_text = re.sub(r'\s[^ \w]', '', clean_text) | ||||
| 			# Tokenizes text and joins it back into a single string. Making sure it is a single string is essential for get_tf_idf to work as intended | ||||
| 			clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)]) | ||||
| 			# Stems tokenized text | ||||
| 			clean_text = " ".join([self.stemmer.stem(i) for i in clean_text.split()]) | ||||
| 			# Put clean_text as an element in a list because get_tf_idf works properly with single-element lists | ||||
|  | ||||
| 			tokens = word_tokenize(clean_text) | ||||
|  | ||||
| 			total = 0 | ||||
|  | ||||
|  | ||||
| 			counter = dict() | ||||
| 			#We are calculating tf_raw and positionals here | ||||
| 			for i in range(len(tokens)): | ||||
| 				word = tokens[i] | ||||
| 				if word in counter: | ||||
| 					counter[word]= counter[word] + 1 | ||||
| 				else: | ||||
| 					counter[word] = 1 | ||||
|  | ||||
| 			doc_length = len(tokens) | ||||
|  | ||||
| 			for index in tokens: | ||||
| 				to_seek = self.index_index[index] | ||||
| 				self.merged_index.seek(to_seek,0) | ||||
| 				json_value = self.merged_index.readline() | ||||
|  | ||||
| 				data = json.loads(json_value) | ||||
| 				df = len(data['postings']) | ||||
| 				tf = counter[index]/doc_length | ||||
| 				idf = math.log(self.indexer.num_doc/df) | ||||
| 				tf_idf = tf*idf | ||||
| 				total = total + tf_idf*tf_idf | ||||
| 				 | ||||
| 			self.weight[doc_id] = math.sqrt(total) | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| 			 | ||||
		Reference in New Issue
	
	Block a user
	 inocturnis
					inocturnis