changing the way to calculate weight

.gitignore (vendored): 2 changes

@@ -1,3 +1,5 @@
 /data/
 *.shelve
 /__pycache__/
+/test/
+merged*

File diff suppressed because one or more lines are too long

indexer.py: 92 changes

@@ -33,6 +33,7 @@ import re
 #Logging postings
 from posting import Posting
 from worker import Worker
+from worker_weight import Worker_Weight
 
 class Node():
 	index_value = ''
@@ -43,9 +44,9 @@ class Index():
 	index = list()
 
 class Indexer():
-	def __init__(self,list_partials,weight,data_paths,worker_factory=Worker):
+	def __init__(self,list_partials,weight,data_paths,worker_factory=Worker,worker_weight_factory=Worker_Weight):
 		#Config stuffs
-		self.path = "test/"
+		self.path = "data/DEV/"
 		self.num_doc = 0
 		self.list_partials = list_partials
 		self.weight = weight
@@ -54,10 +55,12 @@ class Indexer():
 		self.data_paths_lock = Lock()
 		self.list_partials_lock = Lock()
 		self.workers = list()
-		self.merged_index = open("merged_index.full",'w')
-		self.merged_index_index = open("merged_index.index" ,'w')
+
 		self.worker_factory = worker_factory
+
+		self.weight_workers = list()
+		self.worker_weight_factory = worker_weight_factory
 
 	def start_async(self):
 		self.workers = [
 			self.worker_factory(worker_id,self)
@@ -73,6 +76,21 @@ class Indexer():
 		for worker in self.workers:
 			worker.join()
 
+	def join_weight(self):
+		for worker in self.weight_workers:
+			worker.join()
+
+	def start_async_weight(self):
+		self.weight_workers = [
+			self.worker_weight_factory(worker_id,self)
+			for worker_id in range(1)]
+		for worker in self.weight_workers:
+			worker.start()
+
+	def start_weight(self):
+		self.start_async_weight()
+		self.join_weight()
+
 	def get_postings(self,index):
 		merged_index_index = open("merged_index.index" ,'r')
 		merged_index = open("merged_index.full",'r')
@@ -88,50 +106,8 @@ class Indexer():
 
 	def set_total_weight(self):
 		self.get_data_path()
-		merged_index_index = open("merged_index.index" ,'r')
-		merged_index = open("merged_index.full",'r')
-		merged_index_index.seek(0,0)
-		json_value = merged_index_index.readline()
-		data = json.loads(json_value)
-		index_index = dict(data['index'])
-		
-		for doc in self.data_paths:
-			file_load = open(doc)
-			data = json.load(file_load)
-			soup = BeautifulSoup(data["content"],features="lxml")
-			url = data['url']
-			doc_id = doc[doc.rfind('/')+1:-5]
-			# Gets a cleaner version text comparative to soup.get_text()
-			clean_text = ' '.join(soup.stripped_strings)
-			# Looks for large white space, tabbed space, and other forms of spacing and removes it
-			# Regex expression matches for space characters excluding a single space or words
-			clean_text = re.sub(r'\s[^ \w]', '', clean_text)
-			# Tokenizes text and joins it back into an entire string. Make sure it is an entire string is essential for get_tf_idf to work as intended
-			clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
-			# Stems tokenized text
-			clean_text = " ".join([self.stemmer.stem(i) for i in clean_text.split()])
-			# Put clean_text as an element in a list because get_tf_idf workers properly with single element lists
-
-			tokens = word_tokenize(clean_text)
-
-			tokens = set(tokens)
-
-			total = 0
-			for token in tokens:
-				to_seek = index_index[token]
-				merged_index.seek(to_seek,0)
-				json_value = merged_index.readline()
-				data = json.loads(json_value)
-
-				for posting in data['postings']:
-					if posting['doc_id'] == doc_id:
-						total = total + posting['tf_idf']* posting['tf_idf']
-						break
-
-			self.weight[doc_id] = math.sqrt(total)
-
-		with open('docs.weight','w') as f:
-			f.write(json.dumps(self.weight))
+		self.start_weight()
 
 	def get_weight(self,doc_id):
@@ -193,6 +169,8 @@ class Indexer():
 			partial_file.seek(0,0)
 
 		pointers = [0]*num_indices
+		merged_index = open("merged_index.full",'w')
+		merged_index_index = open("merged_index.index" ,'w')
 
 		while(True):
 
@@ -226,30 +204,30 @@ class Indexer():
 			node.postings.sort(key=lambda y:y['doc_id'])
 			for posting in node.postings:
 				posting['tf_idf'] = posting['tf_raw']*math.log(self.num_doc/len(node.postings))
-			full_index.index.append((value,self.merged_index.tell()))
+			full_index.index.append((value,merged_index.tell()))
 			full_index.length = full_index.length + 1
 			jsonStr = json.dumps(node,default=lambda o: o.__dict__,sort_keys=False)
-			self.merged_index.write(jsonStr + '\n')
+			merged_index.write(jsonStr + '\n')
 
 		full_index.index.sort(key=lambda y:y[0])
 		jsonStr =json.dumps(full_index, default=lambda o: o.__dict__,sort_keys=False)
-		self.merged_index_index.write(jsonStr)
+		merged_index_index.write(jsonStr)
 
 		for partial_index in self.list_partials:
 			os.remove("temp/" + partial_index+'.partial')
 			os.remove("temp/" + partial_index+'.index')
 
-		self.merged_index_index.close()
-		self.merged_index.close()
-
+		merged_index_index.close()
+		merged_index.close()
 
 
 def main():
 	indexer = Indexer(list(),dict(),list())
-	indexer.get_data_path()
-	print("We have " + str(len(indexer.data_paths)) + " documents to go through !" )
-	indexer.start()
-	indexer.merge()
+	#indexer.get_data_path()
+	#print("We have " + str(len(indexer.data_paths)) + " documents to go through !" )
+	#indexer.start()
+	#indexer.merge()
+	print("Finished merging into 1 big happy family")
 	indexer.set_total_weight()
 
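The net effect in indexer.py: set_total_weight() no longer walks every document itself; it delegates to the new start_weight(), which spawns Worker_Weight threads (just one, per the hard-coded range(1)) and joins them, mirroring the existing start_async()/join() pattern. Below is a minimal runnable sketch of that pattern. The names follow the diff, but get_next_file() is stubbed here as a locked list pop, which is an assumption about the real Indexer method:

# Minimal sketch of the fan-out/join pattern this commit adds.
# get_next_file() is a stand-in (assumption); the real one lives in Indexer.
from threading import Thread, Lock

class WeightWorker(Thread):
    def __init__(self, worker_id, indexer):
        self.worker_id = worker_id
        self.indexer = indexer
        super().__init__(daemon=True)

    def run(self):
        while True:
            target = self.indexer.get_next_file()
            if not target:
                print("Worker " + str(self.worker_id) + " died")
                break
            print("Worker " + str(self.worker_id) + " weighting " + target)

class IndexerSketch:
    def __init__(self, paths, worker_weight_factory=WeightWorker):
        self.data_paths = list(paths)
        self.data_paths_lock = Lock()
        self.worker_weight_factory = worker_weight_factory
        self.weight_workers = list()

    def get_next_file(self):
        # Hand out one path at a time under the lock; None signals "done".
        with self.data_paths_lock:
            return self.data_paths.pop() if self.data_paths else None

    def start_async_weight(self):
        # The diff hard-codes a single weight worker: range(1).
        self.weight_workers = [
            self.worker_weight_factory(worker_id, self)
            for worker_id in range(1)]
        for worker in self.weight_workers:
            worker.start()

    def join_weight(self):
        for worker in self.weight_workers:
            worker.join()

    def start_weight(self):
        self.start_async_weight()
        self.join_weight()

if __name__ == "__main__":
    IndexerSketch(["doc1.json", "doc2.json"]).start_weight()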
							
								
								
									
stemmer.py: 18 changes

@@ -1,18 +0,0 @@
-#Multiple implementation of stemming here please
-class Stemmer():
-
-	def __init__(self,mode, data):
-		#Different type of stemmer = different modes
-		self.mode = mode
-		self.data = data
-
-	def stem(self):
-		#Do stuff here
-		if(self.mode == 0):
-			#Do stemmer 1
-			return #stemmed data
-		#....
-
-	def #name of stemmer 1
-
-	def #name of stemmer 2
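The deleted stemmer.py was a placeholder that never became valid Python (the trailing `def #name of stemmer 1` lines would not parse). Stemming is done with NLTK's PorterStemmer instead, which the new worker_weight.py below instantiates as self.stemmer. A quick usage sketch:

# NLTK's Porter stemmer, the replacement for the deleted stub.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
print(stemmer.stem("weighting"))  # -> "weight"
print(stemmer.stem("running"))    # -> "run"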
							
								
								
									
worker_weight.py: 109 changes (new file)

@@ -0,0 +1,109 @@
+from threading import Thread
+import json
+import os
+
+from bs4 import BeautifulSoup
+import re
+import math
+import time
+#Data process
+from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+
+from posting import Posting
+
+
+import sys
+
+class Node():
+	index_value = ''
+	postings = list()
+
+class Index():
+	length = 0
+	index = list()
+
+class Worker_Weight(Thread):
+	def __init__(self,worker_id,indexer):
+		self.indexer = indexer
+		self.stemmer = PorterStemmer()
+		self.worker_id = worker_id
+		self.num_partial = 0
+		self.weight = dict()
+		merged_index_index = open("merged_index.index" ,'r')
+		self.merged_index = open("merged_index.full",'r')
+		merged_index_index.seek(0,0)
+		json_value = merged_index_index.readline()
+		data = json.loads(json_value)
+		self.index_index = dict(data['index'])
+
+		super().__init__(daemon=True)
+
+
+	def dump(self):
+		with open("docs"+str(self.worker_id)+".weight",'w') as f:
+			f.write(json.dumps(self.weight))
+
+	def run(self):
+		while True:
+			target = self.indexer.get_next_file()
+			if not target:
+				self.dump()
+				print("Worker " + str(self.worker_id) + " died")
+				break
+
+
+			print("Worker " + str(self.worker_id) + " weighting " + target)
+			file_load = open(target)
+			data = json.load(file_load)
+			soup = BeautifulSoup(data["content"],features="lxml")
+			url = data['url']
+			doc_id = target[target.rfind('/')+1:-5]
+			# Gets a cleaner version text comparative to soup.get_text()
+			clean_text = ' '.join(soup.stripped_strings)
+			# Looks for large white space, tabbed space, and other forms of spacing and removes it
+			# Regex expression matches for space characters excluding a single space or words
+			clean_text = re.sub(r'\s[^ \w]', '', clean_text)
+			# Tokenizes text and joins it back into an entire string. Make sure it is an entire string is essential for get_tf_idf to work as intended
+			clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
+			# Stems tokenized text
+			clean_text = " ".join([self.stemmer.stem(i) for i in clean_text.split()])
+			# Put clean_text as an element in a list because get_tf_idf workers properly with single element lists
+
+			tokens = word_tokenize(clean_text)
+
+			total = 0
+
+
+			counter = dict()
+			#We calculating tf_raw, and positionals here
+			for i in range(len(tokens)):
+				word = tokens[i]
+				if word in counter:
+					counter[word]= counter[word] + 1
+				else:
+					counter[word] = 1
+
+			doc_length = len(tokens)
+
+			for index in tokens:
+				to_seek = self.index_index[index]
+				self.merged_index.seek(to_seek,0)
+				json_value = self.merged_index.readline()
+
+				data = json.loads(json_value)
+				df = len(data['postings'])
+				tf = counter[index]/doc_length
+				idf = math.log(self.indexer.num_doc/df)
+				tf_idf = tf*idf
+				total = total + tf_idf*tf_idf
+
+			self.weight[doc_id] = math.sqrt(total)
+
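run() computes each document's weight as the Euclidean norm of its tf-idf vector: tf = term count / doc_length, idf = ln(num_doc / df), and weight = sqrt of the sum of squared tf-idf values. Note that the loop iterates the raw token list rather than set(tokens) (which the removed set_total_weight() used), so a term occurring k times contributes its squared tf-idf k times. A small worked example with assumed numbers:

# Worked example of the weight formula; corpus size and document
# frequencies are made-up numbers for illustration.
import math

num_doc = 100                          # corpus size (assumed)
tokens = ["apple", "banana", "apple"]  # toy tokenized document
df = {"apple": 10, "banana": 50}       # document frequencies (assumed)

counter = dict()
for word in tokens:
    counter[word] = counter.get(word, 0) + 1
doc_length = len(tokens)

total = 0
for token in set(tokens):  # per unique term; the diff's loop visits duplicates too
    tf = counter[token] / doc_length     # 2/3 and 1/3
    idf = math.log(num_doc / df[token])  # ln(10) and ln(2)
    total += (tf * idf) ** 2

weight = math.sqrt(total)  # Euclidean norm of the tf-idf vector
print(round(weight, 3))    # -> 1.552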