110 lines
		
	
	
		
			2.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			110 lines
		
	
	
		
			2.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from threading import Thread
 | |
| import json
 | |
| import os
 | |
| 
 | |
| from bs4 import BeautifulSoup
 | |
| import re
 | |
| import math
 | |
| import time
 | |
| #Data process
 | |
| from nltk.tokenize import word_tokenize
 | |
| from nltk.stem import PorterStemmer
 | |
| 
 | |
| from posting import Posting
 | |
| 
 | |
| 
 | |
| import sys
 | |
| 
 | |
class Node():
	"""One index entry: an index value together with its list of postings."""

	# Class-level defaults are kept so that code reading Node.index_value or
	# Node.postings directly on the class keeps working.
	index_value = ''
	postings = list()

	def __init__(self):
		# Give every instance its own state. Without this, all instances
		# would append to the single list stored on the class.
		self.index_value = ''
		self.postings = list()
 | |
| 
 | |
class Index():
	"""Container holding a length counter and a list of index entries."""

	# Class-level defaults are kept so that code reading Index.length or
	# Index.index directly on the class keeps working.
	length = 0
	index = list()

	def __init__(self):
		# Give every instance its own list. Without this, all instances
		# would share (and mutate) the single list stored on the class.
		self.length = 0
		self.index = list()
 | |
| 
 | |
class Worker_Weight(Thread):
	"""Daemon thread that computes a tf-idf vector length for each document.

	The worker repeatedly asks ``indexer.get_next_file()`` for a path, reads
	that JSON document, tokenizes and stems its ``"content"`` field, and
	stores the Euclidean length of the document's tf-idf vector in
	``self.weight`` under the document id. When no file is left, the map is
	written to ``docs<worker_id>.weight``.

	The indexer object must provide ``get_next_file()`` and ``num_doc``.
	"""

	def __init__(self,worker_id,indexer):
		"""Load the term -> offset table and open the merged index for reading.

		Args:
			worker_id: Identifier used in log lines and in the output file name.
			indexer: Object supplying ``get_next_file()`` and ``num_doc``.
		"""
		super().__init__(daemon=True)
		self.indexer = indexer
		self.stemmer = PorterStemmer()
		self.worker_id = worker_id
		self.num_partial = 0
		self.weight = dict()

		# The first line of merged_index.index is a JSON object whose 'index'
		# entry gives, for each term, the offset of that term's line inside
		# merged_index.full. Only that line is needed, so the file is closed
		# as soon as it has been read.
		with open("merged_index.index" ,'r') as merged_index_index:
			json_value = merged_index_index.readline()
		data = json.loads(json_value)
		self.index_index = dict(data['index'])

		# Kept open for the lifetime of the worker; run() seeks into it for
		# every term and closes it when the worker finishes.
		self.merged_index = open("merged_index.full",'r')

	def dump(self):
		"""Write the doc_id -> weight map to ``docs<worker_id>.weight`` as JSON."""
		with open("docs"+str(self.worker_id)+".weight",'w') as f:
			json.dump(self.weight, f)

	def _tokenize(self, content):
		"""Return the list of stemmed alphanumeric tokens found in HTML ``content``."""
		soup = BeautifulSoup(content,features="lxml")
		# stripped_strings gives cleaner text than soup.get_text()
		clean_text = ' '.join(soup.stripped_strings)
		# Drop every whitespace character that is immediately followed by a
		# character which is neither a space nor a word character (both the
		# whitespace and that character are removed).
		clean_text = re.sub(r'\s[^ \w]', '', clean_text)
		# Keep only purely alphanumeric words, then stem each of them
		words = [w for w in clean_text.split() if re.fullmatch('[A-Za-z0-9]+', w)]
		stemmed_text = " ".join(self.stemmer.stem(w) for w in words)
		return word_tokenize(stemmed_text)

	def _document_norm(self, counter, doc_length):
		"""Return sqrt(sum of (tf*idf)^2) over the distinct terms of a document.

		Args:
			counter: Mapping term -> number of occurrences in the document.
			doc_length: Total number of tokens in the document.

		Each distinct term contributes exactly once, however often it occurs
		in the document. Terms that are absent from the offset table, or whose
		postings list is empty, are skipped instead of raising, so one
		unexpected term cannot kill the worker before it dumps its results.
		"""
		total = 0
		for term, count in counter.items():
			to_seek = self.index_index.get(term)
			if to_seek is None:
				continue
			self.merged_index.seek(to_seek,0)
			entry = json.loads(self.merged_index.readline())

			df = len(entry['postings'])
			if df == 0:
				continue
			tf = count/doc_length
			idf = math.log(self.indexer.num_doc/df)
			tf_idf = tf*idf
			total = total + tf_idf*tf_idf
		return math.sqrt(total)

	def run(self):
		"""Process files until the indexer has none left, then dump and stop."""
		while True:
			target = self.indexer.get_next_file()
			if not target:
				try:
					self.dump()
				finally:
					# Nothing reads the merged index after this point
					self.merged_index.close()
				print("Worker " + str(self.worker_id) + " died")
				break

			print("Worker " + str(self.worker_id) + " weighting " + target)
			with open(target) as file_load:
				data = json.load(file_load)
			# File name without its directory and without the last five
			# characters (presumably a ".json" suffix - confirm with the
			# code that produces these paths).
			doc_id = target[target.rfind('/')+1:-5]

			tokens = self._tokenize(data["content"])

			# Raw occurrence count of every distinct token
			counter = dict()
			for word in tokens:
				counter[word] = counter.get(word, 0) + 1

			self.weight[doc_id] = self._document_norm(counter, len(tokens))
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 			
 | 
