from threading import Thread
import json
import os
import re
import math
import sys

from bs4 import BeautifulSoup

# Data processing
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from posting import Posting

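# Note (not in the original file): word_tokenize depends on NLTK's "punkt"
# tokenizer models being installed; on a fresh machine a one-time
# `nltk.download('punkt')` is needed before this module will run.
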
class Node():
	# One inverted-index entry: a term and its list of postings.
	# (These were class-level attributes in the original; instance attributes
	# avoid the shared-mutable-list pitfall.)
	def __init__(self):
		self.index_value = ''
		self.postings = list()

class Index():
	# Secondary index over a partial-index file: the number of entries plus
	# sorted (term, byte offset) pairs for seeking straight to a term's line.
	def __init__(self):
		self.length = 0
		self.index = list()

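# Hedged sketch (not part of the original pipeline): how a reader could use a
# ".index" file to jump straight to one term's posting list in the matching
# ".partial" file. The function name and its path arguments are assumptions
# for illustration only.
def lookup_term_sketch(term, partial_path, index_path):
	with open(index_path) as f:
		# The ".index" file holds {"length": N, "index": [[term, offset], ...]}.
		table = dict(json.load(f)["index"])  # term -> byte offset
	if term not in table:
		return None
	with open(partial_path) as f:
		f.seek(table[term])              # jump to the term's JSON line
		node = json.loads(f.readline())  # {"index_value": ..., "postings": [...]}
	return node["postings"]
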
class Worker(Thread):
	def __init__(self, worker_id, indexer):
		self.indexer = indexer        # shared indexer that hands out work
		self.stemmer = PorterStemmer()
		self.worker_id = worker_id
		self.num_partial = 0          # partial indexes dumped by this worker so far
		self.index = dict()           # in-memory inverted index: term -> [Posting, ...]
		super().__init__(daemon=True)

	def dump(self):
		part_index = Index()

		cur_partial_index_str = "temp/" + str(self.worker_id) + "_" + str(self.num_partial) + '.partial'
		cur_partial_index_index_str = "temp/" + str(self.worker_id) + "_" + str(self.num_partial) + '.index'

		# The original left both files unclosed and assumed temp/ existed;
		# create the directory and use context managers so everything is
		# flushed before the indexer is told about the new partial.
		os.makedirs("temp", exist_ok=True)
		with open(cur_partial_index_str, 'w') as cur_partial_index, \
				open(cur_partial_index_index_str, 'w') as cur_partial_index_index:
			for key in self.index:
				node = Node()
				node.index_value = key
				# Sort postings by doc_id once, at dump time.
				node.postings = sorted(self.index[key], key=lambda y: y.doc_id)

				jsonStr = json.dumps(node, default=lambda o: o.__dict__, sort_keys=False)

				# Record where this term's line starts so the ".index" file
				# can later be used to seek straight to it.
				part_index.index.append((node.index_value, cur_partial_index.tell()))
				cur_partial_index.write(jsonStr + '\n')
				part_index.length = part_index.length + 1

			part_index.index.sort(key=lambda y: y[0])
			jsonStr = json.dumps(part_index, default=lambda o: o.__dict__, sort_keys=False)
			cur_partial_index_index.write(jsonStr)

		# Register the finished partial with the indexer; partials are merged
		# later (see merge_partials_sketch at the bottom of this file for the idea).
		self.indexer.add_partial_index(str(self.worker_id) + "_" + str(self.num_partial))
		self.num_partial = self.num_partial + 1
		self.index.clear()

	def run(self):
		while True:
			target = self.indexer.get_next_file()
			if not target:
				# No more documents: flush whatever is in memory and exit.
				self.dump()
				print("Worker " + str(self.worker_id) + " died")
				break
			with open(target) as file_load:
				data = json.load(file_load)
			soup = BeautifulSoup(data["content"], features="lxml")
			# Document id is the file name without its ".json" extension.
			doc_id = target[target.rfind('/')+1:-5]
			url = data['url']
			print("Worker " + str(self.worker_id) + " working on " + url)

			# Collect stemmed tokens that appear in high-signal tags.
			# (Built here but not yet consumed in this method.)
			important = {'b': [], 'h1': [], 'h2': [], 'h3': [], 'title': []}
			for key_words in important.keys():
				for i in soup.findAll(key_words):
					for word in word_tokenize(i.text):
						important[key_words].append(self.stemmer.stem(word))
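			# Example: for "<h1>Search Engines</h1>", important['h1'] would
			# collect the stemmed tokens ['search', 'engin'].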

			# Gets cleaner text than soup.get_text() by joining the stripped strings.
			clean_text = ' '.join(soup.stripped_strings)
			# Remove any whitespace character that is followed by a character
			# that is neither a space nor alphanumeric (both characters are
			# dropped), cleaning up stray punctuation after spaces.
			clean_text = re.sub(r'\s[^ \w]', '', clean_text)
			# Keep only purely alphanumeric tokens; a token with punctuation
			# still attached is discarded outright. Joining back into a single
			# string is essential for get_tf_idf to work as intended.
			clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
			# Stem every surviving token.
			clean_text = " ".join([self.stemmer.stem(i) for i in clean_text.split()])

			tokens = word_tokenize(clean_text)
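			# Worked example: "Search engines index pages!" ends up as the
			# tokens ['search', 'engin', 'index'] — "pages!" is dropped by the
			# alphanumeric-only filter because of the attached "!".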

			# counter maps term -> [raw count, list of token positions]
			counter = dict()
			# Compute raw term frequency and positional indices in one pass.
			for i in range(len(tokens)):
				word = tokens[i]
				if word in counter:
					counter[word][0] = counter[word][0] + 1
					counter[word][1].append(i)
				else:
					counter[word] = [1, [i]]
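			# Example: tokens ['search', 'engin', 'search'] produce
			# counter = {'search': [2, [0, 2]], 'engin': [1, [1]]}.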

			doc_length = len(tokens)
			total = 0
			for index in counter:
				tf = counter[index][0] / doc_length
				log_tf = 1 + math.log(tf)
				total = total + log_tf * log_tf
				# The fourth Posting field is presumably a tf-idf placeholder,
				# filled in later. Postings are sorted by doc_id at dump time
				# (see dump()).
				if index in self.index:
					self.index[index].append(Posting(doc_id, url, tf, 0, counter[index][1]))
				else:
					self.index[index] = [Posting(doc_id, url, tf, 0, counter[index][1])]

			self.indexer.weight[doc_id] = math.sqrt(total)
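			# weight[doc_id] is the Euclidean norm of the document's log-tf
			# vector, presumably used later to length-normalize (cosine)
			# scores. Note that because tf here is a fraction (count divided
			# by doc_length), 1 + ln(tf) can go negative for rare terms; the
			# classic scheme applies 1 + log to raw counts instead.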

			# Flush the in-memory index periodically. sys.getsizeof only
			# measures the dict's own hash table, not the postings it
			# references, so this one-megabyte threshold is a rough flush
			# heuristic, not a true cap on memory use.
			if sys.getsizeof(self.index) > 1000000:
				self.dump()
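
# Hedged sketch (not part of the original pipeline): the real merge lives in
# the indexer, which collects the names passed to add_partial_index and is not
# shown in this file. Because each ".index" table is sorted by term, the
# partials can be k-way merged with a heap, concatenating posting lists term
# by term. The function name, paths, and output format are assumptions.
def merge_partials_sketch(partial_names, out_path='temp/merged.partial'):
	import heapq  # kept local so the sketch stays self-contained
	tables, files = [], []
	for name in partial_names:
		with open('temp/' + name + '.index') as f:
			tables.append(json.load(f)['index'])  # [[term, offset], ...], sorted
		files.append(open('temp/' + name + '.partial'))
	# Heap entries: (term, stream number, row in that stream's table).
	heap = [(tables[s][0][0], s, 0) for s in range(len(tables)) if tables[s]]
	heapq.heapify(heap)
	with open(out_path, 'w') as out:
		current_term, merged = None, []
		while heap:
			term, s, pos = heapq.heappop(heap)
			files[s].seek(tables[s][pos][1])
			postings = json.loads(files[s].readline())['postings']
			if current_term is not None and term != current_term:
				# A smaller term group is complete; write it out.
				out.write(json.dumps({'index_value': current_term, 'postings': merged}) + '\n')
				merged = []
			current_term = term
			merged.extend(postings)
			if pos + 1 < len(tables[s]):
				heapq.heappush(heap, (tables[s][pos + 1][0], s, pos + 1))
		if current_term is not None:
			out.write(json.dumps({'index_value': current_term, 'postings': merged}) + '\n')
	for f in files:
		f.close()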
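
# Hedged usage sketch: Worker expects an "indexer" exposing get_next_file(),
# add_partial_index(name), and a `weight` dict. The real Indexer class is not
# shown in this file, so the stub below is a hypothetical stand-in that only
# demonstrates the wiring, using an empty work queue.
if __name__ == '__main__':
	class _StubIndexer:
		def __init__(self, files):
			self.files = list(files)
			self.weight = dict()
			self.partials = []
		def get_next_file(self):
			# A real indexer must hand each path out exactly once across
			# threads (e.g. guard this pop with a lock).
			return self.files.pop() if self.files else None
		def add_partial_index(self, name):
			self.partials.append(name)

	stub = _StubIndexer([])  # empty queue: each worker dumps and exits at once
	workers = [Worker(i, stub) for i in range(2)]
	for w in workers:
		w.start()
	for w in workers:
		w.join()
	print('partial indexes written:', stub.partials)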