# Search_Engine/worker.py
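
# Worker thread for the indexer: parses crawled JSON documents with BeautifulSoup,
# tokenizes and stems their text, and writes partial inverted indexes to disk.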

from threading import Thread
import json
import os
from bs4 import BeautifulSoup
import re
# Text processing
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from posting import Posting
import math
import sys
class Node():
    # One inverted-index entry: a term and its list of postings
    def __init__(self):
        self.index_value = ''
        self.postings = list()

class Index():
    # Index-of-the-index: entry count plus sorted (term, byte offset) pairs
    def __init__(self):
        self.length = 0
        self.index = list()
class Worker(Thread):
def __init__(self,worker_id,indexer):
self.indexer = indexer
self.stemmer = PorterStemmer()
self.worker_id = worker_id
self.num_partial = 0
self.index = dict()
super().__init__(daemon=True)
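    # dump() serializes the in-memory index to disk as a pair of files:
    #   temp/<worker>_<n>.partial : one JSON-encoded Node (term + postings) per line
    #   temp/<worker>_<n>.index   : a JSON Index mapping each term to its byte offset
    #                               in the .partial file, sorted by term
    # The partial index is then registered with the indexer and the in-memory
    # index is cleared so memory usage stays bounded.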
def dump(self):
        part_index = Index()
cur_partial_index_str = "temp/" + str(self.worker_id) + "_" + str(self.num_partial) + '.partial'
cur_partial_index_index_str = "temp/" + str(self.worker_id) + "_" + str(self.num_partial) + '.index'
cur_partial_index = open(cur_partial_index_str,'w')
cur_partial_index_index = open(cur_partial_index_index_str,'w')
for key in self.index:
node = Node()
node.index_value = key
node.postings = self.index[key]
jsonStr = json.dumps(node, default=lambda o: o.__dict__,sort_keys=False)
part_index.index.append((node.index_value,cur_partial_index.tell()))
cur_partial_index.write(jsonStr + '\n')
part_index.length = part_index.length + 1
part_index.index.sort(key=lambda y:y[0])
        jsonStr = json.dumps(part_index, default=lambda o: o.__dict__,sort_keys=False)
        cur_partial_index_index.write(jsonStr)
        # Close both files so the partial index is fully flushed before it is registered
        cur_partial_index.close()
        cur_partial_index_index.close()
self.indexer.add_partial_index(str(self.worker_id) + "_" + str(self.num_partial))
self.num_partial = self.num_partial + 1
self.index.clear()
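    # run() repeatedly pulls crawled pages from the indexer until get_next_file()
    # returns a falsy value: each page is parsed, tokenized and stemmed, term
    # frequencies and positions are recorded, and the in-memory index is dumped
    # to a partial index file whenever it grows too large.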
def run(self):
while True:
target = self.indexer.get_next_file()
if not target:
self.dump()
print("Worker " + str(self.worker_id) + " died")
break
            file_load = open(target)
            data = json.load(file_load)
            file_load.close()
soup = BeautifulSoup(data["content"],features="lxml")
doc_id = target[target.rfind('/')+1:-5]
url = data['url']
print("Worker " + str(self.worker_id) + " working on " + url)
important = {'b' : [], 'h1' : [], 'h2' : [], 'h3' : [], 'title' : []}
for key_words in important.keys():
for i in soup.findAll(key_words):
for word in word_tokenize(i.text):
important[key_words].append(self.stemmer.stem(word))
            # Produces cleaner text than soup.get_text()
            clean_text = ' '.join(soup.stripped_strings)
            # Collapse stray spacing: the regex removes a whitespace character together
            # with the non-space, non-word character that follows it (tabs, newlines, punctuation)
            clean_text = re.sub(r'\s[^ \w]', '', clean_text)
            # Keep only purely alphanumeric tokens and join them back into a single string;
            # keeping it as one string is essential for get_tf_idf to work as intended
            clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
            # Stem each remaining token
            clean_text = " ".join([self.stemmer.stem(i) for i in clean_text.split()])
            # Tokenize the cleaned, stemmed text for counting
            tokens = word_tokenize(clean_text)
            # counter maps each token to [raw count, list of positions]
            counter = dict()
            # Compute raw term frequencies and positional lists
for i in range(len(tokens)):
word = tokens[i]
if word in counter:
counter[word][0] = counter[word][0] + 1
counter[word][1].append(i)
else:
counter[word] = [1,list()]
counter[word][1].append(i)
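            # Example: tokens ["run", "fast", "run"] produce
            # counter == {"run": [2, [0, 2]], "fast": [1, [1]]}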
doc_length = len(tokens)
total = 0
for index in counter:
tf = counter[index][0]/doc_length
log_tf = 1 + math.log(tf)
total = total + log_tf * log_tf
if index in self.index:
postings = self.index[index]
postings.append(Posting(doc_id,url,tf,0,counter[index][1]))
else:
self.index[index] = list()
self.index[index].append(Posting(doc_id,url,tf,0,counter[index][1]))
self.index[index].sort(key=lambda y:y.doc_id)
self.indexer.weight[doc_id] = math.sqrt(total)
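            # math.sqrt(total) is the Euclidean length of the document's log-tf vector,
            # presumably used later for length normalization when scoring queries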
            # Dump a partial index once the dict container passes 1,000,000 bytes;
            # sys.getsizeof ignores the keys and postings themselves, so the full
            # index is roughly 10 MB in RAM at that point
            if sys.getsizeof(self.index) > 1000000:
                self.dump()
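

# Minimal usage sketch (illustrative only): the real Indexer is defined elsewhere
# in the project, so this stub simply mirrors the interface worker.py relies on
# (get_next_file, add_partial_index, and a weight dict) to exercise the class in
# isolation. It assumes a temp/ directory exists and that data/*.json (a
# hypothetical path) holds crawled pages with "url" and "content" fields.
if __name__ == "__main__":
    import glob

    class _StubIndexer:
        def __init__(self, files):
            self.files = list(files)
            self.weight = dict()

        def get_next_file(self):
            # The real indexer presumably hands files out in a thread-safe way;
            # with a single worker that does not matter here
            return self.files.pop() if self.files else None

        def add_partial_index(self, partial_name):
            print("Partial index ready: " + partial_name)

    worker = Worker(0, _StubIndexer(glob.glob("data/*.json")))
    worker.start()
    worker.join()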