We are looking for TF_WEIGHT not IDF_WEIGHT, make things A LOT CHEAPER
commit 8d2dcea172
parent cf81cf5c6a
docs.weight (new file, 1 line)
File diff suppressed because one or more lines are too long
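For reference, the two document weights being swapped by this commit, written out from the code below (notation is editorial; tf(t,d) is the raw count of term t divided by the document's token count, as computed in both workers):

old, from the deleted worker_weight.py:  weight(d) = sqrt( sum over t of ( tf(t,d) * log(N / df(t)) )^2 )
new, added to worker.py:                 weight(d) = sqrt( sum over t of ( 1 + log(tf(t,d)) )^2 )

The idf factor needs df(t), which only exists after merging, so the old weight required a second pass that re-read every document and seeked into merged_index.full once per token. The new weight uses nothing beyond the per-document counts the indexing pass already holds, which is what makes it so much cheaper.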
indexer.py (46 lines changed)
@@ -46,7 +46,7 @@ class Index():
 class Indexer():
     def __init__(self,list_partials,weight,data_paths,worker_factory=Worker,worker_weight_factory=Worker_Weight):
         #Config stuffs
-        self.path = "data/DEV/"
+        self.path = "test/"
         self.num_doc = 0
         self.list_partials = list_partials
         self.weight = weight
@@ -76,20 +76,6 @@ class Indexer():
         for worker in self.workers:
             worker.join()
 
-    def join_weight(self):
-        for worker in self.weight_workers:
-            worker.join()
-
-    def start_async_weight(self):
-        self.weight_workers = [
-            self.worker_weight_factory(worker_id,self)
-            for worker_id in range(1)]
-        for worker in self.weight_workers:
-            worker.start()
-
-    def start_weight(self):
-        self.start_async_weight()
-        self.join_weight()
 
     def get_postings(self,index):
         merged_index_index = open("merged_index.index" ,'r')
@@ -104,11 +90,11 @@ class Indexer():
         data = json.loads(json_value)
         return data['postings']
 
-    def set_total_weight(self):
-        self.get_data_path()
-        self.start_weight()
+    def set_weight(self):
+        weight_file = open('docs.weight','w')
+        jsonStr =json.dumps(self.weight, default=lambda o: o.__dict__,sort_keys=False)
+        weight_file.write(jsonStr)
+        weight_file.close()
 
     def get_weight(self,doc_id):
         weight = open('docs.weight','r')
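A minimal sketch of the docs.weight round trip implied by this hunk, assuming get_weight simply loads the JSON map and looks up the doc_id (only the open('docs.weight','r') call is visible in the diff; names other than the file name are illustrative):

import json

def write_weights(weights, path='docs.weight'):
    # weights maps doc_id -> sqrt(sum of squared log-tf values), built by the workers
    with open(path, 'w') as f:
        f.write(json.dumps(weights))

def read_weight(doc_id, path='docs.weight'):
    # assumed read path: load the whole map, then index by doc_id
    with open(path, 'r') as f:
        weights = json.loads(f.read())
    return weights[doc_id]

If get_weight ends up being called many times per query, loading the map once and keeping it in memory would avoid re-reading the file on every lookup.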
@@ -223,12 +209,22 @@ class Indexer():
 
 def main():
     indexer = Indexer(list(),dict(),list())
-    #indexer.get_data_path()
-    #print("We have " + str(len(indexer.data_paths)) + " documents to go through !" )
-    #indexer.start()
-    #indexer.merge()
+    indexer.get_data_path()
+    print("We have " + str(len(indexer.data_paths)) + " documents to go through !" )
+    indexer.start()
+    indexer.merge()
     print("Finished merging into 1 big happy family")
-    indexer.set_total_weight()
+    indexer.set_weight()
 
+    tic = time.perf_counter()
+    indexer.get_postings('artifici')
+    toc = time.perf_counter()
+    print(f"Took {toc - tic:0.4f} seconds to get postings for artifici")
+    tic = time.perf_counter()
+    indexer.get_weight('00ba3af6a00b7cfb4928e5d234342c5dc46b4e31714d4a8f315a2dd4d8e49860')
+    print(f"Took {toc - tic:0.4f} seconds to get weight for some random page ")
+    toc = time.perf_counter()
 
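For context on what main() is now timing: merged_index.index appears to hold a term-to-byte-offset map into merged_index.full (this is how the deleted worker_weight.py reads it), so a postings lookup is one seek plus one readline. A sketch under that assumption; the file names come from the diff, but the function body is reconstructed, not copied:

import json

def lookup_postings(term, index_path='merged_index.index', full_path='merged_index.full'):
    # the offset map is stored as a single JSON line under the 'index' key
    with open(index_path, 'r') as f:
        offsets = dict(json.loads(f.readline())['index'])
    with open(full_path, 'r') as f:
        f.seek(offsets[term], 0)           # jump straight to the term's record
        data = json.loads(f.readline())    # one JSON object per term
    return data['postings']

Example: lookup_postings('artifici') mirrors the indexer.get_postings('artifici') call that main() wraps with time.perf_counter().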
worker.py (13 lines changed)
@@ -12,6 +12,7 @@ from nltk.stem import PorterStemmer
 
 from posting import Posting
 
+import math
 
 import sys
 
@@ -110,17 +111,23 @@ class Worker(Thread):
                    counter[word][1].append(i)
 
            doc_length = len(tokens)
+            total = 0
            for index in counter:
+                tf = counter[index][0]/doc_length
+                log_tf = 1 + math.log(tf)
+                total = total + log_tf * log_tf
                if index in self.index:
                    postings = self.index[index]
-                    postings.append(Posting(doc_id,url,counter[index][0]/doc_length,0,counter[index][1]))
+                    postings.append(Posting(doc_id,url,tf,0,counter[index][1]))
                else:
                    self.index[index] = list()
-                    self.index[index].append(Posting(doc_id,url,counter[index][0]/doc_length,0,counter[index][1]))
+                    self.index[index].append(Posting(doc_id,url,tf,0,counter[index][1]))
                self.index[index].sort(key=lambda y:y.doc_id)
 
+            self.indexer.weight[doc_id] = math.sqrt(total)
+
            #10 Megabytes index (in Ram approx)
-            if sys.getsizeof(self.index) > 10000000:
+            if sys.getsizeof(self.index) > 1000000:
                self.dump()
 
 
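Pulled out of the worker loop, the new per-document weight added above amounts to a single pass over the term counts. A standalone restatement (counter maps term -> [raw count, positions] exactly as worker.py builds it; the function name is editorial):

import math

def doc_weight(counter, doc_length):
    # doc_length is the document's total token count
    total = 0
    for count, _positions in counter.values():
        tf = count / doc_length           # normalised term frequency
        log_tf = 1 + math.log(tf)         # log-scaled tf, as in worker.py
        total += log_tf * log_tf
    return math.sqrt(total)               # the value stored per doc_id in docs.weight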
worker_weight.py (file deleted, 109 lines)
@@ -1,109 +0,0 @@
-from threading import Thread
-import json
-import os
-
-from bs4 import BeautifulSoup
-import re
-import math
-import time
-#Data process
-from nltk.tokenize import word_tokenize
-from nltk.stem import PorterStemmer
-
-from posting import Posting
-
-
-import sys
-
-class Node():
-    index_value = ''
-    postings = list()
-
-class Index():
-    length = 0
-    index = list()
-
-class Worker_Weight(Thread):
-    def __init__(self,worker_id,indexer):
-        self.indexer = indexer
-        self.stemmer = PorterStemmer()
-        self.worker_id = worker_id
-        self.num_partial = 0
-        self.weight = dict()
-        merged_index_index = open("merged_index.index" ,'r')
-        self.merged_index = open("merged_index.full",'r')
-        merged_index_index.seek(0,0)
-        json_value = merged_index_index.readline()
-        data = json.loads(json_value)
-        self.index_index = dict(data['index'])
-
-        super().__init__(daemon=True)
-
-
-    def dump(self):
-        with open("docs"+str(self.worker_id)+".weight",'w') as f:
-            f.write(json.dumps(self.weight))
-
-    def run(self):
-        while True:
-            target = self.indexer.get_next_file()
-            if not target:
-                self.dump()
-                print("Worker " + str(self.worker_id) + " died")
-                break
-
-
-            print("Worker " + str(self.worker_id) + " weighting " + target)
-            file_load = open(target)
-            data = json.load(file_load)
-            soup = BeautifulSoup(data["content"],features="lxml")
-            url = data['url']
-            doc_id = target[target.rfind('/')+1:-5]
-            # Gets a cleaner version text comparative to soup.get_text()
-            clean_text = ' '.join(soup.stripped_strings)
-            # Looks for large white space, tabbed space, and other forms of spacing and removes it
-            # Regex expression matches for space characters excluding a single space or words
-            clean_text = re.sub(r'\s[^ \w]', '', clean_text)
-            # Tokenizes text and joins it back into an entire string. Make sure it is an entire string is essential for get_tf_idf to work as intended
-            clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
-            # Stems tokenized text
-            clean_text = " ".join([self.stemmer.stem(i) for i in clean_text.split()])
-            # Put clean_text as an element in a list because get_tf_idf workers properly with single element lists
-
-            tokens = word_tokenize(clean_text)
-
-            total = 0
-
-
-            counter = dict()
-            #We calculating tf_raw, and positionals here
-            for i in range(len(tokens)):
-                word = tokens[i]
-                if word in counter:
-                    counter[word]= counter[word] + 1
-                else:
-                    counter[word] = 1
-
-            doc_length = len(tokens)
-
-            for index in tokens:
-                to_seek = self.index_index[index]
-                self.merged_index.seek(to_seek,0)
-                json_value = self.merged_index.readline()
-
-                data = json.loads(json_value)
-                df = len(data['postings'])
-                tf = counter[index]/doc_length
-                idf = math.log(self.indexer.num_doc/df)
-                tf_idf = tf*idf
-                total = total + tf_idf*tf_idf
-
-            self.weight[doc_id] = math.sqrt(total)
-
-