Change the way document weight is calculated
parent fb88efd510
commit cf81cf5c6a
.gitignore (vendored): 2 changed lines
@@ -1,3 +1,5 @@
/data/
*.shelve
/__pycache__/
+/test/
+merged*
File diff suppressed because one or more lines are too long
indexer.py: 92 changed lines
@@ -33,6 +33,7 @@ import re
#Logging postings
from posting import Posting
from worker import Worker
+from worker_weight import Worker_Weight

class Node():
    index_value = ''
@@ -43,9 +44,9 @@ class Index():
    index = list()

class Indexer():
-    def __init__(self,list_partials,weight,data_paths,worker_factory=Worker):
+    def __init__(self,list_partials,weight,data_paths,worker_factory=Worker,worker_weight_factory=Worker_Weight):
        #Config stuffs
-        self.path = "test/"
+        self.path = "data/DEV/"
        self.num_doc = 0
        self.list_partials = list_partials
        self.weight = weight
@@ -54,10 +55,12 @@ class Indexer():
        self.data_paths_lock = Lock()
        self.list_partials_lock = Lock()
        self.workers = list()
-        self.merged_index = open("merged_index.full",'w')
-        self.merged_index_index = open("merged_index.index" ,'w')

        self.worker_factory = worker_factory

+        self.weight_workers = list()
+        self.worker_weight_factory = worker_weight_factory

    def start_async(self):
        self.workers = [
            self.worker_factory(worker_id,self)
@@ -73,6 +76,21 @@ class Indexer():
        for worker in self.workers:
            worker.join()

+    def join_weight(self):
+        for worker in self.weight_workers:
+            worker.join()
+
+    def start_async_weight(self):
+        self.weight_workers = [
+            self.worker_weight_factory(worker_id,self)
+            for worker_id in range(1)]
+        for worker in self.weight_workers:
+            worker.start()
+
+    def start_weight(self):
+        self.start_async_weight()
+        self.join_weight()
+
    def get_postings(self,index):
        merged_index_index = open("merged_index.index" ,'r')
        merged_index = open("merged_index.full",'r')
@@ -88,50 +106,8 @@ class Indexer():

    def set_total_weight(self):
        self.get_data_path()
-        merged_index_index = open("merged_index.index" ,'r')
-        merged_index = open("merged_index.full",'r')
-        merged_index_index.seek(0,0)
-        json_value = merged_index_index.readline()
-        data = json.loads(json_value)
-        index_index = dict(data['index'])
+        self.start_weight()
-
-        for doc in self.data_paths:
-            file_load = open(doc)
-            data = json.load(file_load)
-            soup = BeautifulSoup(data["content"],features="lxml")
-            url = data['url']
-            doc_id = doc[doc.rfind('/')+1:-5]
-            # Gets a cleaner version text comparative to soup.get_text()
-            clean_text = ' '.join(soup.stripped_strings)
-            # Looks for large white space, tabbed space, and other forms of spacing and removes it
-            # Regex expression matches for space characters excluding a single space or words
-            clean_text = re.sub(r'\s[^ \w]', '', clean_text)
-            # Tokenizes text and joins it back into an entire string. Make sure it is an entire string is essential for get_tf_idf to work as intended
-            clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
-            # Stems tokenized text
-            clean_text = " ".join([self.stemmer.stem(i) for i in clean_text.split()])
-            # Put clean_text as an element in a list because get_tf_idf workers properly with single element lists
-
-            tokens = word_tokenize(clean_text)
-
-            tokens = set(tokens)
-
-            total = 0
-            for token in tokens:
-                to_seek = index_index[token]
-                merged_index.seek(to_seek,0)
-                json_value = merged_index.readline()
-                data = json.loads(json_value)
-
-                for posting in data['postings']:
-                    if posting['doc_id'] == doc_id:
-                        total = total + posting['tf_idf']* posting['tf_idf']
-                        break
-
-            self.weight[doc_id] = math.sqrt(total)
-
-        with open('docs.weight','w') as f:
-            f.write(json.dumps(self.weight))


    def get_weight(self,doc_id):
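Note on the lookup pattern: both the inline loop removed above and the new Worker_Weight read the same two files produced by merge(). merged_index.index holds one JSON object whose 'index' field maps each token to a byte offset in merged_index.full, and the line at that offset is a JSON node carrying the token's postings. A minimal sketch of that lookup, assuming both files were written by Indexer.merge(); the helper names here are illustrative, not part of the repo:

import json

def load_index_index(path="merged_index.index"):
    # The index-of-index is a single JSON line; data['index'] is a list of
    # [token, byte_offset] pairs, so dict() turns it into a lookup table.
    with open(path, 'r') as f:
        data = json.loads(f.readline())
    return dict(data['index'])

def get_postings_for(token, index_index, merged_index_path="merged_index.full"):
    # Seek straight to the token's line in the full index and parse that one line.
    with open(merged_index_path, 'r') as f:
        f.seek(index_index[token], 0)
        node = json.loads(f.readline())
    return node['postings']  # list of postings with 'doc_id' and 'tf_idf' fields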
@@ -193,6 +169,8 @@ class Indexer():
            partial_file.seek(0,0)

        pointers = [0]*num_indices
+        merged_index = open("merged_index.full",'w')
+        merged_index_index = open("merged_index.index" ,'w')

        while(True):

@@ -226,30 +204,30 @@ class Indexer():
            node.postings.sort(key=lambda y:y['doc_id'])
            for posting in node.postings:
                posting['tf_idf'] = posting['tf_raw']*math.log(self.num_doc/len(node.postings))
-            full_index.index.append((value,self.merged_index.tell()))
+            full_index.index.append((value,merged_index.tell()))
            full_index.length = full_index.length + 1
            jsonStr = json.dumps(node,default=lambda o: o.__dict__,sort_keys=False)
-            self.merged_index.write(jsonStr + '\n')
+            merged_index.write(jsonStr + '\n')

        full_index.index.sort(key=lambda y:y[0])
        jsonStr =json.dumps(full_index, default=lambda o: o.__dict__,sort_keys=False)
-        self.merged_index_index.write(jsonStr)
+        merged_index_index.write(jsonStr)

        for partial_index in self.list_partials:
            os.remove("temp/" + partial_index+'.partial')
            os.remove("temp/" + partial_index+'.index')

-        self.merged_index_index.close()
-        self.merged_index.close()

+        merged_index_index.close()
+        merged_index.close()


def main():
    indexer = Indexer(list(),dict(),list())
-    indexer.get_data_path()
-    print("We have " + str(len(indexer.data_paths)) + " documents to go through !" )
-    indexer.start()
-    indexer.merge()
+    #indexer.get_data_path()
+    #print("We have " + str(len(indexer.data_paths)) + " documents to go through !" )
+    #indexer.start()
+    #indexer.merge()
    print("Finished merging into 1 big happy family")
    indexer.set_total_weight()
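Note on the new constructor hook: worker_weight_factory is threaded through Indexer the same way worker_factory already is, so any callable taking (worker_id, indexer) and returning a Thread-like object with start() and join() will work, which keeps the weighting stage swappable in tests. A small sketch with a hypothetical stub, assuming indexer.py is importable and that the constructor has no side effects beyond what this diff shows:

from threading import Thread

from indexer import Indexer  # assumes indexer.py is on the import path


class StubWeightWorker(Thread):
    # Hypothetical drop-in for Worker_Weight: same (worker_id, indexer) signature.
    def __init__(self, worker_id, indexer):
        super().__init__(daemon=True)
        self.worker_id = worker_id
        self.indexer = indexer

    def run(self):
        # No real weighting; just prove the lifecycle hooks are exercised.
        print("stub weight worker", self.worker_id, "started")


# main() builds Indexer(list(), dict(), list()) the same way; here we only swap the factory.
indexer = Indexer(list(), dict(), list(), worker_weight_factory=StubWeightWorker)
indexer.start_weight()  # start_async_weight() then join_weight(), using the stub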
stemmer.py: 18 changed lines (file deleted)
@@ -1,18 +0,0 @@
-#Multiple implementation of stemming here please
-class Stemmer():
-
-    def __init__(self,mode, data):
-        #Different type of stemmer = different modes
-        self.mode = mode
-        self.data = data
-
-    def stem(self):
-        #Do stuff here
-        if(self.mode == 0):
-            #Do stemmer 1
-            return #stemmed data
-        #....
-
-    def #name of stemmer 1
-
-    def #name of stemmer 2
worker_weight.py: 109 added lines (new file)
@@ -0,0 +1,109 @@
+from threading import Thread
+import json
+import os
+
+from bs4 import BeautifulSoup
+import re
+import math
+import time
+#Data process
+from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+
+from posting import Posting
+
+
+import sys
+
+class Node():
+    index_value = ''
+    postings = list()
+
+class Index():
+    length = 0
+    index = list()
+
+class Worker_Weight(Thread):
+    def __init__(self,worker_id,indexer):
+        self.indexer = indexer
+        self.stemmer = PorterStemmer()
+        self.worker_id = worker_id
+        self.num_partial = 0
+        self.weight = dict()
+        merged_index_index = open("merged_index.index" ,'r')
+        self.merged_index = open("merged_index.full",'r')
+        merged_index_index.seek(0,0)
+        json_value = merged_index_index.readline()
+        data = json.loads(json_value)
+        self.index_index = dict(data['index'])
+
+        super().__init__(daemon=True)
+
+
+    def dump(self):
+        with open("docs"+str(self.worker_id)+".weight",'w') as f:
+            f.write(json.dumps(self.weight))
+
+    def run(self):
+        while True:
+            target = self.indexer.get_next_file()
+            if not target:
+                self.dump()
+                print("Worker " + str(self.worker_id) + " died")
+                break
+
+
+            print("Worker " + str(self.worker_id) + " weighting " + target)
+            file_load = open(target)
+            data = json.load(file_load)
+            soup = BeautifulSoup(data["content"],features="lxml")
+            url = data['url']
+            doc_id = target[target.rfind('/')+1:-5]
+            # Gets a cleaner version text comparative to soup.get_text()
+            clean_text = ' '.join(soup.stripped_strings)
+            # Looks for large white space, tabbed space, and other forms of spacing and removes it
+            # Regex expression matches for space characters excluding a single space or words
+            clean_text = re.sub(r'\s[^ \w]', '', clean_text)
+            # Tokenizes text and joins it back into an entire string. Make sure it is an entire string is essential for get_tf_idf to work as intended
+            clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
+            # Stems tokenized text
+            clean_text = " ".join([self.stemmer.stem(i) for i in clean_text.split()])
+            # Put clean_text as an element in a list because get_tf_idf workers properly with single element lists
+
+            tokens = word_tokenize(clean_text)
+
+            total = 0
+
+
+            counter = dict()
+            #We calculating tf_raw, and positionals here
+            for i in range(len(tokens)):
+                word = tokens[i]
+                if word in counter:
+                    counter[word]= counter[word] + 1
+                else:
+                    counter[word] = 1
+
+            doc_length = len(tokens)
+
+            for index in tokens:
+                to_seek = self.index_index[index]
+                self.merged_index.seek(to_seek,0)
+                json_value = self.merged_index.readline()
+
+                data = json.loads(json_value)
+                df = len(data['postings'])
+                tf = counter[index]/doc_length
+                idf = math.log(self.indexer.num_doc/df)
+                tf_idf = tf*idf
+                total = total + tf_idf*tf_idf
+
+            self.weight[doc_id] = math.sqrt(total)
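Note on the arithmetic: what Worker_Weight stores per document is the Euclidean length of its tf-idf vector, where tf is the token's count divided by the document length and idf is log(num_doc / df), with df taken from the postings list in the merged index. A minimal sketch of that computation under those assumptions; the function and parameter names are illustrative, and unlike the run() loop above (which iterates over every token occurrence) it sums over distinct tokens via the counter:

import math

def document_weight(counter, doc_length, df_lookup, num_doc):
    # counter    -- dict of token -> raw count in the document
    # doc_length -- total number of tokens in the document
    # df_lookup  -- dict of token -> document frequency from the merged index
    # num_doc    -- total number of indexed documents
    total = 0.0
    for token, count in counter.items():
        tf = count / doc_length
        idf = math.log(num_doc / df_lookup[token])
        total += (tf * idf) ** 2
    return math.sqrt(total)

# Example with made-up numbers: a 3-token document in a 1000-document collection.
print(document_weight({'apple': 2, 'pie': 1}, 3, {'apple': 50, 'pie': 200}, 1000))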