4 Commits

Author SHA1 Message Date
Aaron
e7c4170cc2 Update indexer.py
had incorrect implementation
2022-05-12 17:58:31 -07:00
inocturnis
c4b3512df7 Changed tf_idf model into the new one, try it on the current dataset 2022-05-12 15:00:09 -07:00
iNocturnis
c8640001c7 Merge branch 'tf_idf' 2022-05-12 14:30:22 -07:00
Lacerum
f5610eaa62 tf-idf ngrams and now returns dict rather than
score
2022-05-11 14:46:32 -07:00
12 changed files with 291 additions and 700 deletions

2
.gitignore vendored
View File

@@ -1,5 +1,3 @@
/data/
*.shelve
/__pycache__/
/test/
merged*

View File

@@ -1,8 +0,0 @@
### To create index:
1. Make sure that all requirements are installed: check `requirements.txt` and install them using `pip install -r requirements.txt`.
2. Run `python indexer.py` to build index, this may take some time to run.
3. Index is now created.
### Start search interface:
Run `python launcher.py` to start the search interface.
### Perform query:
To perform a search, simply enter a query in the textbox and click search. The top results will be displayed.

View File

@@ -1,52 +0,0 @@
### Bad:
- computer science - common
- university of california irvine -common
- donald bren - common
- uci - common
- informatics - common
- The Donald Bren School of Information and Computer Sciences - long and common
- toilet - not likely to be found easily
- perfume - not likely to be found
- SPY×FAMILY - should not exist in data
- undergraduate - likely to be on tons of pages
### Good to Meh:
- liquids in labs - uncommon word with common
- Alberto Krone-Martins - should have a good amount of results but not absurd
- Advising & Planning - should be specific but not too common
- Honors Program - ^
- Papaefthymiou - similar to the martins query
- General information - there should be quite a few pages with this but not tons
- Prerequisite Clearing System - has some common and uncommon terms
- Recruiting - not stupid common
- counseling - ^ and should only be on a subset of pages
- social justice - specific terms that should appear without being costly
### Others tested:
- masters of computer science - not super common but will have a good amount of pages
- thornton ics46 notes - name + class + common
- Theory of Computation - two terms which have high count in papers
- facility distribution - two terms which don't really make sense together
- artificial intelligence history - two common terms with semi-common
- prospective alumni - should have very few instances of both terms but should be found together
- enrollment window - should be on only a couple of pages
- available capstone sponsorship - ^
- spring seminars - common with term that may be somewhat restricted
- hackuci - two terms into one that exists in dataset
- ucinetid help - specific term with common
- course restrictions - specific pages
- project management - a course name
- yelan research - term should not exist + common
- hybrid-learning - common phrase
- genshin is a computer game - contains terms that exist and others that don't
- computable AI machine learning big data - sentence of CS buzz words (really really common)
- Publications & Technical Reports - in json file
- Tutor coordinators - in many json (bold, title, and body)
- Death Image Service - in some weird areas
- send anonymous email - only in some
### Things done for improvement
1. Create index of index for substantial gain in efficiency and speed.
2. Split TF-IDF into TF and IDF for more specific calculations when needed without the whole computation. This also removes the reliance on an external library for TF-IDF.
3. Switched from using IDF & weight, to TF & weight for helping with the overall weight.
4. Dropped indexing and searching of unigram, bigram, and trigrams.
5. Add length of document during indexing for improved speed via normalization calculation.

File diff suppressed because one or more lines are too long

View File

@@ -17,8 +17,6 @@ from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
from threading import Lock
import math
#Data process
@@ -34,196 +32,204 @@ import re
from posting import Posting
from worker import Worker
class Node():
index_value = ''
postings = list()
class Index():
length = 0
index = list()
class Indexer():
def __init__(self,list_partials,weight,data_paths,worker_factory=Worker):
def __init__(self,restart,trimming):
#Config stuffs
self.path = "data/DEV"
self.num_doc = 0
self.list_partials = list_partials
self.weight = weight
self.data_paths = data_paths
self.path = "data/DEV/"
self.restart = restart
self.trimming = trimming
self.stemmer = PorterStemmer()
self.data_paths_lock = Lock()
self.list_partials_lock = Lock()
self.workers = list()
self.worker_factory = worker_factory
#Shelves for index
#https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html
#https://www.irishtimes.com/news/science/how-many-numbers-begin-with-a-1-more-than-30-per-cent-1.4162466
#According to this will be how we split things
#Save #1 = ABCD + (1) ~ 18.3% of words
#Save #2 = EFGHIJK + (2-3)~ 27.1% of words
#Save #3 = LMNOPQ + (4-7) ~ 25.4% of words
#Save #4 = RSTUVWXYZ + (8-9)~ 29.2% of words
#Save #5 = Special characters
if os.path.exists("save_1.shelve") and restart:
os.remove("save_1.shelve")
if os.path.exists("save_2.shelve") and restart:
os.remove("save_2.shelve")
if os.path.exists("save_3.shelve") and restart:
os.remove("save_3.shelve")
if os.path.exists("save_4.shelve") and restart:
os.remove("save_4.shelve")
if os.path.exists("save_5.shelve") and restart:
os.remove("save_5.shelve")
def start_async(self):
self.workers = [
self.worker_factory(worker_id,self)
for worker_id in range(8)]
for worker in self.workers:
worker.start()
self.save_1 = shelve.open("save_1.shelve")
self.save_1_lock = threading.Lock()
self.save_2 = shelve.open("save_2.shelve")
self.save_2_lock = threading.Lock()
self.save_3 = shelve.open("save_3.shelve")
self.save_3_lock = threading.Lock()
self.save_4 = shelve.open("save_4.shelve")
self.save_4_lock = threading.Lock()
self.save_5 = shelve.open("save_5.shelve")
self.save_5_lock = threading.Lock()
def start(self):
self.start_async()
self.join()
print(len(list(self.save_1.keys())))
print(len(list(self.save_2.keys())))
print(len(list(self.save_3.keys())))
print(len(list(self.save_4.keys())))
print(len(list(self.save_5.keys())))
def join(self):
for worker in self.workers:
worker.join()
def save_index(self,word,posting):
cur_save = self.get_save_file(word)
lock = self.get_save_lock(word)
lock.acquire()
shelve_list = list()
try:
shelve_list = cur_save[word]
shelve_list.append(posting)
tic = perf_counter()
shelve_list.sort(key=lambda x: x.tf_idf, reverse = True)
toc = perf_counter()
if toc - tic > 1 :
print("Took " + str(toc - tic) + "seconds to sort shelve list !")
cur_save.sync()
lock.release()
except:
shelve_list.append(posting)
cur_save[word] = shelve_list
cur_save.sync()
lock.release()
def get_save_file(self,word):
#return the correct save depending on the starting letter of word
word_lower = word.lower()
if re.match(r"^[a-d0-1].*",word_lower):
return self.save_1
elif re.match(r"^[e-k2-3].*",word_lower):
return self.save_2
elif re.match(r"^[l-q4-7].*",word_lower):
return self.save_3
elif re.match(r"^[r-z8-9].*",word_lower):
return self.save_4
else:
print(word)
print("You have somehow went beyond the magic")
return self.save_5
def get_save_lock(self,word):
word_lower = word.lower()
if re.match(r"^[a-d0-1].*",word_lower):
return self.save_1_lock
elif re.match(r"^[e-k2-3].*",word_lower):
return self.save_2_lock
elif re.match(r"^[l-q4-7].*",word_lower):
return self.save_3_lock
elif re.match(r"^[r-z8-9].*",word_lower):
return self.save_4_lock
else:
print(word)
print("You have somehow went beyond the magic")
return self.save_5_lock.acquire()
# I have a test file (mytest.py) with pandas but couldn't figure out how to grab just a single cell.
# so I came up with this, if anyone knows how to get a single cell and can explain it to
# me I would love to know, as I think that method might be quicker, maybe, idk it like
# 4am
# retuns a dict of words/n-grams with their assosiated tf-idf score *can also return just a single score or a pandas dataframe
# https://stackoverflow.com/questions/34449127/sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-documen
# Andy: added paramenter imporant_words in order to do multiplication of score
def get_tf_idf(self,words,word, important_words):
#tf_idf
#words = whole text
#word the word we finding the score for
#return the score
try:
'''
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(words)
df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out())
score = df.iloc[0][''.join(word)]
for k,v in important_words.items():
if k == 'b' and word in v:
score = score * 1.2
elif k == 'h1' and word in v:
score = score * 1.75
elif k == 'h2' and word in v:
score = score * 1.5
elif k == 'h3' and word in v:
score = score * 1.2
elif k == 'title' and word in v:
score = score * 2
return(score)
#print(df)
except KeyError:
return -1
'''
try:
tfidf = TfidfVectorizer(ngram_range=(1,3)) # ngram_range is range of n-values for different n-grams to be extracted (1,3) gets unigrams, bigrams, trigrams
tfidf_matrix = tfidf.fit_transform(words) # fit trains the model, transform creates matrix
df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out()) # store value of matrix to associated word/n-gram
#return(df.iloc[0][''.join(word)]) #used for finding single word in dataset
tfidf_dict = df.to_dict() # transform dataframe to dict *could be expensive the larger the data gets, tested on ~1000 word doc and took 0.002 secs to run
return tfidf_dict # returns the dict of words/n-grams with tf-idf as value
#print(df) # debugging
except:
print("Error in tf_idf!")
return
def get_postings(self,index):
    """Return the postings stored for the term `index` in the merged index.

    The first line of ``merged_index.index`` is a JSON object whose 'index'
    entry holds (term, offset) pairs. The offset is used to seek into
    ``merged_index.full``, where one JSON node per line carries 'postings'.

    Raises KeyError when the term is not in the index of the index.
    """
    # Both files are opened up front and closed on every exit path.
    with open("merged_index.index" ,'r') as merged_index_index, \
            open("merged_index.full",'r') as merged_index:
        data = json.loads(merged_index_index.readline())
        index_index = dict(data['index'])
        to_seek = index_index[index]
        merged_index.seek(to_seek,0)
        data = json.loads(merged_index.readline())
    return data['postings']
def get_data(self):
def set_weight(self):
weight_file = open('docs.weight','w')
jsonStr =json.dumps(self.weight, default=lambda o: o.__dict__,sort_keys=False)
weight_file.write(jsonStr)
weight_file.close()
num_threads = 1
threads = list()
def get_weight(self,doc_id):
    """Return the weight recorded for `doc_id` in ``docs.weight``.

    Reads the first line of the file as a JSON object keyed by document id.
    Raises KeyError when `doc_id` is not present.
    """
    # The context manager closes the handle even when parsing fails.
    with open('docs.weight','r') as weight:
        data = json.loads(weight.readline())
    return data[doc_id]
def get_data_path(self):
for directory in os.listdir(self.path):
for file in os.listdir(self.path + "/" + directory + "/"):
self.data_paths.append("data/DEV/" + directory + "/"+file)
self.num_doc = len(self.data_paths)
def get_next_file(self):
self.data_paths_lock.acquire()
try:
holder = self.data_paths.pop()
self.data_paths_lock.release()
return holder
except IndexError:
self.data_paths_lock.release()
return None
def add_partial_index(self,partial_index):
self.list_partials_lock.acquire()
self.list_partials.append(partial_index)
self.list_partials_lock.release()
#Actual files here
#JSON["url"] = url of crawled page, ignore fragments
#JSON["content"] = actual HTML
#JSON["encoding"] = ENCODING
index = 0
while True:
file_path = self.path + "" + directory + "/"+file
if len(threads) < num_threads:
thread = Worker(self,file_path)
threads.append(thread)
thread.start()
break
else:
if not threads[index].is_alive():
threads[index] = Worker(self,file_path)
threads[index].start()
break
else:
index = index + 1
if(index >= num_threads):
index = 0
time.sleep(.1)
#Found 55770 documents
#
#getting important tokens
def merge(self):
partial_files = list()
partial_index_files = list()
parital_index_indices = list()
num_indices = len(self.list_partials)
#Full Index.Index and Length
full_index = Index()
full_index.index = list()
full_index.length = 0
for partial_index in self.list_partials:
file = open("temp/" + partial_index+'.partial','r')
partial_files.append(file)
index = open("temp/" + partial_index+'.index','r')
partial_index_files.append(index)
for partial_index_file in partial_index_files:
partial_index_file.seek(0,0)
parital_index_indices.append(json.loads(partial_index_file.readline()))
#Start all indexes at 0
for partial_file in partial_files:
partial_file.seek(0,0)
pointers = [0]*num_indices
merged_index = open("merged_index.full",'w')
merged_index_index = open("merged_index.index" ,'w')
while(True):
#Get all values from all indices to find min
value = None
values = list()
for i in range(num_indices):
if pointers[i] < parital_index_indices[i]['length']:
values.append(parital_index_indices[i]['index'][pointers[i]][0])
if(len(values) == 0):
break
value = min(values)
#Get data from the min value of all indices if exists then save to mergedIndex
if value == None:
print("I have crashed some how by not getting min value")
break
node = Node()
node.index_value = value
for i in range(num_indices):
if pointers[i] < parital_index_indices[i]['length'] and parital_index_indices[i]['index'][pointers[i]][0] == value:
to_seek = parital_index_indices[i]['index'][pointers[i]][1]
partial_files[i].seek(to_seek,0)
json_value = partial_files[i].readline()
temp_node = json.loads(json_value)
node.postings = node.postings + temp_node['postings']
pointers[i] = pointers[i] + 1
#Change postings here with tf*idf idf = log (n/df(t))
node.postings.sort(key=lambda y:y['doc_id'])
for posting in node.postings:
posting['tf_idf'] = posting['tf_raw']*math.log(self.num_doc/len(node.postings))
full_index.index.append((value,merged_index.tell()))
full_index.length = full_index.length + 1
jsonStr = json.dumps(node,default=lambda o: o.__dict__,sort_keys=False)
merged_index.write(jsonStr + '\n')
full_index.index.sort(key=lambda y:y[0])
jsonStr =json.dumps(full_index, default=lambda o: o.__dict__,sort_keys=False)
merged_index_index.write(jsonStr)
for partial_index in self.list_partials:
os.remove("temp/" + partial_index+'.partial')
os.remove("temp/" + partial_index+'.index')
merged_index_index.close()
merged_index.close()
def main():
indexer = Indexer(list(),dict(),list())
indexer.get_data_path()
print("We have " + str(len(indexer.data_paths)) + " documents to go through !" )
indexer.start()
indexer.merge()
print("Finished merging into 1 big happy family")
indexer.set_weight()
tic = time.perf_counter()
indexer.get_postings('artifici')
toc = time.perf_counter()
print(f"Took {toc - tic:0.4f} seconds to get postings for artifici")
tic = time.perf_counter()
indexer.get_weight('00ba3af6a00b7cfb4928e5d234342c5dc46b4e31714d4a8f315a2dd4d8e49860')
print(f"Took {toc - tic:0.4f} seconds to get weight for some random page ")
toc = time.perf_counter()
indexer = Indexer(True,0)
indexer.get_data()
if __name__ == "__main__":
main()

View File

@@ -1,16 +1,9 @@
#Posting class for indexer, will probably be more complex as we keep adding crap to it
class Posting():
def __init__(self,doc_id,url,tf_raw,tf_idf,positionals):
self.doc_id = doc_id
def __init__(self,url,tf_idf):
self.url = url
self.tf_raw = tf_raw
self.tf_idf = tf_idf
self.positionals = positionals
def __repr__(self):
return "Doc_id:" + str(self.doc_id) + " URL:" + self.url + " tf_raw:" + str(self.tf_raw) + " tf_idf:" + str(self.tf_idf) + " positionals:" + str(self.positionals)
def __str__(self):
return "Doc_id:" + str(self.doc_id) + " URL:" + self.url + " tf_raw:" + str(self.tf_raw) + " tf_idf:" + str(self.tf_idf) + " positionals:" + str(self.positionals)
def comparator(self):
#Some custom comparator for sorting postings later

111
search.py
View File

@@ -1,111 +0,0 @@
#Data input
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re
#Logging postings
from posting import Posting
from worker import Worker
class Search():
    """Boolean AND retrieval over posting lists sorted by url."""
    # wrote the code for testing in the file searchtesting.py so many of the variables and function calls are wrong.
    def __init__(self):
        self.stemmer = PorterStemmer()
        p = os.path.dirname(os.path.abspath(__file__))
        my_filename = os.path.join(p, "urlID.pkl")
        # NOTE(review): the handle is kept on self.f and never closed here.
        self.f = open(my_filename, "rb+")
        # NOTE(review): pickle.load runs arbitrary code from the file; only
        # load a urlID.pkl that this project produced itself.
        self.id = pickle.load(self.f)

    # takes a list of posting lists returns a list of indexes that correspond to search temp list
    def two_shortest(self, l_posting):
        """Return the positions of the two shortest lists in `l_posting`.

        The shortest comes first; ties go to the earlier position.
        """
        short = []
        location = []
        for postings in l_posting:
            short.append(len(postings))
        for i in range(2):
            x = short.index(min(short))
            location.append(x)
            # Mask the pick so the next round finds the runner-up.
            short[x] = float('inf')
        return location

    def merge(self, list1, list2):
        """Intersect two posting lists that are sorted ascending by ``url``.

        Returns the postings of `list1` whose url also occurs in `list2`.
        """
        merged = []
        i = 0
        j = 0
        # TODO: optimize by having a pointer to the current index+4
        while i < len(list1) and j < len(list2):
            if list1[i].url == list2[j].url:
                merged.append(list1[i])
                i += 1
                j += 1
            elif list1[i].url < list2[j].url:
                # list1 is behind: only its cursor may move.
                i += 1
            else:
                j += 1
        return merged

    # query is a list of stemmed tokens, returns a list of postings (which we'll directly ignore except for the doc id)
    def search(self, query):
        """Print the url of every posting shared by all tokens in `query`."""
        # NOTE(review): get_index is not defined in this file as shown; it
        # must be supplied by the module before search can run.
        temp = []
        for token in query:
            # A term without postings makes the intersection empty.
            temp.append(get_index(token) or [])
        if not temp:
            return
        while len(temp) > 1:
            l = self.two_shortest(temp)
            m = self.merge(temp[l[0]], temp[l[1]])
            # delete from temp the already merged lists, highest position
            # first so the first deletion cannot shift the second
            for position in sorted(l, reverse=True):
                del temp[position]
            temp.append(m)
        for p in temp[0]:
            print(p.url)
        # For now going to do a loop through each query's index and match it with the merged list (can be faster if i implement something during merge/search in order to keep track of the postings)

View File

@@ -1,117 +0,0 @@
import math
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re
class Posting():
def __init__(self, url, rtf, position):
self.url = url
self.rtf = rtf
self.tf = 1
self.tfidf = 0
self.positions = [position]
d = {
'a' : [Posting(0, 1, 1), Posting(2, 1, 1), Posting(3, 1, 1), Posting(8, 1, 1)],
'b' :[Posting(0, 1, 1), Posting(8, 1, 1)],
'c' : [Posting(0, 1, 1), Posting(1, 1, 1), Posting(2, 1, 1), Posting(8, 1, 1)]
}
def get_index(word):
    """Return the posting list stored for `word` in `d`, or None if absent."""
    # Direct key lookup instead of scanning every item of the dict.
    return d.get(word)
# takes a list of posting lists returns a list of indexes that correspond to search temp list
def two_shortest(l_posting):
    """Return the positions of the two shortest lists in `l_posting`.

    The shortest comes first; ties go to the earlier position.
    """
    lengths = [len(postings) for postings in l_posting]
    picked = []
    for _ in range(2):
        smallest = lengths.index(min(lengths))
        picked.append(smallest)
        # Mask the pick so the next round finds the runner-up.
        lengths[smallest] = float('inf')
    return picked
# len(list1) <= len(list2) So the code in this function works with that in mind
def merge(list1, list2):
    """Intersect two posting lists that are sorted ascending by ``url``.

    Returns a list of the postings from `list1` whose url also occurs in
    `list2`. Either list may be empty or longer than the other.
    """
    merged = []
    i = 0
    j = 0
    # TODO: optimize by having a pointer to the current index+4
    while i < len(list1) and j < len(list2):
        if list1[i].url == list2[j].url:
            merged.append(list1[i])
            i += 1
            j += 1
        elif list1[i].url < list2[j].url:
            # list1 is behind: only its cursor may move, or later matches
            # would be skipped.
            i += 1
        else:
            j += 1
    return merged
# query is a list of stemmed tokens, returns a list of postings (which we'll directly ignore except for the doc id)
def search(query):
    """Print the url of every posting shared by all tokens in `query`.

    The two shortest posting lists are intersected repeatedly until a
    single list is left. An empty query prints nothing.
    """
    temp = []
    for token in query:
        # get_index gives None for an unknown token; treat that as "no
        # postings" so the intersection is simply empty.
        temp.append(get_index(token) or [])
    if not temp:
        return
    while len(temp) > 1:
        l = two_shortest(temp)
        m = merge(temp[l[0]], temp[l[1]])
        # delete from temp the already merged lists, highest position first
        # so the first deletion cannot shift the second
        for position in sorted(l, reverse=True):
            del temp[position]
        temp.append(m)
    for p in temp[0]:
        print(p.url)
    # For now going to do a loop through each query's index and match it with the merged list (can be faster if i implement something during merge/search in order to keep track of the postings)
search(["a", "b", "c"])

18
stemmer.py Normal file
View File

@@ -0,0 +1,18 @@
#Multiple implementation of stemming here please
class Stemmer():
def __init__(self,mode, data):
#Different type of stemmer = different modes
self.mode = mode
self.data = data
def stem(self):
#Do stuff here
if(self.mode == 0):
#Do stemmer 1
return #stemmed data
#....
def #name of stemmer 1
def #name of stemmer 2

28
test.py
View File

@@ -1,13 +1,17 @@
from threading import Thread
import json
import os
import shelve
import sys
from bs4 import BeautifulSoup
from time import perf_counter
from nltk.stem import PorterStemmer
import nltk
import time
from posting import Posting
import re
import os
for i in range(99):
word_lower = chr(i % 26 + 97) + chr(i % 26 + 97 + 1)
print(word_lower)
if re.match(r"^[a-d1-1].*",word_lower):
print("SAVE 1")
elif re.match(r"^[e-k2-3].*",word_lower):
print("SAVE 2")
elif re.match(r"^[l-q4-7].*",word_lower):
print("SAVE 3")
elif re.match(r"^[r-z8-9].*",word_lower):
print("SAVE 4")
path = "data/DEV/"
print(os.listdir(path))

View File

@@ -1,116 +0,0 @@
import json
from posting import Posting
import math
import sys
import random
from nltk.corpus import words
random_list = [1,2,3,4,5,6,7,8,9,10]
test_data = words.words()
random.shuffle(test_data)
def random_posting(id):
return Posting(id,random.choice(random_list),random.choice(random_list),[random.choice(random_list),random.choice(random_list),random.choice(random_list),random.choice(random_list),
random.choice(random_list),random.choice(random_list),random.choice(random_list),random.choice(random_list)])
class Node():
index_value = 'Something'
postings = list()
class Index():
length = 0
index = list()
def random_partial_index(name):
part_index = Index()
part_index.index = list()
part_index.length = 0
with open(name +'.partial', 'w') as f:
for i in range(1000):
node1 = Node()
node1.index_value = random.choice(test_data).lower()
node1.postings = list()
for i in range(10):
node1.postings.append(random_posting(i))
jsonStr = json.dumps(node1, default=lambda o: o.__dict__,sort_keys=False)
part_index.index.append((node1.index_value,f.tell()))
f.write(jsonStr + '\n')
part_index.length = part_index.length + 1
part_index.index.sort(key=lambda y:y[0])
jsonStr =json.dumps(part_index, default=lambda o: o.__dict__,sort_keys=False)
with open(name + '.index','w') as f:
f.write(jsonStr)
def merge(partial_indices):
partial_files = list()
partial_index_files = list()
parital_index_indices = list()
merged_index = open("merged_index.full",'w')
num_indices = len(partial_indices)
#Full Index.Index and Length
full_index = Index()
full_index.index = list()
full_index.length = 0
for partial_index in partial_indices:
file = open(partial_index+'.partial','r')
partial_files.append(file)
index = open(partial_index+'.index','r')
partial_index_files.append(index)
for partial_index_file in partial_index_files:
partial_index_file.seek(0,0)
parital_index_indices.append(json.loads(partial_index_file.readline()))
#Start all indexes at 0
for partial_file in partial_files:
partial_file.seek(0,0)
pointers = [0]*num_indices
while(True):
#Get all values from all indices to find min
value = None
values = list()
for i in range(num_indices):
if pointers[i] < parital_index_indices[i]['length']:
values.append(parital_index_indices[i]['index'][pointers[i]][0])
if(len(values) == 0):
break
value = min(values)
#Get data from the min value of all indices if exists then save to mergedIndex
if value == None:
print("I have crashed some how by not getting min value")
break
node = Node()
node.index_value = value
for i in range(num_indices):
if pointers[i] < parital_index_indices[i]['length'] and parital_index_indices[i]['index'][pointers[i]][0] == value:
to_seek = parital_index_indices[i]['index'][pointers[i]][1]
partial_files[i].seek(to_seek,0)
json_value = partial_files[i].readline()
temp_node = json.loads(json_value)
node.postings = node.postings + temp_node['postings']
pointers[i] = pointers[i] + 1
node.postings.sort(key=lambda y:y['doc_id'])
full_index.index.append((value,merged_index.tell()))
full_index.length = full_index.length + 1
jsonStr = json.dumps(node,default=lambda o: o.__dict__,sort_keys=False)
merged_index.write(jsonStr + '\n')
full_index.index.sort(key=lambda y:y[0])
jsonStr =json.dumps(full_index, default=lambda o: o.__dict__,sort_keys=False)
with open("merged_index.index" ,'w') as f:
f.write(jsonStr)

173
worker.py
View File

@@ -1,137 +1,114 @@
from threading import Thread
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import re
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from collections import Counter
from posting import Posting
import math
import sys
class Node():
index_value = ''
postings = list()
class Index():
length = 0
index = list()
class Worker(Thread):
def __init__(self,worker_id,indexer):
def __init__(self,indexer,target):
self.file = target
self.indexer = indexer
self.stemmer = PorterStemmer()
self.worker_id = worker_id
self.num_partial = 0
self.index = dict()
super().__init__(daemon=True)
def dump(self):
part_index = Index()
part_index.length = 0
part_index.index = list()
cur_partial_index_str = "temp/" + str(self.worker_id) + "_" + str(self.num_partial) + '.partial'
cur_partial_index_index_str = "temp/" + str(self.worker_id) + "_" + str(self.num_partial) + '.index'
cur_partial_index = open(cur_partial_index_str,'w')
cur_partial_index_index = open(cur_partial_index_index_str,'w')
for key in self.index:
node = Node()
node.index_value = key
node.postings = self.index[key]
jsonStr = json.dumps(node, default=lambda o: o.__dict__,sort_keys=False)
part_index.index.append((node.index_value,cur_partial_index.tell()))
cur_partial_index.write(jsonStr + '\n')
part_index.length = part_index.length + 1
part_index.index.sort(key=lambda y:y[0])
jsonStr =json.dumps(part_index, default=lambda o: o.__dict__,sort_keys=False)
cur_partial_index_index.write(jsonStr)
self.indexer.add_partial_index(str(self.worker_id) + "_" + str(self.num_partial))
self.num_partial = self.num_partial + 1
self.index.clear()
def run(self):
while True:
target = self.indexer.get_next_file()
if not target:
self.dump()
print("Worker " + str(self.worker_id) + " died")
break
file_load = open(target)
print("Target: " + str(self.file))
ticker = perf_counter()
tic = perf_counter()
file_load = open(self.file)
data = json.load(file_load)
soup = BeautifulSoup(data["content"],features="lxml")
doc_id = target[target.rfind('/')+1:-5]
url = data['url']
print("Worker " + str(self.worker_id) + " working on " + url)
words = word_tokenize(soup.get_text())
toc = perf_counter()
if toc - tic > 1 :
print("Took " + str(toc - tic) + "seconds to tokenize text !")
tokenized_words = list()
stemmed_words = list()
important = {'b' : [], 'h1' : [], 'h2' : [], 'h3' : [], 'title' : []}
for key_words in important.keys():
for i in soup.findAll(key_words):
for word in word_tokenize(i.text):
important[key_words].append(self.stemmer.stem(word))
important[key_words].append(self.indexer.stemmer.stem(word))
# Gets a cleaner version text comparative to soup.get_text()
clean_text = ' '.join(soup.stripped_strings)
# Looks for large white space, tabbed space, and other forms of spacing and removes it
# Regex expression matches for space characters excluding a single space or words
clean_text = re.sub(r'\s[^ \w]', '', clean_text)
# Tokenizes text and joins it back into an entire string. Make sure it is an entire string is essential for get_tf_idf to work as intended
clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
# Stems tokenized text
clean_text = " ".join([self.stemmer.stem(i) for i in clean_text.split()])
# Put clean_text as an element in a list because get_tf_idf workers properly with single element lists
tic = perf_counter()
for word in words:
if word != "" and re.fullmatch('[A-Za-z0-9]+',word):
tokenized_words.append(word)
toc = perf_counter()
if toc - tic > 1 :
print("Took " + str(toc - tic) + "seconds to isalnum text !")
tokens = word_tokenize(clean_text)
tic = perf_counter()
for word in tokenized_words:
stemmed_words.append(self.indexer.stemmer.stem(word))
#counter(count,positionals)
toc = perf_counter()
if toc - tic > 1 :
print("Took " + str(toc - tic) + "seconds to stemmed text !")
counter = dict()
#We calculating tf_raw, and positionals here
for i in range(len(tokens)):
word = tokens[i]
if word in counter:
counter[word][0] = counter[word][0] + 1
counter[word][1].append(i)
else:
counter[word] = [1,list()]
counter[word][1].append(i)
"""
tfidf = TfidfVectorizer(ngram_range=(1,3)) # ngram_range is range of n-values for different n-grams to be extracted (1,3) gets unigrams, bigrams, trigrams
tfidf_matrix = tfidf.fit_transform(stemmed_words) # fit trains the model, transform creates matrix
#df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out()) # store value of matrix to associated word/n-gram
tfidf.sget_feature_names_out()
#tf_idf_dict = df.to_dict() # transform dataframe to dict *could be expensive the larger the data gets, tested on ~1000 word doc and took 0.002 secs to run
doc_length = len(tokens)
total = 0
for index in counter:
tf = counter[index][0]/doc_length
log_tf = 1 + math.log(tf)
total = total + log_tf * log_tf
if index in self.index:
postings = self.index[index]
postings.append(Posting(doc_id,url,tf,0,counter[index][1]))
else:
self.index[index] = list()
self.index[index].append(Posting(doc_id,url,tf,0,counter[index][1]))
self.index[index].sort(key=lambda y:y.doc_id)
print(tfidf_matrix)
"""
self.indexer.weight[doc_id] = math.sqrt(total)
#10 Megabytes index (in Ram approx)
if sys.getsizeof(self.index) > 1000000:
self.dump()
tfIdfVectorizer=TfidfVectorizer(use_idf=True)
tfIdf = tfIdfVectorizer.fit_transform(stemmed_words)
df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names_out(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
print(df.head(25))
for word in tf_idf_dict.keys():
tic = perf_counter()
print(tf_idf_dict)
weight = 1.0
for k,v in important.items():
if k == 'b' and word in v:
weight = 1.2
elif k == 'h1' and word in v:
weight = 1.75
elif k == 'h2' and word in v:
weight = 1.5
elif k == 'h3' and word in v:
weight = 1.2
elif k == 'title' and word in v:
weight = 2
posting = Posting(data["url"],tf_idf_dict[word]*weight)
toc = perf_counter()
if toc - tic > 1 :
print("Took " + str(toc - tic) + "seconds to tf_idf text !")
tic = perf_counter()
self.indexer.save_index(word,posting)
toc = perf_counter()
if toc - tic > 1 :
print("Took " + str(toc - tic) + "seconds to save text !")
tocker = perf_counter()
print("Finished " + data['url'] + "\n" + str(tocker-ticker))