1 Commit

Author: Lacerum
SHA1: f5610eaa62
Message: tf-idf ngrams and now returns dict rather than score
Date: 2022-05-11 14:46:32 -07:00

14 changed files with 180 additions and 765 deletions

.gitignore (vendored, 2 changes)

@@ -1,5 +1,3 @@
 /data/
 *.shelve
 /__pycache__/
-/test/
-merged*


@@ -1,8 +0,0 @@
### To create index:
1. Make sure that all requirements are installed; check `requirements.txt` and install with `pip install -r requirements.txt`.
2. Run `python indexer.py` to build the index. This may take some time to run.
3. The index is now created.
### Start search interface:
Run `python launcher.py` to start the search interface.
### Perform query:
To perform a search, simply enter a query in the textbox and click Search. The top results will be displayed.


@@ -1,52 +0,0 @@
### Bad:
- computer science - common
- university of california irvine - common
- donald bren - common
- uci - common
- informatics - common
- The Donald Bren School of Information and Computer Sciences - long and common
- toilet - not likely to be found easily
- perfume - not likely to be found
- SPY×FAMILY - should not exist in the data
- undergraduate - likely to be on tons of pages
### Good to Meh:
- liquids in labs - an uncommon word paired with a common one
- Alberto Krone-Martins - should have a good amount of results but not an absurd number
- Advising & Planning - should be specific but not too common
- Honors Program - ^
- Papaefthymiou - similar to the Krone-Martins query
- General information - there should be quite a few pages with this, but not tons
- Prerequisite Clearing System - has some common and uncommon terms
- Recruiting - not overly common
- counseling - ^ and should only be on a subset of pages
- social justice - specific terms that should appear without being costly
### Others tested:
- masters of computer science - not super common but will have a good number of pages
- thornton ics46 notes - name + class + common term
- Theory of Computation - two terms which have a high count in papers
- facility distribution - two terms which don't really make sense together
- artificial intelligence history - two common terms with a semi-common one
- prospective alumni - should have very few instances of both terms, but they should be found together
- enrollment window - should be on only a couple of pages
- available capstone sponsorship - ^
- spring seminars - common term with one that may be somewhat restricted
- hackuci - two terms joined into one that exists in the dataset
- ucinetid help - specific term with a common one
- course restrictions - specific pages
- project management - a course name
- yelan research - a term that should not exist + a common one
- hybrid-learning - common phrase
- genshin is a computer game - contains terms that exist and others that don't
- computable AI machine learning big data - a sentence of CS buzzwords (really, really common)
- Publications & Technical Reports - in a json file
- Tutor coordinators - in many json files (bold, title, and body)
- Death Image Service - in some weird areas
- send anonymous email - only in some
### Things done for improvement
1. Created an index of the index for a substantial gain in efficiency and speed.
2. Split TF-IDF into TF and IDF so each can be calculated on its own when the full computation isn't needed; this also removes the reliance on an external library for TF-IDF (see the sketch below).
3. Switched from using IDF & weight to TF & weight to help with the overall weighting.
4. Dropped indexing and searching of unigrams, bigrams, and trigrams.
5. Added the length of each document during indexing for improved speed in the normalization calculation.
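
A minimal sketch of the TF/IDF split described above. The function names are illustrative, not the project's actual code; the 55,770 document count comes from the code comments, and the document frequency is made up:

import math

def tf_raw(tokens, term):
    # term frequency on its own, no external TF-IDF library needed
    return tokens.count(term) / len(tokens)

def idf(num_docs, doc_freq):
    # inverse document frequency, computed separately and only when needed
    return math.log(num_docs / doc_freq)

def tf_idf(tokens, term, num_docs, doc_freq):
    # the full score is only assembled when both parts are actually required
    return tf_raw(tokens, term) * idf(num_docs, doc_freq)

print(tf_idf(["uci", "computer", "science"], "uci", 55770, 1200))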

File diff suppressed because one or more lines are too long


@@ -1,30 +0,0 @@
# You can ignore this file. This was for testing purposes
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import requests
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import numpy as np
path_to_script = os.path.dirname(os.path.abspath(__file__))
my_filename = os.path.join(path_to_script, "testfile.json")
url = "https://www.crummy.com/software/BeautifulSoup/bs4/doc/"
req = requests.get(url)
file = open('D:/Visual Studio Workspace/CS121/assignment3/Search_Engine/testfile.json')
content = json.load(file)
soup = BeautifulSoup(content["content"], 'lxml')
bold = []
#print(soup.prettify())
print(soup.findAll('h3'))
for i in soup.findAll('title'):
    print(word_tokenize(i.text))
print(bold)


@@ -15,10 +15,7 @@ import os
 import shelve
 from bs4 import BeautifulSoup
 from time import perf_counter
-import time
-import threading
-from threading import Lock
-import math
 #Data process
@@ -32,198 +29,176 @@ import re
 #Logging postings
 from posting import Posting
-from worker import Worker
-class Node():
-    index_value = ''
-    postings = list()
-class Index():
-    length = 0
-    index = list()
 class Indexer():
-    def __init__(self,list_partials,weight,data_paths,worker_factory=Worker):
+    def __init__(self,restart,trimming):
         #Config stuffs
-        self.path = "data/DEV"
-        self.num_doc = 0
-        self.list_partials = list_partials
-        self.weight = weight
-        self.data_paths = data_paths
+        self.path = "data/DEV/"
+        self.restart = restart
+        self.trimming = trimming
         self.stemmer = PorterStemmer()
-        self.data_paths_lock = Lock()
-        self.list_partials_lock = Lock()
-        self.workers = list()
-        self.worker_factory = worker_factory
-    def start_async(self):
-        self.workers = [
-            self.worker_factory(worker_id,self)
-            for worker_id in range(8)]
-        for worker in self.workers:
-            worker.start()
-    def start(self):
-        self.start_async()
-        self.join()
-    def join(self):
-        for worker in self.workers:
-            worker.join()
-    def get_postings(self,index):
-        merged_index_index = open("merged_index.index" ,'r')
-        merged_index = open("merged_index.full",'r')
-        merged_index_index.seek(0,0)
-        json_value = merged_index_index.readline()
-        data = json.loads(json_value)
-        index_index = dict(data['index'])
-        to_seek = index_index[index]
-        merged_index.seek(to_seek,0)
-        json_value = merged_index.readline()
-        data = json.loads(json_value)
-        return data['postings']
-    def set_weight(self):
-        weight_file = open('docs.weight','w')
-        jsonStr =json.dumps(self.weight, default=lambda o: o.__dict__,sort_keys=False)
-        weight_file.write(jsonStr)
-        weight_file.close()
-    def get_weight(self,doc_id):
-        weight = open('docs.weight','r')
-        weight.seek(0,0)
-        json_value = weight.readline()
-        data = json.loads(json_value)
-        return data[doc_id]
-    def get_data_path(self):
+        #Shelves for index
+        #https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html
+        #https://www.irishtimes.com/news/science/how-many-numbers-begin-with-a-1-more-than-30-per-cent-1.4162466
+        #According to this will be how we split things
+        #Save #1 = ABCD + (1) ~ 18.3% of words
+        #Save #2 = EFGHIJK + (2-3)~ 27.1% of words
+        #Save #3 = LMNOPQ + (4-7) ~ 25.4% of words
+        #Save #4 = RSTUVWXYZ + (8-9)~ 29.2% of words
+        #Save #5 = Special characters
+        if os.path.exists("save_1.shelve") and restart:
+            os.remove("save_1.shelve")
+        if os.path.exists("save_2.shelve") and restart:
+            os.remove("save_2.shelve")
+        if os.path.exists("save_3.shelve") and restart:
+            os.remove("save_3.shelve")
+        if os.path.exists("save_4.shelve") and restart:
+            os.remove("save_4.shelve")
+        if os.path.exists("save_5.shelve") and restart:
+            os.remove("save_5.shelve")
+        self.save_1 = shelve.open("save_1.shelve")
+        self.save_2 = shelve.open("save_2.shelve")
+        self.save_3 = shelve.open("save_3.shelve")
+        self.save_4 = shelve.open("save_4.shelve")
+        self.save_5 = shelve.open("save_5.shelve")
+    def save_index(self,word,posting):
+        cur_save = self.get_save_file(word)
+        shelve_list = list()
+        try:
+            shelve_list = cur_save[word]
+            shelve_list.append(posting)
+            tic = perf_counter()
+            shelve_list.sort(key=lambda x: x.tf_idf, reverse = True)
+            toc = perf_counter()
+            if toc - tic > 1 :
+                print("Took " + str(toc - tic) + "seconds to sort shelve list !")
+            cur_save.sync()
+        except:
+            shelve_list.append(posting)
+            cur_save[word] = shelve_list
+            cur_save.sync()
+    def get_save_file(self,word):
+        #return the correct save depending on the starting letter of word
+        word_lower = word.lower()
+        if re.match(r"^[a-d0-1].*",word_lower):
+            return self.save_1
+        elif re.match(r"^[e-k2-3].*",word_lower):
+            return self.save_2
+        elif re.match(r"^[l-q4-7].*",word_lower):
+            return self.save_3
+        elif re.match(r"^[r-z8-9].*",word_lower):
+            return self.save_4
+        else:
+            print(word)
+            print("You have somehow went beyond the magic")
+            return self.save_5
+    # retuns a dict of words/n-grams with their assosiated tf-idf score *can also return just a single score or a pandas dataframe
+    # https://stackoverflow.com/questions/34449127/sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-documen
+    def get_tf_idf(self,words,word):
+        #tf_idf
+        #words = whole text
+        #word the word we finding the score for
+        #return the score
+        try:
+            tfidf = TfidfVectorizer(ngram_range=(1,3)) # ngram_range is range of n-values for different n-grams to be extracted (1,3) gets unigrams, bigrams, trigrams
+            tfidf_matrix = tfidf.fit_transform(words) # fit trains the model, transform creates matrix
+            df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out()) # store value of matrix to associated word/n-gram
+            #return(df.iloc[0][''.join(word)]) #used for finding single word in dataset
+            data = df.to_dict() # transform dataframe to dict *could be expensive the larger the data gets, tested on ~1000 word doc and took 0.002 secs to run
+            return data # returns the dict of words/n-grams with tf-idf
+            #print(df) # debugging
+        except:
+            print("Error in tf_idf!")
+            return
+    def get_data(self):
         for directory in os.listdir(self.path):
             for file in os.listdir(self.path + "/" + directory + "/"):
-                self.data_paths.append("data/DEV/" + directory + "/"+file)
-        self.num_doc = len(self.data_paths)
-    def get_next_file(self):
-        self.data_paths_lock.acquire()
-        try:
-            holder = self.data_paths.pop()
-            self.data_paths_lock.release()
-            return holder
-        except IndexError:
-            self.data_paths_lock.release()
-            return None
-    def add_partial_index(self,partial_index):
-        self.list_partials_lock.acquire()
-        self.list_partials.append(partial_index)
-        self.list_partials_lock.release()
-    #YOUR CODE HERE
-    #Found 55770 documents
-    #
-    #getting important tokens
-    #stemming,
-    #tf_idf
-    #get_tf_idf(stemmed_words,word)
-    #post = Posting()
-    def merge(self):
-        partial_files = list()
-        partial_index_files = list()
-        parital_index_indices = list()
-        num_indices = len(self.list_partials)
-        #Full Index.Index and Length
-        full_index = Index()
-        full_index.index = list()
-        full_index.length = 0
-        for partial_index in self.list_partials:
-            file = open("temp/" + partial_index+'.partial','r')
-            partial_files.append(file)
-            index = open("temp/" + partial_index+'.index','r')
-            partial_index_files.append(index)
-        for partial_index_file in partial_index_files:
-            partial_index_file.seek(0,0)
-            parital_index_indices.append(json.loads(partial_index_file.readline()))
-        #Start all indexes at 0
-        for partial_file in partial_files:
-            partial_file.seek(0,0)
-        pointers = [0]*num_indices
-        merged_index = open("merged_index.full",'w')
-        merged_index_index = open("merged_index.index" ,'w')
-        while(True):
-            #Get all values from all indices to find min
-            value = None
-            values = list()
-            for i in range(num_indices):
-                if pointers[i] < parital_index_indices[i]['length']:
-                    values.append(parital_index_indices[i]['index'][pointers[i]][0])
-            if(len(values) == 0):
-                break
-            value = min(values)
-            #Get data from the min value of all indices if exists then save to mergedIndex
-            if value == None:
-                print("I have crashed some how by not getting min value")
-                break
-            node = Node()
-            node.index_value = value
-            for i in range(num_indices):
-                if pointers[i] < parital_index_indices[i]['length'] and parital_index_indices[i]['index'][pointers[i]][0] == value:
-                    to_seek = parital_index_indices[i]['index'][pointers[i]][1]
-                    partial_files[i].seek(to_seek,0)
-                    json_value = partial_files[i].readline()
-                    temp_node = json.loads(json_value)
-                    node.postings = node.postings + temp_node['postings']
-                    pointers[i] = pointers[i] + 1
-            #Change postings here with tf*idf idf = log (n/df(t))
-            node.postings.sort(key=lambda y:y['doc_id'])
-            for posting in node.postings:
-                posting['tf_idf'] = posting['tf_raw']*math.log(self.num_doc/len(node.postings))
-            full_index.index.append((value,merged_index.tell()))
-            full_index.length = full_index.length + 1
-            jsonStr = json.dumps(node,default=lambda o: o.__dict__,sort_keys=False)
-            merged_index.write(jsonStr + '\n')
-        full_index.index.sort(key=lambda y:y[0])
-        jsonStr =json.dumps(full_index, default=lambda o: o.__dict__,sort_keys=False)
-        merged_index_index.write(jsonStr)
-        for partial_index in self.list_partials:
-            os.remove("temp/" + partial_index+'.partial')
-            os.remove("temp/" + partial_index+'.index')
-        merged_index_index.close()
-        merged_index.close()
+                #Actual files here
+                #JSON["url"] = url of crawled page, ignore fragments
+                #JSON["content"] = actual HTML
+                #JSON["encoding"] = ENCODING
+                ticker = perf_counter()
+                tic = perf_counter()
+                file_load = open(self.path + "/" + directory + "/"+file)
+                data = json.load(file_load)
+                soup = BeautifulSoup(data["content"],from_encoding=data["encoding"])
+                words = word_tokenize(soup.get_text())
+                toc = perf_counter()
+                if toc - tic > 1 :
+                    print("Took " + str(toc - tic) + "seconds to tokenize text !")
+                tokenized_words = list()
+                stemmed_words = list()
+                tic = perf_counter()
+                for word in words:
+                    if word != "" and re.fullmatch('[A-Za-z0-9]+',word):
+                        #So all the tokenized words are here,
+                        tokenized_words.append(word)
+                toc = perf_counter()
+                if toc - tic > 1 :
+                    print("Took " + str(toc - tic) + "seconds to isalnum text !")
+                tic = perf_counter()
+                for word in tokenized_words:
+                    stemmed_words.append(self.stemmer.stem(word))
+                toc = perf_counter()
+                if toc - tic > 1 :
+                    print("Took " + str(toc - tic) + "seconds to stemmed text !")
+                for word in stemmed_words:
+                    #posting = Posting(data["url"],self.get_tf_idf(list(' '.join(stemmed_words)),word))
+                    tic = perf_counter()
+                    posting = Posting(data["url"],self.tf_idf_raw(stemmed_words,word))
+                    toc = perf_counter()
+                    if toc - tic > 1 :
+                        print("Took " + str(toc - tic) + "seconds to tf_idf text !")
+                    tic = perf_counter()
+                    self.save_index(word,posting)
+                    toc = perf_counter()
+                    if toc - tic > 1 :
+                        print("Took " + str(toc - tic) + "seconds to save text !")
+                tocker = perf_counter()
+                print("Finished " + data['url'] + " in \t " + str(tocker-ticker))
+    def tf_idf_raw(self,words,word):
+        tf_times = words.count(word)
+        tf = tf_times/len(words)
+        return tf
 def main():
-    indexer = Indexer(list(),dict(),list())
-    indexer.get_data_path()
-    print("We have " + str(len(indexer.data_paths)) + " documents to go through !" )
-    indexer.start()
-    indexer.merge()
-    print("Finished merging into 1 big happy family")
-    indexer.set_weight()
-    tic = time.perf_counter()
-    indexer.get_postings('artifici')
-    toc = time.perf_counter()
-    print(f"Took {toc - tic:0.4f} seconds to get postings for artifici")
-    tic = time.perf_counter()
-    indexer.get_weight('00ba3af6a00b7cfb4928e5d234342c5dc46b4e31714d4a8f315a2dd4d8e49860')
-    print(f"Took {toc - tic:0.4f} seconds to get weight for some random page ")
-    toc = time.perf_counter()
+    indexer = Indexer(True,0)
+    indexer.get_data()
 if __name__ == "__main__":
     main()
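
The get_tf_idf added above leans on scikit-learn's TfidfVectorizer with ngram_range=(1,3) and hands back a dict instead of a single score. A small standalone sketch of the same idea, with made-up sample documents:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["computer science at uci", "uci computer science courses"]  # made-up input

tfidf = TfidfVectorizer(ngram_range=(1, 3))  # unigrams, bigrams, and trigrams
tfidf_matrix = tfidf.fit_transform(docs)     # fit learns the vocabulary, transform builds the matrix
df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())
scores = df.to_dict()                        # {ngram: {doc_index: tf-idf score}}
print(scores["computer science"])            # one bigram's scores across both documents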


@@ -1,16 +1,9 @@
 #Posting class for indexer, will probably be more complex as we keep adding crap to it
 class Posting():
-    def __init__(self,doc_id,url,tf_raw,tf_idf,positionals):
-        self.doc_id = doc_id
+    def __init__(self,url,tf_idf):
         self.url = url
-        self.tf_raw = tf_raw
         self.tf_idf = tf_idf
-        self.positionals = positionals
-    def __repr__(self):
-        return "Doc_id:" + str(self.doc_id) + " URL:" + self.url + " tf_raw:" + str(self.tf_raw) + " tf_idf:" + str(self.tf_idf) + " positionals:" + str(self.positionals)
-    def __str__(self):
-        return "Doc_id:" + str(self.doc_id) + " URL:" + self.url + " tf_raw:" + str(self.tf_raw) + " tf_idf:" + str(self.tf_idf) + " positionals:" + str(self.positionals)
     def comparator(self):
         #Some custom comparator for sorting postings later

search.py (111 changes)

@@ -1,111 +0,0 @@
#Data input
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re
#Logging postings
from posting import Posting
from worker import Worker
class Search():
# wrote the code for testing in the file searchtesting.py so many of the variables and function calls are wrong.
def __init__(self):
self.stemmer = PorterStemmer()
p = os.path.dirname(os.path.abspath(__file__))
my_filename = os.path.join(p, "urlID.pkl")
self.f = open(my_filename, "rb+")
self.id = pickle.load(self.f)
# takes a list of posting lists returns a list of indexes that correspond to search temp list
def two_shortest(self, l_posting):
short = []
location = []
for postings in l_posting:
short.append(len(postings))
for i in range(2):
x = short.index(min(short))
location.append(x)
short[x] = float('inf')
return location
# len(list1) <= len(list2) So the code in this function works with that in mind
def merge(self, list1, list2):
merged = []
i = 0
j = 0
# TODO: optimize by having a pointer to the current index+4
while i < len(list1) or j < len(list2):
if j == len(list2):
break
if i == len(list1):
break
# Since list1 is shorter it will hit its max index sooner,
# so in the cases were it does we still need to go through list2 to see if the last element of list1 appears anywhere in the rest of list2
if i == len(list1)-1:
if list1[i].url == list2[j].url:
merged.append(list1[i])
j += 1
i += 1
elif list1[i].url < list2[j].url:
break
else:
j += 1
else:
if list1[i].url == list2[j].url:
merged.append(list1[i])
i += 1
j += 1
elif list1[i].url < list2[j].url:
break
else:
i += 1
j += 1
return merged
# query is a list of stemmed tokens, returns a list of postings (which we'll directly ignore except for the doc id)
def search(self, query):
temp = []
for token in query:
temp.append(get_index(token))
l = two_shortest(temp)
m = merge(temp[l[0]], temp[l[1]])
while len(temp) > 1:
# delete from temp the already merged lists
del temp[l[0]]
del temp[l[1]]
temp.append(m)
l = two_shortest(temp)
m = merge(temp[l[0]], temp[l[1]])
for p in m:
print(p.url)
# For now going to do a loop through each query's index and match it with the merged list (can be faster if i implement something during merge/search in order to keep track of the postings)


@@ -1,117 +0,0 @@
import math
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re
class Posting():
def __init__(self, url, rtf, position):
self.url = url
self.rtf = rtf
self.tf = 1
self.tfidf = 0
self.positions = [position]
d = {
'a' : [Posting(0, 1, 1), Posting(2, 1, 1), Posting(3, 1, 1), Posting(8, 1, 1)],
'b' :[Posting(0, 1, 1), Posting(8, 1, 1)],
'c' : [Posting(0, 1, 1), Posting(1, 1, 1), Posting(2, 1, 1), Posting(8, 1, 1)]
}
def get_index(word):
for k, v in d.items():
if k == word:
return v
# takes a list of posting lists returns a list of indexes that correspond to search temp list
def two_shortest(l_posting):
short = []
location = []
for postings in l_posting:
short.append(len(postings))
for i in range(2):
x = short.index(min(short))
location.append(x)
short[x] = float('inf')
return location
# len(list1) <= len(list2) So the code in this function works with that in mind
def merge(list1, list2):
merged = []
i = 0
j = 0
# TODO: optimize by having a pointer to the current index+4
while i < len(list1) or j < len(list2):
if j == len(list2):
break
if i == len(list1):
break
# Since list1 is shorter it will hit its max index sooner,
# so in the cases were it does we still need to go through list2 to see if the last element of list1 appears anywhere in the rest of list2
if i == len(list1)-1:
if list1[i].url == list2[j].url:
merged.append(list1[i])
j += 1
i += 1
elif list1[i].url < list2[j].url:
break
else:
j += 1
else:
if list1[i].url == list2[j].url:
merged.append(list1[i])
i += 1
j += 1
elif list1[i].url < list2[j].url:
break
else:
i += 1
j += 1
return merged,
# query is a list of stemmed tokens, returns a list of postings (which we'll directly ignore except for the doc id)
def search(query):
temp = []
for token in query:
temp.append(get_index(token))
l = two_shortest(temp)
m = merge(temp[l[0]], temp[l[1]])
while len(temp) > 1:
# delete from temp the already merged lists
del temp[l[0]]
del temp[l[1]]
temp.append(m)
l = two_shortest(temp)
m = merge(temp[l[0]], temp[l[1]])
for p in m:
print(p.url)
# For now going to do a loop through each query's index and match it with the merged list (can be faster if i implement something during merge/search in order to keep track of the postings)
search(["a", "b", "c"])

stemmer.py (new file, 18 changes)

@@ -0,0 +1,18 @@
#Multiple implementation of stemming here please
class Stemmer():
    def __init__(self, mode, data):
        #Different type of stemmer = different modes
        self.mode = mode
        self.data = data

    def stem(self):
        #Do stuff here
        if self.mode == 0:
            #Do stemmer 1
            return #stemmed data
        #....

    #def stemmer_1(self):  #name of stemmer 1
    #def stemmer_2(self):  #name of stemmer 2
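
One way the mode switch above could eventually dispatch to concrete stemmers, using NLTK's Porter and Snowball stemmers as assumed backends (this is a sketch, not the committed file):

from nltk.stem import PorterStemmer, SnowballStemmer

class SketchStemmer:
    def __init__(self, mode, data):
        self.mode = mode
        self.data = data  # list of tokens to stem

    def stem(self):
        # mode 0 -> Porter, anything else -> Snowball; both backends are assumptions
        stemmer = PorterStemmer() if self.mode == 0 else SnowballStemmer("english")
        return [stemmer.stem(token) for token in self.data]

print(SketchStemmer(0, ["running", "indexes"]).stem())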

test.py (28 changes)

@@ -1,13 +1,17 @@
-from threading import Thread
-import json
-import os
-import shelve
-import sys
-from bs4 import BeautifulSoup
-from time import perf_counter
-from nltk.stem import PorterStemmer
-import nltk
-import time
-from posting import Posting
 import re
+import os
+for i in range(99):
+    word_lower = chr(i % 26 + 97) + chr(i % 26 + 97 + 1)
+    print(word_lower)
+    if re.match(r"^[a-d1-1].*",word_lower):
+        print("SAVE 1")
+    elif re.match(r"^[e-k2-3].*",word_lower):
+        print("SAVE 2")
+    elif re.match(r"^[l-q4-7].*",word_lower):
+        print("SAVE 3")
+    elif re.match(r"^[r-z8-9].*",word_lower):
+        print("SAVE 4")
+path = "data/DEV/"
+print(os.listdir(path))
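
The regex buckets exercised above mirror the indexer's get_save_file(). A compact sketch of the same first-character partitioning, returning bucket labels instead of open shelve files:

import re

def pick_bucket(word):
    word_lower = word.lower()
    if re.match(r"^[a-d0-1]", word_lower):
        return "save_1"
    elif re.match(r"^[e-k2-3]", word_lower):
        return "save_2"
    elif re.match(r"^[l-q4-7]", word_lower):
        return "save_3"
    elif re.match(r"^[r-z8-9]", word_lower):
        return "save_4"
    return "save_5"  # everything else, e.g. special characters

print(pick_bucket("computer"), pick_bucket("zebra"), pick_bucket("!!"))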


@@ -1,116 +0,0 @@
import json
from posting import Posting
import math
import sys
import random
from nltk.corpus import words
random_list = [1,2,3,4,5,6,7,8,9,10]
test_data = words.words()
random.shuffle(test_data)
def random_posting(id):
return Posting(id,random.choice(random_list),random.choice(random_list),[random.choice(random_list),random.choice(random_list),random.choice(random_list),random.choice(random_list),
random.choice(random_list),random.choice(random_list),random.choice(random_list),random.choice(random_list)])
class Node():
index_value = 'Something'
postings = list()
class Index():
length = 0
index = list()
def random_partial_index(name):
part_index = Index()
part_index.index = list()
part_index.length = 0
with open(name +'.partial', 'w') as f:
for i in range(1000):
node1 = Node()
node1.index_value = random.choice(test_data).lower()
node1.postings = list()
for i in range(10):
node1.postings.append(random_posting(i))
jsonStr = json.dumps(node1, default=lambda o: o.__dict__,sort_keys=False)
part_index.index.append((node1.index_value,f.tell()))
f.write(jsonStr + '\n')
part_index.length = part_index.length + 1
part_index.index.sort(key=lambda y:y[0])
jsonStr =json.dumps(part_index, default=lambda o: o.__dict__,sort_keys=False)
with open(name + '.index','w') as f:
f.write(jsonStr)
def merge(partial_indices):
partial_files = list()
partial_index_files = list()
parital_index_indices = list()
merged_index = open("merged_index.full",'w')
num_indices = len(partial_indices)
#Full Index.Index and Length
full_index = Index()
full_index.index = list()
full_index.length = 0
for partial_index in partial_indices:
file = open(partial_index+'.partial','r')
partial_files.append(file)
index = open(partial_index+'.index','r')
partial_index_files.append(index)
for partial_index_file in partial_index_files:
partial_index_file.seek(0,0)
parital_index_indices.append(json.loads(partial_index_file.readline()))
#Start all indexes at 0
for partial_file in partial_files:
partial_file.seek(0,0)
pointers = [0]*num_indices
while(True):
#Get all values from all indices to find min
value = None
values = list()
for i in range(num_indices):
if pointers[i] < parital_index_indices[i]['length']:
values.append(parital_index_indices[i]['index'][pointers[i]][0])
if(len(values) == 0):
break
value = min(values)
#Get data from the min value of all indices if exists then save to mergedIndex
if value == None:
print("I have crashed some how by not getting min value")
break
node = Node()
node.index_value = value
for i in range(num_indices):
if pointers[i] < parital_index_indices[i]['length'] and parital_index_indices[i]['index'][pointers[i]][0] == value:
to_seek = parital_index_indices[i]['index'][pointers[i]][1]
partial_files[i].seek(to_seek,0)
json_value = partial_files[i].readline()
temp_node = json.loads(json_value)
node.postings = node.postings + temp_node['postings']
pointers[i] = pointers[i] + 1
node.postings.sort(key=lambda y:y['doc_id'])
full_index.index.append((value,merged_index.tell()))
full_index.length = full_index.length + 1
jsonStr = json.dumps(node,default=lambda o: o.__dict__,sort_keys=False)
merged_index.write(jsonStr + '\n')
full_index.index.sort(key=lambda y:y[0])
jsonStr =json.dumps(full_index, default=lambda o: o.__dict__,sort_keys=False)
with open("merged_index.index" ,'w') as f:
f.write(jsonStr)
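
The loop above performs a k-way merge by repeatedly taking the minimum term across all partial-index pointers. The same idea in a compact form with heapq.merge over already-sorted (term, postings) pairs; the sample data is made up, and the real script streams JSON lines from the .partial files:

import heapq
from itertools import groupby

partial_a = [("apple", [1, 4]), ("zebra", [2])]
partial_b = [("apple", [3]), ("kiwi", [5])]

merged = []
stream = heapq.merge(partial_a, partial_b, key=lambda pair: pair[0])
for term, group in groupby(stream, key=lambda pair: pair[0]):
    postings = []
    for _, plist in group:
        postings.extend(plist)  # combine postings for the same term
    merged.append((term, sorted(postings)))

print(merged)  # [('apple', [1, 3, 4]), ('kiwi', [5]), ('zebra', [2])]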

File diff suppressed because one or more lines are too long

worker.py (137 changes)

@@ -1,137 +0,0 @@
from threading import Thread
import json
import os
from bs4 import BeautifulSoup
import re
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from posting import Posting
import math
import sys
class Node():
index_value = ''
postings = list()
class Index():
length = 0
index = list()
class Worker(Thread):
def __init__(self,worker_id,indexer):
self.indexer = indexer
self.stemmer = PorterStemmer()
self.worker_id = worker_id
self.num_partial = 0
self.index = dict()
super().__init__(daemon=True)
def dump(self):
part_index = Index()
part_index.length = 0
part_index.index = list()
cur_partial_index_str = "temp/" + str(self.worker_id) + "_" + str(self.num_partial) + '.partial'
cur_partial_index_index_str = "temp/" + str(self.worker_id) + "_" + str(self.num_partial) + '.index'
cur_partial_index = open(cur_partial_index_str,'w')
cur_partial_index_index = open(cur_partial_index_index_str,'w')
for key in self.index:
node = Node()
node.index_value = key
node.postings = self.index[key]
jsonStr = json.dumps(node, default=lambda o: o.__dict__,sort_keys=False)
part_index.index.append((node.index_value,cur_partial_index.tell()))
cur_partial_index.write(jsonStr + '\n')
part_index.length = part_index.length + 1
part_index.index.sort(key=lambda y:y[0])
jsonStr =json.dumps(part_index, default=lambda o: o.__dict__,sort_keys=False)
cur_partial_index_index.write(jsonStr)
self.indexer.add_partial_index(str(self.worker_id) + "_" + str(self.num_partial))
self.num_partial = self.num_partial + 1
self.index.clear()
def run(self):
while True:
target = self.indexer.get_next_file()
if not target:
self.dump()
print("Worker " + str(self.worker_id) + " died")
break
file_load = open(target)
data = json.load(file_load)
soup = BeautifulSoup(data["content"],features="lxml")
doc_id = target[target.rfind('/')+1:-5]
url = data['url']
print("Worker " + str(self.worker_id) + " working on " + url)
important = {'b' : [], 'h1' : [], 'h2' : [], 'h3' : [], 'title' : []}
for key_words in important.keys():
for i in soup.findAll(key_words):
for word in word_tokenize(i.text):
important[key_words].append(self.stemmer.stem(word))
# Gets a cleaner version text comparative to soup.get_text()
clean_text = ' '.join(soup.stripped_strings)
# Looks for large white space, tabbed space, and other forms of spacing and removes it
# Regex expression matches for space characters excluding a single space or words
clean_text = re.sub(r'\s[^ \w]', '', clean_text)
# Tokenizes text and joins it back into an entire string. Make sure it is an entire string is essential for get_tf_idf to work as intended
clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
# Stems tokenized text
clean_text = " ".join([self.stemmer.stem(i) for i in clean_text.split()])
# Put clean_text as an element in a list because get_tf_idf workers properly with single element lists
tokens = word_tokenize(clean_text)
#counter(count,positionals)
counter = dict()
#We calculating tf_raw, and positionals here
for i in range(len(tokens)):
word = tokens[i]
if word in counter:
counter[word][0] = counter[word][0] + 1
counter[word][1].append(i)
else:
counter[word] = [1,list()]
counter[word][1].append(i)
doc_length = len(tokens)
total = 0
for index in counter:
tf = counter[index][0]/doc_length
log_tf = 1 + math.log(tf)
total = total + log_tf * log_tf
if index in self.index:
postings = self.index[index]
postings.append(Posting(doc_id,url,tf,0,counter[index][1]))
else:
self.index[index] = list()
self.index[index].append(Posting(doc_id,url,tf,0,counter[index][1]))
self.index[index].sort(key=lambda y:y.doc_id)
self.indexer.weight[doc_id] = math.sqrt(total)
#10 Megabytes index (in Ram approx)
if sys.getsizeof(self.index) > 1000000:
self.dump()
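
The per-document weight stored at the end of run() above is the Euclidean length of the (1 + log tf) term vector, kept so scores can be length-normalized later. A condensed sketch of that calculation with a made-up token list:

import math
from collections import Counter

tokens = ["uci", "computer", "science", "computer"]  # made-up stemmed tokens
counts = Counter(tokens)

total = 0.0
for term, count in counts.items():
    tf = count / len(tokens)   # same raw tf as in the worker code
    log_tf = 1 + math.log(tf)
    total += log_tf * log_tf

doc_weight = math.sqrt(total)
print(doc_weight)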