Compare commits

...

9 Commits
main ... search

Author SHA1 Message Date
Aaron
3e047aec45
test and readme txt 2022-05-27 21:37:38 -07:00
unknown
e325b9d810 Same as previous push 2022-05-27 13:12:15 -07:00
unknown
60f6eb0df0 search functionality to obtain set of documents 2022-05-26 23:34:29 -07:00
unknown
95ba16cf2e added normalizing functionality + tfidf 2022-05-26 01:05:26 -07:00
unknown
d80a977450 Added way to save doc score 2022-05-25 19:59:31 -07:00
unknown
a567424a54 created new tf-idf and changed posting class 2022-05-25 18:41:36 -07:00
unknown
a736e05d00 changed tf-idf 2022-05-25 18:39:02 -07:00
unknown
d9fdee7b87 Added way to save ngrams to index 2022-05-13 16:42:33 -07:00
unknown
808ed56bb7 Nothing changed just added a space 2022-05-11 17:22:01 -07:00
13 changed files with 706 additions and 275 deletions

2
.gitignore vendored
View File

@ -1,3 +1,5 @@
/data/
*.shelve
/__pycache__/
/test/
merged*

8
README.txt Normal file
View File

@ -0,0 +1,8 @@
### To create index:
1. Make sure that all requirements are installed; check `requirements.txt` and install them with `pip install -r requirements.txt`.
2. Run `python indexer.py` to build the index; this may take some time.
3. The index is now created.
### Start search interface:
Run `python launcher.py` to start the search interface.
### Perform query:
To perform a search, simply enter a query in the textbox and click Search. The top results will be displayed.

52
TEST.txt Normal file
View File

@ -0,0 +1,52 @@
### Bad:
- computer science - common
- university of california irvine - common
- donald bren - common
- uci - common
- informatics - common
- The Donald Bren School of Information and Computer Sciences - long and common
- toilet - not likely to be found easily
- perfume - not likely to be found
- SPY×FAMILY - should not exist in data
- undergraduate - likely to be on tons of pages
### Good to Meh:
- liquids in labs - uncommon word with common
- Alberto Krone-Martins - should have a good amount of results but not absurd
- Advising & Planning - should be specific but not too common
- Honors Program - ^
- Papaefthymiou - similar to the martins query
- General information - there should be quite a few pages with this but not tons
- Prerequisite Clearing System - has some common and uncommon terms
- Recruiting - not stupid common
- counseling - ^ and should only be on a subset of pages
- social justice - specific terms that should appear without being costly
### Others tested:
- masters of computer science - not super common but will have a good amount of pages
- thornton ics46 notes - name + class + common
- Theory of Computation - two terms which have high count in papers
- facility distribution - two terms which don't really make sense together
- artificial intelligence history - two common terms with semi-common
- prospective alumni - should have very few instances of both terms but should be found together
- enrollment window - should be on only a couple of pages
- available capstone sponsorship - ^
- spring seminars - common with term that may be somewhat restricted
- hackuci - two terms into one that exists in dataset
- ucinetid help - specific term with common
- course restrictions - specific pages
- project management - a course name
- yelan research - term should not exist + common
- hybrid-learning - common phrase
- genshin is a computer game - contains terms that exist and others that don't
- computable AI machine learning big data - sentence of CS buzz words (really really common)
- Publications & Technical Reports - in json file
- Tutor coordinators - in many json (bold, title, and body)
- Death Image Service - in some weird areas
- send anonymous email - only in some
### Things done for improvement
1. Created an index of the index for a substantial gain in efficiency and speed (see the sketch after this list).
2. Split TF-IDF into TF and IDF so each part can be calculated separately when needed, without the whole computation. This also removes the reliance on an external library for TF-IDF.
3. Switched from using IDF & weight to TF & weight to help with the overall weight.
4. Dropped indexing and searching of unigrams, bigrams, and trigrams.
5. Added the length of each document during indexing, which speeds up the normalization calculation.
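
Below is a minimal sketch of the index-of-the-index lookup from item 1, modeled on the Indexer.get_postings routine in this change set; the file names and JSON layout are taken from that code, and error handling is omitted.

import json

def lookup_postings(term):
    # The small .index file maps each term to a byte offset into the full merged index.
    with open("merged_index.index", 'r') as index_file:
        index_of_index = dict(json.loads(index_file.readline())['index'])
    # Seek straight to the term's line instead of scanning the whole file.
    with open("merged_index.full", 'r') as full_index:
        full_index.seek(index_of_index[term], 0)
        return json.loads(full_index.readline())['postings']

# Example: lookup_postings('artifici') returns the postings list for the stemmed term.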

1
docs.weight Normal file

File diff suppressed because one or more lines are too long

View File

@ -17,6 +17,8 @@ from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
from threading import Lock
import math
#Data process
@ -32,187 +34,196 @@ import re
from posting import Posting
from worker import Worker
class Node():
index_value = ''
postings = list()
class Index():
length = 0
index = list()
class Indexer():
def __init__(self,restart,trimming): def __init__(self,list_partials,weight,data_paths,worker_factory=Worker):
#Config stuffs
self.path = "data/DEV/" self.path = "data/DEV"
self.restart = restart self.num_doc = 0
self.trimming = trimming self.list_partials = list_partials
self.weight = weight
self.data_paths = data_paths
self.stemmer = PorterStemmer()
self.data_paths_lock = Lock()
self.list_partials_lock = Lock()
#Shelves for index self.workers = list()
#https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html self.worker_factory = worker_factory
#https://www.irishtimes.com/news/science/how-many-numbers-begin-with-a-1-more-than-30-per-cent-1.4162466
#According to this will be how we split things
#Save #1 = ABCD + (1) ~ 18.3% of words
#Save #2 = EFGHIJK + (2-3)~ 27.1% of words
#Save #3 = LMNOPQ + (4-7) ~ 25.4% of words
#Save #4 = RSTUVWXYZ + (8-9)~ 29.2% of words
#Save #5 = Special characters
if os.path.exists("save_1.shelve") and restart:
os.remove("save_1.shelve")
if os.path.exists("save_2.shelve") and restart:
os.remove("save_2.shelve")
if os.path.exists("save_3.shelve") and restart:
os.remove("save_3.shelve")
if os.path.exists("save_4.shelve") and restart:
os.remove("save_4.shelve")
if os.path.exists("save_5.shelve") and restart:
os.remove("save_5.shelve")
self.save_1 = shelve.open("save_1.shelve") def start_async(self):
self.save_1_lock = threading.Lock() self.workers = [
self.save_2 = shelve.open("save_2.shelve") self.worker_factory(worker_id,self)
self.save_2_lock = threading.Lock() for worker_id in range(8)]
self.save_3 = shelve.open("save_3.shelve") for worker in self.workers:
self.save_3_lock = threading.Lock() worker.start()
self.save_4 = shelve.open("save_4.shelve")
self.save_4_lock = threading.Lock()
self.save_5 = shelve.open("save_5.shelve")
self.save_5_lock = threading.Lock()
print(len(list(self.save_1.keys()))) def start(self):
print(len(list(self.save_2.keys()))) self.start_async()
print(len(list(self.save_3.keys()))) self.join()
print(len(list(self.save_4.keys())))
print(len(list(self.save_5.keys())))
def save_index(self,word,posting): def join(self):
cur_save = self.get_save_file(word) for worker in self.workers:
lock = self.get_save_lock(word) worker.join()
lock.acquire()
shelve_list = list()
try:
shelve_list = cur_save[word]
shelve_list.append(posting)
tic = perf_counter()
shelve_list.sort(key=lambda x: x.tf_idf, reverse = True)
toc = perf_counter()
if toc - tic > 1 :
print("Took " + str(toc - tic) + "seconds to sort shelve list !")
cur_save.sync()
lock.release()
except:
shelve_list.append(posting)
cur_save[word] = shelve_list
cur_save.sync()
lock.release()
def get_save_file(self,word):
#return the correct save depending on the starting letter of word
word_lower = word.lower()
if re.match(r"^[a-d0-1].*",word_lower):
return self.save_1
elif re.match(r"^[e-k2-3].*",word_lower):
return self.save_2
elif re.match(r"^[l-q4-7].*",word_lower):
return self.save_3
elif re.match(r"^[r-z8-9].*",word_lower):
return self.save_4
else:
print(word)
print("You have somehow went beyond the magic")
return self.save_5
def get_save_lock(self,word):
word_lower = word.lower()
if re.match(r"^[a-d0-1].*",word_lower):
return self.save_1_lock
elif re.match(r"^[e-k2-3].*",word_lower):
return self.save_2_lock
elif re.match(r"^[l-q4-7].*",word_lower):
return self.save_3_lock
elif re.match(r"^[r-z8-9].*",word_lower):
return self.save_4_lock
else:
print(word)
print("You have somehow went beyond the magic")
return self.save_5_lock.acquire()
# I have a test file (mytest.py) with pandas but couldn't figure out how to grab just a single cell.
# so I came up with this, if anyone knows how to get a single cell and can explain it to
# me I would love to know, as I think that method might be quicker, maybe, idk it like
# 4am
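# (For reference: pandas can read a single cell directly with df.at[0, word] or df.loc[0, word].)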
# https://stackoverflow.com/questions/34449127/sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-documen
# Andy: added paramenter imporant_words in order to do multiplication of score
def get_tf_idf(self,words,word, important_words):
#tf_idf
#words = whole text
#word the word we finding the score for
#return the score
try:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(words)
df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out())
score = df.iloc[0][''.join(word)]
for k,v in important_words.items():
if k == 'b' and word in v:
score = score * 1.2
elif k == 'h1' and word in v:
score = score * 1.75
elif k == 'h2' and word in v:
score = score * 1.5
elif k == 'h3' and word in v:
score = score * 1.2
elif k == 'title' and word in v:
score = score * 2
return(score)
#print(df)
except KeyError:
return -1
def get_data(self): def get_postings(self,index):
merged_index_index = open("merged_index.index" ,'r')
merged_index = open("merged_index.full",'r')
merged_index_index.seek(0,0)
json_value = merged_index_index.readline()
data = json.loads(json_value)
index_index = dict(data['index'])
to_seek = index_index[index]
merged_index.seek(to_seek,0)
json_value = merged_index.readline()
data = json.loads(json_value)
return data['postings']
num_threads = 8 def set_weight(self):
threads = list() weight_file = open('docs.weight','w')
jsonStr =json.dumps(self.weight, default=lambda o: o.__dict__,sort_keys=False)
weight_file.write(jsonStr)
weight_file.close()
def get_weight(self,doc_id):
weight = open('docs.weight','r')
weight.seek(0,0)
json_value = weight.readline()
data = json.loads(json_value)
return data[doc_id]
def get_data_path(self):
for directory in os.listdir(self.path):
for file in os.listdir(self.path + "/" + directory + "/"):
#Actual files here self.data_paths.append("data/DEV/" + directory + "/"+file)
#JSON["url"] = url of crawled page, ignore fragments self.num_doc = len(self.data_paths)
#JSON["content"] = actual HTML
#JSON["encoding"] = ENCODING def get_next_file(self):
index = 0 self.data_paths_lock.acquire()
while True: try:
file_path = self.path + "" + directory + "/"+file holder = self.data_paths.pop()
if len(threads) < num_threads: self.data_paths_lock.release()
thread = Worker(self,file_path) return holder
threads.append(thread) except IndexError:
thread.start() self.data_paths_lock.release()
break return None
else:
if not threads[index].is_alive(): def add_partial_index(self,partial_index):
threads[index] = Worker(self,file_path) self.list_partials_lock.acquire()
threads[index].start() self.list_partials.append(partial_index)
break self.list_partials_lock.release()
else:
index = index + 1
if(index >= num_threads):
index = 0
time.sleep(.1)
#Found 55770 documents
#
#getting important tokens
#getting important tokens def merge(self):
partial_files = list()
partial_index_files = list()
parital_index_indices = list()
num_indices = len(self.list_partials)
#Full Index.Index and Length
full_index = Index()
full_index.index = list()
full_index.length = 0
for partial_index in self.list_partials:
file = open("temp/" + partial_index+'.partial','r')
partial_files.append(file)
index = open("temp/" + partial_index+'.index','r')
partial_index_files.append(index)
for partial_index_file in partial_index_files:
partial_index_file.seek(0,0)
parital_index_indices.append(json.loads(partial_index_file.readline()))
#Start all indexes at 0
for partial_file in partial_files:
partial_file.seek(0,0)
pointers = [0]*num_indices
merged_index = open("merged_index.full",'w')
merged_index_index = open("merged_index.index" ,'w')
while(True):
#Get all values from all indices to find min
value = None
values = list()
for i in range(num_indices):
if pointers[i] < parital_index_indices[i]['length']:
values.append(parital_index_indices[i]['index'][pointers[i]][0])
if(len(values) == 0):
break
value = min(values)
#Get data from the min value of all indices if exists then save to mergedIndex
if value is None:
print("I have somehow crashed by not getting a min value")
break
node = Node()
node.index_value = value
for i in range(num_indices):
if pointers[i] < parital_index_indices[i]['length'] and parital_index_indices[i]['index'][pointers[i]][0] == value:
to_seek = parital_index_indices[i]['index'][pointers[i]][1]
partial_files[i].seek(to_seek,0)
json_value = partial_files[i].readline()
temp_node = json.loads(json_value)
node.postings = node.postings + temp_node['postings']
pointers[i] = pointers[i] + 1
#Change postings here with tf*idf idf = log (n/df(t))
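#Note: len(node.postings) equals df(t) here, since each document contributes exactly one posting for a given term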
node.postings.sort(key=lambda y:y['doc_id'])
for posting in node.postings:
posting['tf_idf'] = posting['tf_raw']*math.log(self.num_doc/len(node.postings))
full_index.index.append((value,merged_index.tell()))
full_index.length = full_index.length + 1
jsonStr = json.dumps(node,default=lambda o: o.__dict__,sort_keys=False)
merged_index.write(jsonStr + '\n')
full_index.index.sort(key=lambda y:y[0])
jsonStr =json.dumps(full_index, default=lambda o: o.__dict__,sort_keys=False)
merged_index_index.write(jsonStr)
for partial_index in self.list_partials:
os.remove("temp/" + partial_index+'.partial')
os.remove("temp/" + partial_index+'.index')
merged_index_index.close()
merged_index.close()
def main():
indexer = Indexer(True,0) indexer = Indexer(list(),dict(),list())
indexer.get_data() indexer.get_data_path()
print("We have " + str(len(indexer.data_paths)) + " documents to go through !" )
indexer.start()
indexer.merge()
print("Finished merging into 1 big happy family")
indexer.set_weight()
tic = time.perf_counter()
indexer.get_postings('artifici')
toc = time.perf_counter()
print(f"Took {toc - tic:0.4f} seconds to get postings for artifici")
tic = time.perf_counter()
indexer.get_weight('00ba3af6a00b7cfb4928e5d234342c5dc46b4e31714d4a8f315a2dd4d8e49860')
toc = time.perf_counter()
print(f"Took {toc - tic:0.4f} seconds to get weight for some random page")
if __name__ == "__main__":
main()

View File

@ -4,6 +4,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
#tf_idf
#words = whole text
#word the word we finding the score for
@ -19,13 +20,12 @@ words = ['this is the first document '
doc1 = ["I can't fucking take it any more. Among Us has singlehandedly ruined my life. The other day my teacher was teaching us Greek Mythology and he mentioned a pegasus and I immediately thought 'Pegasus? more like Mega Sus!!!!' and I've never wanted to kms more. I can't look at a vent without breaking down and fucking crying. I can't eat pasta without thinking 'IMPASTA??? THATS PRETTY SUS!!!!' Skit 4 by Kanye West. The lyrics ruined me. A Mongoose, or the 25th island of greece. The scientific name for pig. I can't fucking take it anymore. Please fucking end my suffering."]
doc2 = ["Anyways, um... I bought a whole bunch of shungite rocks, do you know what shungite is? Anybody know what shungite is? No, not Suge Knight, I think he's locked up in prison. I'm talkin' shungite. Anyways, it's a two billion year-old like, rock stone that protects against frequencies and unwanted frequencies that may be traveling in the air. That's my story, I bought a whole bunch of stuff. Put 'em around the la casa. Little pyramids, stuff like that."]
word = 'life'
try:
tfidf = TfidfVectorizer() tfidf = TfidfVectorizer(ngram_range=(3,3)) # ngram_range is range of n-values for different n-grams to be extracted (1,3) gets unigrams, bigrams, trigrams
tfidf_matrix = tfidf.fit_transform(doc1) tfidf_matrix = tfidf.fit_transform(words)
df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out())
print(df.iloc[0][''.join(word)]) #print(df.iloc[0][''.join(word)])
#print(df) data = df.to_dict()
except KeyError: # word does not exist
print(-1)

View File

@ -1,9 +1,16 @@
#Posting class for indexer, will probably be more complex as we keep adding crap to it
class Posting():
def __init__(self,url,tf_idf): def __init__(self,doc_id,url,tf_raw,tf_idf,positionals):
self.doc_id = doc_id
self.url = url
self.tf_raw = tf_raw
self.tf_idf = tf_idf
self.positionals = positionals
def __repr__(self):
return "Doc_id:" + str(self.doc_id) + " URL:" + self.url + " tf_raw:" + str(self.tf_raw) + " tf_idf:" + str(self.tf_idf) + " positionals:" + str(self.positionals)
def __str__(self):
return "Doc_id:" + str(self.doc_id) + " URL:" + self.url + " tf_raw:" + str(self.tf_raw) + " tf_idf:" + str(self.tf_idf) + " positionals:" + str(self.positionals)
def comparator(self):
#Some custom comparator for sorting postings later

111
search.py Normal file
View File

@ -0,0 +1,111 @@
#Data input
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re
#Logging postings
from posting import Posting
from worker import Worker
class Search():
# The code was first written for testing in searchtesting.py, so many of the variable names and function calls here are still wrong.
def __init__(self):
self.stemmer = PorterStemmer()
p = os.path.dirname(os.path.abspath(__file__))
my_filename = os.path.join(p, "urlID.pkl")
self.f = open(my_filename, "rb+")
self.id = pickle.load(self.f)
# takes a list of posting lists and returns a list of indexes that correspond to the search temp list
def two_shortest(self, l_posting):
short = []
location = []
for postings in l_posting:
short.append(len(postings))
for i in range(2):
x = short.index(min(short))
location.append(x)
short[x] = float('inf')
return location
# len(list1) <= len(list2) So the code in this function works with that in mind
def merge(self, list1, list2):
merged = []
i = 0
j = 0
# TODO: optimize by having a pointer to the current index+4
while i < len(list1) or j < len(list2):
if j == len(list2):
break
if i == len(list1):
break
# Since list1 is shorter it will hit its max index sooner,
# so in the cases where it does, we still need to go through list2 to see if the last element of list1 appears anywhere in the rest of list2
if i == len(list1)-1:
if list1[i].url == list2[j].url:
merged.append(list1[i])
j += 1
i += 1
elif list1[i].url < list2[j].url:
break
else:
j += 1
else:
if list1[i].url == list2[j].url:
merged.append(list1[i])
i += 1
j += 1
elif list1[i].url < list2[j].url:
break
else:
i += 1
j += 1
return merged
# query is a list of stemmed tokens, returns a list of postings (which we'll directly ignore except for the doc id)
def search(self, query):
temp = []
for token in query:
temp.append(get_index(token))
l = two_shortest(temp)
m = merge(temp[l[0]], temp[l[1]])
while len(temp) > 1:
# delete the already merged lists from temp (higher index first so the second deletion is not shifted)
del temp[max(l)]
del temp[min(l)]
temp.append(m)
l = two_shortest(temp)
m = merge(temp[l[0]], temp[l[1]])
for p in m:
print(p.url)
# For now going to do a loop through each query's index and match it with the merged list (can be faster if i implement something during merge/search in order to keep track of the postings)
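# A rough sketch of that matching step (hypothetical helper, not wired into the class yet):
# for each doc that survived the merge, pull that document's posting from every
# query term's postings list, sum the tf-idf scores, and rank.
def rank_matches(query_postings, merged):
    merged_ids = {p.url for p in merged}
    scores = {}
    for postings in query_postings:
        for posting in postings:
            if posting.url in merged_ids:
                scores[posting.url] = scores.get(posting.url, 0) + posting.tf_idf
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)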

117
searchtesting.py Normal file
View File

@ -0,0 +1,117 @@
import math
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re
class Posting():
def __init__(self, url, rtf, position):
self.url = url
self.rtf = rtf
self.tf = 1
self.tfidf = 0
self.positions = [position]
d = {
'a' : [Posting(0, 1, 1), Posting(2, 1, 1), Posting(3, 1, 1), Posting(8, 1, 1)],
'b' :[Posting(0, 1, 1), Posting(8, 1, 1)],
'c' : [Posting(0, 1, 1), Posting(1, 1, 1), Posting(2, 1, 1), Posting(8, 1, 1)]
}
def get_index(word):
for k, v in d.items():
if k == word:
return v
# takes a list of posting lists and returns a list of indexes that correspond to the search temp list
def two_shortest(l_posting):
short = []
location = []
for postings in l_posting:
short.append(len(postings))
for i in range(2):
x = short.index(min(short))
location.append(x)
short[x] = float('inf')
return location
# len(list1) <= len(list2) So the code in this function works with that in mind
def merge(list1, list2):
merged = []
i = 0
j = 0
# TODO: optimize by having a pointer to the current index+4
while i < len(list1) or j < len(list2):
if j == len(list2):
break
if i == len(list1):
break
# Since list1 is shorter it will hit its max index sooner,
# so in the cases where it does, we still need to go through list2 to see if the last element of list1 appears anywhere in the rest of list2
if i == len(list1)-1:
if list1[i].url == list2[j].url:
merged.append(list1[i])
j += 1
i += 1
elif list1[i].url < list2[j].url:
break
else:
j += 1
else:
if list1[i].url == list2[j].url:
merged.append(list1[i])
i += 1
j += 1
elif list1[i].url < list2[j].url:
break
else:
i += 1
j += 1
return merged
# query is a list of stemmed tokens, returns a list of postings (which we'll directly ignore except for the doc id)
def search(query):
temp = []
for token in query:
temp.append(get_index(token))
l = two_shortest(temp)
m = merge(temp[l[0]], temp[l[1]])
while len(temp) > 1:
# delete the already merged lists from temp (higher index first so the second deletion is not shifted)
del temp[max(l)]
del temp[min(l)]
temp.append(m)
l = two_shortest(temp)
m = merge(temp[l[0]], temp[l[1]])
for p in m:
print(p.url)
# For now going to do a loop through each query's index and match it with the merged list (can be faster if i implement something during merge/search in order to keep track of the postings)
search(["a", "b", "c"])

View File

@ -1,18 +0,0 @@
#Multiple implementation of stemming here please
class Stemmer():
def __init__(self,mode, data):
#Different type of stemmer = different modes
self.mode = mode
self.data = data
def stem(self):
#Do stuff here
if(self.mode == 0):
#Do stemmer 1
return #stemmed data
#....
def #name of stemmer 1
def #name of stemmer 2

26
test.py
View File

@ -1,17 +1,13 @@
import re from threading import Thread
import json
import os
import shelve
import sys
from bs4 import BeautifulSoup
from time import perf_counter
from nltk.stem import PorterStemmer
import nltk
import time
from posting import Posting
for i in range(99): import re
word_lower = chr(i % 26 + 97) + chr(i % 26 + 97 + 1)
print(word_lower)
if re.match(r"^[a-d1-1].*",word_lower):
print("SAVE 1")
elif re.match(r"^[e-k2-3].*",word_lower):
print("SAVE 2")
elif re.match(r"^[l-q4-7].*",word_lower):
print("SAVE 3")
elif re.match(r"^[r-z8-9].*",word_lower):
print("SAVE 4")
path = "data/DEV/"
print(os.listdir(path))

116
test_merge.py Normal file
View File

@ -0,0 +1,116 @@
import json
from posting import Posting
import math
import sys
import random
from nltk.corpus import words
random_list = [1,2,3,4,5,6,7,8,9,10]
test_data = words.words()
random.shuffle(test_data)
def random_posting(id):
# arguments follow the Posting constructor in posting.py: doc_id, url, tf_raw, tf_idf, positionals
return Posting(id,random.choice(random_list),random.choice(random_list),0,[random.choice(random_list),random.choice(random_list),random.choice(random_list),random.choice(random_list),
random.choice(random_list),random.choice(random_list),random.choice(random_list),random.choice(random_list)])
class Node():
index_value = 'Something'
postings = list()
class Index():
length = 0
index = list()
def random_partial_index(name):
part_index = Index()
part_index.index = list()
part_index.length = 0
with open(name +'.partial', 'w') as f:
for i in range(1000):
node1 = Node()
node1.index_value = random.choice(test_data).lower()
node1.postings = list()
for i in range(10):
node1.postings.append(random_posting(i))
jsonStr = json.dumps(node1, default=lambda o: o.__dict__,sort_keys=False)
part_index.index.append((node1.index_value,f.tell()))
f.write(jsonStr + '\n')
part_index.length = part_index.length + 1
part_index.index.sort(key=lambda y:y[0])
jsonStr =json.dumps(part_index, default=lambda o: o.__dict__,sort_keys=False)
with open(name + '.index','w') as f:
f.write(jsonStr)
def merge(partial_indices):
partial_files = list()
partial_index_files = list()
parital_index_indices = list()
merged_index = open("merged_index.full",'w')
num_indices = len(partial_indices)
#Full Index.Index and Length
full_index = Index()
full_index.index = list()
full_index.length = 0
for partial_index in partial_indices:
file = open(partial_index+'.partial','r')
partial_files.append(file)
index = open(partial_index+'.index','r')
partial_index_files.append(index)
for partial_index_file in partial_index_files:
partial_index_file.seek(0,0)
parital_index_indices.append(json.loads(partial_index_file.readline()))
#Start all indexes at 0
for partial_file in partial_files:
partial_file.seek(0,0)
pointers = [0]*num_indices
while(True):
#Get all values from all indices to find min
value = None
values = list()
for i in range(num_indices):
if pointers[i] < parital_index_indices[i]['length']:
values.append(parital_index_indices[i]['index'][pointers[i]][0])
if(len(values) == 0):
break
value = min(values)
#Get data from the min value of all indices if exists then save to mergedIndex
if value is None:
print("I have somehow crashed by not getting a min value")
break
node = Node()
node.index_value = value
for i in range(num_indices):
if pointers[i] < parital_index_indices[i]['length'] and parital_index_indices[i]['index'][pointers[i]][0] == value:
to_seek = parital_index_indices[i]['index'][pointers[i]][1]
partial_files[i].seek(to_seek,0)
json_value = partial_files[i].readline()
temp_node = json.loads(json_value)
node.postings = node.postings + temp_node['postings']
pointers[i] = pointers[i] + 1
node.postings.sort(key=lambda y:y['doc_id'])
full_index.index.append((value,merged_index.tell()))
full_index.length = full_index.length + 1
jsonStr = json.dumps(node,default=lambda o: o.__dict__,sort_keys=False)
merged_index.write(jsonStr + '\n')
full_index.index.sort(key=lambda y:y[0])
jsonStr =json.dumps(full_index, default=lambda o: o.__dict__,sort_keys=False)
with open("merged_index.index" ,'w') as f:
f.write(jsonStr)
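# Illustrative driver (added for clarity; writes the .partial/.index files in the current directory):
# build a few random partial indices, then merge them into merged_index.full / merged_index.index.
if __name__ == '__main__':
    names = ['test_0', 'test_1', 'test_2']
    for name in names:
        random_partial_index(name)
    merge(names)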

188
worker.py
View File

@ -1,109 +1,137 @@
from threading import Thread
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
from bs4 import BeautifulSoup
import re
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from collections import Counter
from posting import Posting
import math
import sys
class Node():
index_value = ''
postings = list()
class Index():
length = 0
index = list()
class Worker(Thread):
def __init__(self,indexer,target): def __init__(self,worker_id,indexer):
self.file = target
self.indexer = indexer
self.stemmer = PorterStemmer()
self.worker_id = worker_id
self.num_partial = 0
self.index = dict()
super().__init__(daemon=True)
def dump(self):
part_index = Index()
part_index.length = 0
part_index.index = list()
cur_partial_index_str = "temp/" + str(self.worker_id) + "_" + str(self.num_partial) + '.partial'
cur_partial_index_index_str = "temp/" + str(self.worker_id) + "_" + str(self.num_partial) + '.index'
cur_partial_index = open(cur_partial_index_str,'w')
cur_partial_index_index = open(cur_partial_index_index_str,'w')
for key in self.index:
node = Node()
node.index_value = key
node.postings = self.index[key]
jsonStr = json.dumps(node, default=lambda o: o.__dict__,sort_keys=False)
part_index.index.append((node.index_value,cur_partial_index.tell()))
cur_partial_index.write(jsonStr + '\n')
part_index.length = part_index.length + 1
part_index.index.sort(key=lambda y:y[0])
jsonStr =json.dumps(part_index, default=lambda o: o.__dict__,sort_keys=False)
cur_partial_index_index.write(jsonStr)
self.indexer.add_partial_index(str(self.worker_id) + "_" + str(self.num_partial))
self.num_partial = self.num_partial + 1
self.index.clear()
def run(self):
print("Target: " + str(self.file)) while True:
ticker = perf_counter() target = self.indexer.get_next_file()
tic = perf_counter() if not target:
file_load = open(self.file) self.dump()
data = json.load(file_load) print("Worker " + str(self.worker_id) + " died")
soup = BeautifulSoup(data["content"],features="lxml") break
words = word_tokenize(soup.get_text()) file_load = open(target)
toc = perf_counter() data = json.load(file_load)
if toc - tic > 1 : soup = BeautifulSoup(data["content"],features="lxml")
print("Took " + str(toc - tic) + "seconds to tokenize text !") doc_id = target[target.rfind('/')+1:-5]
url = data['url']
print("Worker " + str(self.worker_id) + " working on " + url)
important = {'b' : [], 'h1' : [], 'h2' : [], 'h3' : [], 'title' : []}
for key_words in important.keys():
for i in soup.findAll(key_words):
for word in word_tokenize(i.text):
important[key_words].append(self.stemmer.stem(word))
tokenized_words = list() # Gets a cleaner version text comparative to soup.get_text()
stemmed_words = list() clean_text = ' '.join(soup.stripped_strings)
# Looks for large white space, tabbed space, and other forms of spacing and removes it
# Regex expression matches for space characters excluding a single space or words
clean_text = re.sub(r'\s[^ \w]', '', clean_text)
# Tokenizes the text and joins it back into a single string. Keeping it as one string is essential for get_tf_idf to work as intended
clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
# Stems tokenized text
clean_text = " ".join([self.stemmer.stem(i) for i in clean_text.split()])
# Put clean_text as an element in a list because get_tf_idf works properly with single-element lists
important = {'b' : [], 'h1' : [], 'h2' : [], 'h3' : [], 'title' : []} tokens = word_tokenize(clean_text)
for key_words in important.keys():
for i in soup.findAll(key_words): #counter(count,positionals)
for word in word_tokenize(i.text):
important[key_words].append(self.indexer.stemmer.stem(word)) counter = dict()
#We calculate tf_raw and positionals here
for i in range(len(tokens)):
word = tokens[i]
if word in counter:
counter[word][0] = counter[word][0] + 1
counter[word][1].append(i)
else:
counter[word] = [1,list()]
counter[word][1].append(i)
doc_length = len(tokens)
total = 0
for index in counter:
tf = counter[index][0]/doc_length
log_tf = 1 + math.log(tf)
total = total + log_tf * log_tf
if index in self.index:
postings = self.index[index]
postings.append(Posting(doc_id,url,tf,0,counter[index][1]))
else:
self.index[index] = list()
self.index[index].append(Posting(doc_id,url,tf,0,counter[index][1]))
self.index[index].sort(key=lambda y:y.doc_id)
self.indexer.weight[doc_id] = math.sqrt(total)
#10 Megabytes index (in Ram approx)
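#Note: sys.getsizeof counts only the dict structure itself, not the Posting lists it holds, so real memory use is higher than this 1000000-byte threshold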
if sys.getsizeof(self.index) > 1000000:
self.dump()
tic = perf_counter()
for word in words:
if word != "" and re.fullmatch('[A-Za-z0-9]+',word):
#So all the tokenized words are here,
tokenized_words.append(word)
toc = perf_counter()
if toc - tic > 1 :
print("Took " + str(toc - tic) + "seconds to isalnum text !")
#YOUR CODE HERE
tic = perf_counter()
for word in tokenized_words:
stemmed_words.append(self.indexer.stemmer.stem(word))
#stemming,
#tf_idf
#get_tf_idf(stemmed_words,word)
#post = Posting()
toc = perf_counter()
if toc - tic > 1 :
print("Took " + str(toc - tic) + "seconds to stemmed text !")
counts = Counter(stemmed_words)
size = len(stemmed_words)
for word in counts:
#posting = Posting(data["url"],self.get_tf_idf(list(' '.join(stemmed_words)),word))
tic = perf_counter()
weight = 1.0
index = 0
"""
for group in important:
for word_important in group:
if word_important.lower() == word.lower():
if index == 0:
weight = 1.2
elif index == 1:
weight = 1.8
elif index == 2:
weight = 1.5
elif index == 3:
weight = 1.3
elif index == 4:
weight = 2.0
index = index + 1
"""
posting = Posting(data["url"],counts[word]/size*weight)
toc = perf_counter()
if toc - tic > 1 :
print("Took " + str(toc - tic) + "seconds to tf_idf text !")
tic = perf_counter()
self.indexer.save_index(word,posting)
toc = perf_counter()
if toc - tic > 1 :
print("Took " + str(toc - tic) + "seconds to save text !")
tocker = perf_counter()
print("Finished " + data['url'] + "\n" + str(tocker-ticker))