From 53c7b49806a2a68ed35f0392a53f8a8a381d01ae Mon Sep 17 00:00:00 2001
From: inocturnis
Date: Fri, 27 May 2022 03:08:56 -0700
Subject: [PATCH] Massive changes to indexer and created merge

---
 indexer.py | 160 +----------------------------------------------------
 posting.py |   8 ++-
 test.py    | 128 +++++++++++++++++++++++++++++++++++++-----
 3 files changed, 120 insertions(+), 176 deletions(-)

diff --git a/indexer.py b/indexer.py
index ef711ad..c3cb9ce 100644
--- a/indexer.py
+++ b/indexer.py
@@ -34,162 +34,17 @@ from worker import Worker
 
 class Indexer():
-	def __init__(self,restart,trimming):
+	def __init__(self,restart):
 		#Config stuffs
 		self.path = "data/DEV/"
 		self.restart = restart
-		self.trimming = trimming
-		self.stemmer = PorterStemmer()
-
-		#Shelves for index
-		#https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html
-		#https://www.irishtimes.com/news/science/how-many-numbers-begin-with-a-1-more-than-30-per-cent-1.4162466
-		#According to this will be how we split things
-		#Save #1 = ABCD + (1) ~ 18.3% of words
-		#Save #2 = EFGHIJK + (2-3)~ 27.1% of words
-		#Save #3 = LMNOPQ + (4-7) ~ 25.4% of words
-		#Save #4 = RSTUVWXYZ + (8-9)~ 29.2% of words
-		#Save #5 = Special characters
-		if os.path.exists("save_1.shelve") and restart:
-			os.remove("save_1.shelve")
-		if os.path.exists("save_2.shelve") and restart:
-			os.remove("save_2.shelve")
-		if os.path.exists("save_3.shelve") and restart:
-			os.remove("save_3.shelve")
-		if os.path.exists("save_4.shelve") and restart:
-			os.remove("save_4.shelve")
-		if os.path.exists("save_5.shelve") and restart:
-			os.remove("save_5.shelve")
-
-
-		self.save_1 = shelve.open("save_1.shelve")
-		self.save_1_lock = threading.Lock()
-		self.save_2 = shelve.open("save_2.shelve")
-		self.save_2_lock = threading.Lock()
-		self.save_3 = shelve.open("save_3.shelve")
-		self.save_3_lock = threading.Lock()
-		self.save_4 = shelve.open("save_4.shelve")
-		self.save_4_lock = threading.Lock()
-		self.save_5 = shelve.open("save_5.shelve")
-		self.save_5_lock = threading.Lock()
-
-		print(len(list(self.save_1.keys())))
-		print(len(list(self.save_2.keys())))
-		print(len(list(self.save_3.keys())))
-		print(len(list(self.save_4.keys())))
-		print(len(list(self.save_5.keys())))
-
-	def save_index(self,word,posting):
-		cur_save = self.get_save_file(word)
-		lock = self.get_save_lock(word)
-		lock.acquire()
-		shelve_list = list()
-		try:
-			shelve_list = cur_save[word]
-			shelve_list.append(posting)
-			tic = perf_counter()
-			shelve_list.sort(key=lambda x: x.tf_idf, reverse = True)
-			toc = perf_counter()
-			if toc - tic > 1 :
-				print("Took " + str(toc - tic) + "seconds to sort shelve list !")
-			cur_save.sync()
-			lock.release()
-		except:
-			shelve_list.append(posting)
-			cur_save[word] = shelve_list
-			cur_save.sync()
-			lock.release()
-
-	def get_save_file(self,word):
-		#return the correct save depending on the starting letter of word
-		word_lower = word.lower()
-
-		if re.match(r"^[a-d0-1].*",word_lower):
-			return self.save_1
-		elif re.match(r"^[e-k2-3].*",word_lower):
-			return self.save_2
-		elif re.match(r"^[l-q4-7].*",word_lower):
-			return self.save_3
-		elif re.match(r"^[r-z8-9].*",word_lower):
-			return self.save_4
-		else:
-			print(word)
-			print("You have somehow went beyond the magic")
-			return self.save_5
-
-	def get_save_lock(self,word):
-		word_lower = word.lower()
-		if re.match(r"^[a-d0-1].*",word_lower):
-			return self.save_1_lock
-		elif re.match(r"^[e-k2-3].*",word_lower):
-			return self.save_2_lock
-		elif re.match(r"^[l-q4-7].*",word_lower):
-			return self.save_3_lock
-		elif re.match(r"^[r-z8-9].*",word_lower):
re.match(r"^[r-z8-9].*",word_lower): - return self.save_4_lock - else: - print(word) - print("You have somehow went beyond the magic") - return self.save_5_lock.acquire() - - # I have a test file (mytest.py) with pandas but couldn't figure out how to grab just a single cell. - # so I came up with this, if anyone knows how to get a single cell and can explain it to - # me I would love to know, as I think that method might be quicker, maybe, idk it like - # 4am - - # retuns a dict of words/n-grams with their assosiated tf-idf score *can also return just a single score or a pandas dataframe - # https://stackoverflow.com/questions/34449127/sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-documen - - # Andy: added paramenter imporant_words in order to do multiplication of score - def get_tf_idf(self,words,word, important_words): - #tf_idf - #words = whole text - #word the word we finding the score for - #return the score - try: - tfidf = TfidfVectorizer() - tfidf_matrix = tfidf.fit_transform(words) - df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out()) - score = df.iloc[0][''.join(word)] - for k,v in important_words.items(): - if k == 'b' and word in v: - score = score * 1.2 - elif k == 'h1' and word in v: - score = score * 1.75 - elif k == 'h2' and word in v: - score = score * 1.5 - elif k == 'h3' and word in v: - score = score * 1.2 - elif k == 'title' and word in v: - score = score * 2 - return(score) - #print(df) - except KeyError: - return -1 - tfidf = TfidfVectorizer(ngram_range=(1,3)) # ngram_range is range of n-values for different n-grams to be extracted (1,3) gets unigrams, bigrams, trigrams - tfidf_matrix = tfidf.fit_transform(words) # fit trains the model, transform creates matrix - df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out()) # store value of matrix to associated word/n-gram - #return(df.iloc[0][''.join(word)]) #used for finding single word in dataset - data = df.to_dict() # transform dataframe to dict *could be expensive the larger the data gets, tested on ~1000 word doc and took 0.002 secs to run - return data # returns the dict of words/n-grams with tf-idf - #print(df) # debugging - except: - print("Error in tf_idf!") - return - - + def get_data(self): - num_threads = 1 threads = list() for directory in os.listdir(self.path): for file in os.listdir(self.path + "/" + directory + "/"): - #Actual files here - #JSON["url"] = url of crawled page, ignore fragments - #JSON["content"] = actual HTML - #JSON["encoding"] = ENCODING - index = 0 while True: file_path = self.path + "" + directory + "/"+file if len(threads) < num_threads: @@ -212,17 +67,6 @@ class Indexer(): # #getting important tokens - - - - - - - - - - - def main(): indexer = Indexer(True,0) diff --git a/posting.py b/posting.py index e6d1141..4edf6c6 100644 --- a/posting.py +++ b/posting.py @@ -1,10 +1,12 @@ #Posting class for indexer, will probably be more complex as we keep adding crap to it class Posting(): - def __init__(self,url,tf_idf): - self.url = url + def __init__(self,doc_id,tf_raw,tf_idf,positionals): + self.doc_id = doc_id + self.tf_raw = tf_raw self.tf_idf = tf_idf - + self.positionals = positionals + def comparator(self): #Some custom comparator for sorting postings later pass \ No newline at end of file diff --git a/test.py b/test.py index 754903b..f2d8011 100644 --- a/test.py +++ b/test.py @@ -1,17 +1,115 @@ -import re -import os +import json +from posting import Posting +import math +import sys +import random +from 
+random_list = [1,2,3,4,5,6,7,8,9,10]
-for i in range(99):
-	word_lower = chr(i % 26 + 97) + chr(i % 26 + 97 + 1)
-	print(word_lower)
-	if re.match(r"^[a-d1-1].*",word_lower):
-		print("SAVE 1")
-	elif re.match(r"^[e-k2-3].*",word_lower):
-		print("SAVE 2")
-	elif re.match(r"^[l-q4-7].*",word_lower):
-		print("SAVE 3")
-	elif re.match(r"^[r-z8-9].*",word_lower):
-		print("SAVE 4")
-path = "data/DEV/"
-print(os.listdir(path))
\ No newline at end of file
+test_data = words.words()
+random.shuffle(test_data)
+
+class Node():
+	index_value = ''
+	postings = list()
+
+class Index():
+	length = 0
+	index = list()
+
+def random_posting(id):
+	return Posting(id,random.choice(random_list),random.choice(random_list),[random.choice(random_list),random.choice(random_list),random.choice(random_list),random.choice(random_list),
+	random.choice(random_list),random.choice(random_list),random.choice(random_list),random.choice(random_list)])
+
+def random_partial_index(name):
+	part_index = Index()
+	part_index.index = list()
+	part_index.length = 0
+	with open(name +'.partial', 'w') as f:
+		for i in range(1000):
+
+			node1 = Node()
+			node1.index_value = random.choice(test_data).lower()
+			node1.postings = list()
+			for i in range(10):
+				node1.postings.append(random_posting(i))
+
+			jsonStr = json.dumps(node1, default=lambda o: o.__dict__,sort_keys=False)
+
+			part_index.index.append((node1.index_value,f.tell()))
+			f.write(jsonStr + '\n')
+			part_index.length = part_index.length + 1
+
+	part_index.index.sort(key=lambda y:y[0])
+	jsonStr =json.dumps(part_index, default=lambda o: o.__dict__,sort_keys=False)
+	with open(name + '.index','w') as f:
+		f.write(jsonStr)
+
+def merge(partial_indices):
+	partial_files = list()
+	partial_index_files = list()
+	parital_index_indices = list()
+	merged_index = open("merged_index.full",'w')
+	num_indices = len(partial_indices)
+
+	#Full Index.Index and Length
+	full_index = Index()
+	full_index.index = list()
+	full_index.length = 0
+
+	for partial_index in partial_indices:
+		file = open(partial_index+'.partial','r')
+		partial_files.append(file)
+		index = open(partial_index+'.index','r')
+		partial_index_files.append(index)
+
+	for partial_index_file in partial_index_files:
+		partial_index_file.seek(0,0)
+		parital_index_indices.append(json.loads(partial_index_file.readline()))
+
+	#Start all indexes at 0
+	for partial_file in partial_files:
+		partial_file.seek(0,0)
+
+	pointers = [0]*num_indices
+
+	while(True):
+
+		#Get all values from all indices to find min
+		value = None
+		values = list()
+		for i in range(num_indices):
+			if pointers[i] < parital_index_indices[i]['length']:
+				values.append(parital_index_indices[i]['index'][pointers[i]][0])
+
+		if(len(values) == 0):
+			break
+		value = min(values)
+
+		#Get data from the min value of all indices if exists then save to mergedIndex
+		if value == None:
+			print("I have crashed some how by not getting min value")
+			break
+
+		node = Node()
+		node.index_value = value
+		for i in range(num_indices):
+			if pointers[i] < parital_index_indices[i]['length'] and parital_index_indices[i]['index'][pointers[i]][0] == value:
+				to_seek = parital_index_indices[i]['index'][pointers[i]][1]
+				partial_files[i].seek(to_seek,0)
+				json_value = partial_files[i].readline()
+				temp_node = json.loads(json_value)
+				node.postings = node.postings + temp_node['postings']
+				pointers[i] = pointers[i] + 1
+
+		node.postings.sort(key=lambda y:y['doc_id'])
+		full_index.index.append((value,merged_index.tell()))
+		full_index.length = full_index.length + 1
+		jsonStr = json.dumps(node,default=lambda o: o.__dict__,sort_keys=False)
+		merged_index.write(jsonStr + '\n')
+
+	full_index.index.sort(key=lambda y:y[0])
+	jsonStr =json.dumps(full_index, default=lambda o: o.__dict__,sort_keys=False)
+	with open("merged_index.index" ,'w') as f:
+		f.write(jsonStr)
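
Usage note (not part of the patch): a minimal driver sketch for the partial-index/merge helpers that this commit adds to test.py. The partial-index names and the __main__ wrapper below are hypothetical examples, and the import assumes test.py is on the path with NLTK's "words" corpus already downloaded, since test.py loads it at import time.

	# Hypothetical driver, not included in the commit above.
	# Builds three randomly generated partial indices with random_partial_index(),
	# then merges them with merge() into merged_index.full / merged_index.index.
	from test import random_partial_index, merge

	if __name__ == "__main__":
		names = ["part_0", "part_1", "part_2"]  # arbitrary example names
		for name in names:
			random_partial_index(name)          # writes <name>.partial and <name>.index
		merge(names)                            # writes merged_index.full and merged_index.index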