From d80a977450103cb0e96d73e9fde8e27b2c931eec Mon Sep 17 00:00:00 2001
From: unknown
Date: Wed, 25 May 2022 19:59:31 -0700
Subject: [PATCH] Add a way to save per-document scores for tf-idf
 normalization

---
 indexer.py | 53 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 28 insertions(+), 25 deletions(-)

diff --git a/indexer.py b/indexer.py
index 5bc734c..e970c04 100644
--- a/indexer.py
+++ b/indexer.py
@@ -43,21 +43,8 @@ class Indexer():
         self.trimming = trimming
         self.stemmer = PorterStemmer()
         self.id = list()
-
-
-        # Creates a pickle file that is a list of urls where the index of the url is the id that the posting refers to.
-        p = os.path.dirname(os.path.abspath(__file__))
-        my_filename = os.path.join(p, "urlID.pkl")
-        if os.path.exists(my_filename):
-            os.remove(my_filename)
-
-        # Creates file and closes it
-        self.f = open(my_filename, "wb")
-        pickle.dump(id, self.f)
-        self.f.close()
-
-        # Opens for reading for the entire duration of indexer for worker to use
-        self.f = open(my_filename, "rb+")
+        # Per-document normalization denominators: running sums of squared tf-idf weights. The square root is taken at query time.
+        self.normalize = list()
 
         #Shelves for index
         #https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html
@@ -192,10 +179,13 @@ class Indexer():
                     tokens[split[i]].positions.append(i)
         return tokens
 
+    # Applies the idf factor to each posting's tf, finishing the tf-idf weight.
     def tfidf(self, current_save):
         for token, postings in current_save.items():
             for p in postings:
                 p.tfidf = p.tf * math.log(len(self.id)/len(postings))
+                self.normalize[p.url] += p.tfidf ** 2
+
 
     def get_data(self):
 
@@ -236,14 +226,35 @@ class Indexer():
                 if(index >= num_threads):
                     index = 0
                 time.sleep(.1)
+        # Make a list the size of the corpus to keep track of document scores.
+        self.normalize = [0] * len(self.id)
+        # These calls apply idf and finalize the tf-idf weighting for each partial index.
         self.tfidf(self.save_1)
         self.tfidf(self.save_2)
         self.tfidf(self.save_3)
         self.tfidf(self.save_4)
         self.tfidf(self.save_5)
-        pickle.dump(self.id, self.f)
-        # should I self.f.close() here?
+
+        # Creates a pickle file that is a list of urls, where the index of a url is the id that postings refer to.
+        p = os.path.dirname(os.path.abspath(__file__))
+        my_filename = os.path.join(p, "urlID.pkl")
+        if os.path.exists(my_filename):
+            os.remove(my_filename)
+        # Creates the file, writes the list, and closes it.
+        f = open(my_filename, "wb")
+        pickle.dump(self.id, f)
+        f.close()
+
+        # Creates a pickle file holding each document's normalization denominator (square root deferred to query time).
+        p = os.path.dirname(os.path.abspath(__file__))
+        my_filename = os.path.join(p, "normalize.pkl")
+        if os.path.exists(my_filename):
+            os.remove(my_filename)
+        # Creates the file, writes the list, and closes it.
+        f = open(my_filename, "wb")
+        pickle.dump(self.normalize, f)
+        f.close()
 
         #Found 55770 documents
         #
@@ -251,14 +262,6 @@ class Indexer():
 
 
 
-
-
-
-
-
-
-
-
 def main():
     indexer = Indexer(True,0)
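
Note (reviewer sketch, not part of the patch): with the corpus size mentioned in the code (55770 documents), a term that appears in 10 documents gets idf = ln(55770/10) ≈ 8.63, and each posting's squared tf-idf weight feeds its document's normalization denominator. Because the patch defers the square root to query time, the search side is expected to load urlID.pkl and normalize.pkl and divide each document's accumulated score by sqrt(denominator). The sketch below is a hypothetical illustration of that query-time step: the file names match the patch, but the rank helper and the posting attributes (url, tfidf) are assumptions inferred from the indexer code above.

    import math
    import os
    import pickle

    # Load the two artifacts written by indexer.py (paths match the patch).
    base = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(base, "urlID.pkl"), "rb") as f:
        url_ids = pickle.load(f)    # doc id -> url
    with open(os.path.join(base, "normalize.pkl"), "rb") as f:
        normalize = pickle.load(f)  # doc id -> sum of squared tf-idf weights

    def rank(query_postings):
        # query_postings: {token: [posting, ...]} for the query terms, where
        # each posting carries .url (a doc id) and .tfidf, as in the indexer.
        scores = [0.0] * len(url_ids)
        for token, postings in query_postings.items():
            for p in postings:
                scores[p.url] += p.tfidf
        # Take the square root the indexer deferred and normalize each score.
        for doc, denom in enumerate(normalize):
            if denom > 0:
                scores[doc] /= math.sqrt(denom)
        # Return urls ordered by normalized score, best first.
        order = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
        return [url_ids[doc] for doc in order]

Normalizing by document length this way keeps long documents from outranking short ones merely because they accumulate more raw tf-idf weight.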