search functionality to obtain set of documents

unknown 2022-05-26 23:34:29 -07:00
parent 95ba16cf2e
commit 60f6eb0df0
5 changed files with 176 additions and 84 deletions

@@ -172,7 +172,6 @@ class Indexer():
         for i in range(len(split)):
             if split[i] not in tokens:
                 tokens[split[i]] = Posting(self.get_url_id(url), 1, i)
             else:
                 tokens[split[i]].rtf += 1
                 tokens[split[i]].tf = (1 + math.log(tokens[split[i]].rtf))
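For context, the tf stored on each Posting here is the log-scaled term frequency, tf = 1 + ln(rtf). A minimal runnable sketch of that computation, using a plain whitespace split in place of the indexer's real tokenizer (illustration only, not the project's code):

import math

# Count raw term frequency (rtf) per token, then log-scale it the
# same way the hunk above does: tf = 1 + ln(rtf).
def log_scaled_tf(tokens):
    counts = {}
    for t in tokens:
        counts[t] = counts.get(t, 0) + 1
    return {t: 1 + math.log(rtf) for t, rtf in counts.items()}

print(log_scaled_tf("the cat sat on the mat the end".split()))
# 'the' occurs 3 times -> tf = 1 + ln(3) ~= 2.10; one-off tokens get 1.0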

search.py (135 lines changed)

@@ -23,109 +23,84 @@ from posting import Posting
 from worker import Worker

 class Search():
+    # wrote the code for testing in the file searchtesting.py, so many of the variables and function calls are wrong.
     def __init__(self):
         self.save_1 = shelve.open("save_1.shelve")
         self.save_2 = shelve.open("save_2.shelve")
         self.save_3 = shelve.open("save_3.shelve")
         self.save_4 = shelve.open("save_4.shelve")
         self.save_5 = shelve.open("save_5.shelve")
         self.stemmer = PorterStemmer()
         p = os.path.dirname(os.path.abspath(__file__))
         my_filename = os.path.join(p, "urlID.pkl")
         self.f = open(my_filename, "rb+")
         self.id = pickle.load(self.f)

+    # takes a list of posting lists and returns a list of indexes that correspond to the search temp list
+    def two_shortest(self, l_posting):
+        short = []
+        location = []
+        for postings in l_posting:
+            short.append(len(postings))
+        for i in range(2):
+            x = short.index(min(short))
+            location.append(x)
+            short[x] = float('inf')
+        return location
+
     def get_save_file(self, word):
         word_lower = word.lower()
         if re.match(r"^[a-d0-1].*", word_lower):
             return self.save_1
         elif re.match(r"^[e-k2-3].*", word_lower):
             return self.save_2
         elif re.match(r"^[l-q4-7].*", word_lower):
             return self.save_3
         elif re.match(r"^[r-z8-9].*", word_lower):
             return self.save_4
         else:
             return self.save_5

-    # looks for the smallest list and largest list
-    def find_extremes(self, q):
-        longest = float('-inf')
-        shortest = float('inf')
-        remaining = []
-        # Careful if there is a word that the indexer doesn't have
-        for word in q:
-            d = self.get_save_file(word)
-            if len(d[word]) > longest:
-                longest = len(d[word])
-                l = word
-            elif len(d[word]) < shortest:
-                shortest = len(d[word])
-                s = word
-        for word in q:
-            if word != l or word != s:
-                remaining.append(word)
-        return s, l, remaining
-
-    def merge(self, short, long, r):
-        m = []
+    # len(list1) <= len(list2), so the code in this function works with that in mind
+    def merge(self, list1, list2):
+        merged = []
         i = 0
         j = 0
-        s = self.get_save_file(short)
-        l = self.get_save_file(long)
-        while i < len(s[short]) or j < len(l[long]):
-            if i == len(d[short])-1:
-                if s[short][i].url == l[long][j].url:
-                    m.append(s[short][i].url)
-                    j += 1
-                elif s[short][i].url < l[long][j].url:
-                    break
-                else:
-                    j += 1
-            else:
-                if s[short][i].url == l[long][j].url:
-                    m.append(d[short][i].url)
-                    i += 1
-                    j += 1
-                elif s[short][i].url < l[long][j].url:
-                    break
-                else:
-                    i += 1
-                    j += 1
-        final = []
-        if len(m) > 0:
-            while len(r) > 0:
-                d = self.get_save_file(r[0])
-                for i in d[r[0]]:
-                    if i.url > m[len(m) - 1]:
-                        break
-                    elif i.url in m:
-                        final.append(i.url)
-                if len(final) != len(m):
-                    m = final
-                    final = []
-                    r.pop(0)
-                else:
-                    final = []
-                    r.pop(0)
-            return m
-        else:
-            return -1
-
-    def search(self):
-        query = input("Enter query: ")
-        query = [self.stemmer.stem(i) for i in query.split()]
-        x = self.find_extremes(query)
-        match = self.merge(x[0], x[1], x[2])
-        if match == -1:
-            print("No valid matches")
-        else:
-            for i in match:
-                print(self.id[i])
+        # TODO: optimize by having a pointer to the current index+4
+        while i < len(list1) or j < len(list2):
+            if j == len(list2):
+                break
+            if i == len(list1):
+                break
+            # Since list1 is shorter it will hit its max index sooner,
+            # so in the cases where it does we still need to go through list2 to see if the last element of list1 appears anywhere in the rest of list2
+            if i == len(list1)-1:
+                if list1[i].url == list2[j].url:
+                    merged.append(list1[i])
+                    j += 1
+                    i += 1
+                elif list1[i].url < list2[j].url:
+                    break
+                else:
+                    j += 1
+            else:
+                if list1[i].url == list2[j].url:
+                    merged.append(list1[i])
+                    i += 1
+                    j += 1
+                elif list1[i].url < list2[j].url:
+                    break
+                else:
+                    i += 1
+                    j += 1
+        return merged
+
+    # query is a list of stemmed tokens, returns a list of postings (which we'll directly ignore except for the doc id)
+    def search(self, query):
+        temp = []
+        for token in query:
+            temp.append(get_index(token))
+        l = two_shortest(temp)
+        m = merge(temp[l[0]], temp[l[1]])
+        while len(temp) > 1:
+            # delete from temp the already merged lists
+            del temp[l[0]]
+            del temp[l[1]]
+            temp.append(m)
+            l = two_shortest(temp)
+            m = merge(temp[l[0]], temp[l[1]])
+        for p in m:
+            print(p.url)
+    # For now going to do a loop through each query's index and match it with the merged list (can be faster if i implement something during merge/search in order to keep track of the postings)
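The new query flow intersects posting lists pairwise, always merging the two shortest lists first so every later merge runs over the smallest possible intermediate result. A compact standalone sketch of that idea over plain sorted doc-id lists (a simplified illustration, not the committed merge, which walks Posting objects and carries the end-of-list handling shown above):

# Intersect two sorted doc-id lists with two pointers.
def intersect(a, b):
    out, i, j = [], 0, 0
    while i < len(a) and j < len(b):
        if a[i] == b[j]:
            out.append(a[i])
            i += 1
            j += 1
        elif a[i] < b[j]:
            i += 1
        else:
            j += 1
    return out

# Fold cheapest-first: sort the lists by length, intersect left to right.
def query_and(posting_lists):
    lists = sorted(posting_lists, key=len)
    result = lists[0]
    for nxt in lists[1:]:
        result = intersect(result, nxt)
    return result

print(query_and([[0, 2, 3, 8], [0, 8], [0, 1, 2, 8]]))  # [0, 8]

Starting from the shortest pair is the same motivation behind two_shortest: an intersection can never be larger than its smallest input, so every subsequent merge gets cheaper.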

searchtesting.py (new file, 117 lines)

@@ -0,0 +1,117 @@
import math
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle

# Data processing
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re

class Posting():
    def __init__(self, url, rtf, position):
        self.url = url
        self.rtf = rtf
        self.tf = 1
        self.tfidf = 0
        self.positions = [position]

# Hand-built test index: token -> posting list, sorted by doc id.
d = {
    'a' : [Posting(0, 1, 1), Posting(2, 1, 1), Posting(3, 1, 1), Posting(8, 1, 1)],
    'b' : [Posting(0, 1, 1), Posting(8, 1, 1)],
    'c' : [Posting(0, 1, 1), Posting(1, 1, 1), Posting(2, 1, 1), Posting(8, 1, 1)]
}

def get_index(word):
    for k, v in d.items():
        if k == word:
            return v

# takes a list of posting lists and returns a list of indexes that correspond to the search temp list
def two_shortest(l_posting):
    short = []
    location = []
    for postings in l_posting:
        short.append(len(postings))
    for i in range(2):
        x = short.index(min(short))
        location.append(x)
        short[x] = float('inf')
    return location

# len(list1) <= len(list2), so the code in this function works with that in mind
def merge(list1, list2):
    merged = []
    i = 0
    j = 0
    # TODO: optimize by having a pointer to the current index+4
    while i < len(list1) or j < len(list2):
        if j == len(list2):
            break
        if i == len(list1):
            break
        # Since list1 is shorter it will hit its max index sooner,
        # so in the cases where it does we still need to go through list2 to see if the last element of list1 appears anywhere in the rest of list2
        if i == len(list1)-1:
            if list1[i].url == list2[j].url:
                merged.append(list1[i])
                j += 1
                i += 1
            elif list1[i].url < list2[j].url:
                break
            else:
                j += 1
        else:
            if list1[i].url == list2[j].url:
                merged.append(list1[i])
                i += 1
                j += 1
            elif list1[i].url < list2[j].url:
                break
            else:
                i += 1
                j += 1
    return merged

# query is a list of stemmed tokens, returns a list of postings (which we'll directly ignore except for the doc id)
def search(query):
    temp = []
    for token in query:
        temp.append(get_index(token))
    l = two_shortest(temp)
    m = merge(temp[l[0]], temp[l[1]])
    while len(temp) > 1:
        # delete from temp the already merged lists
        del temp[l[0]]
        del temp[l[1]]
        temp.append(m)
        l = two_shortest(temp)
        m = merge(temp[l[0]], temp[l[1]])
    for p in m:
        print(p.url)
# For now going to do a loop through each query's index and match it with the merged list (can be faster if i implement something during merge/search in order to keep track of the postings)

search(["a", "b", "c"])
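With the sample postings above, only documents 0 and 8 contain all of a, b, and c, so the final call should print 0 and 8. A set-based cross-check over the same sample dictionary d (illustrative; assumes it is run in this file's scope):

# Sanity check via plain set intersection over the sample postings.
expected = set.intersection(*({p.url for p in d[w]} for w in ["a", "b", "c"]))
print(sorted(expected))  # [0, 8]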

tempCodeRunnerFile.py (new file, 2 lines)

@@ -0,0 +1,2 @@
for postings in l_posting:

@@ -54,9 +54,8 @@ class Worker(Thread):
             ngrams = self.indexer.get_tf_idf(x)
             if ngrams != -1:
                 tic = perf_counter()
-                for ngram, tfidf in ngrams.items():
-                    posting = Posting(self.indexer.get_url_id(data["url"]), tfidf[0])
-                    self.indexer.save_index(ngram,posting)
+                for ngram, posting in ngrams.items():
+                    self.indexer.save_index(ngram, posting)
                 toc = perf_counter()
                 print("Took " + str(toc - tic) + " seconds to save ngram")