search functionality to obtain set of documents

unknown committed 2022-05-26 23:34:29 -07:00
parent 95ba16cf2e
commit 60f6eb0df0
5 changed files with 176 additions and 84 deletions

search.py

@@ -23,109 +23,84 @@ from posting import Posting
 from worker import Worker
 
 class Search():
+    # wrote the code for testing in the file searchtesting.py, so many of the variable and function names here are still wrong
     def __init__(self):
         self.save_1 = shelve.open("save_1.shelve")
         self.save_2 = shelve.open("save_2.shelve")
         self.save_3 = shelve.open("save_3.shelve")
         self.save_4 = shelve.open("save_4.shelve")
         self.save_5 = shelve.open("save_5.shelve")
         self.stemmer = PorterStemmer()
         p = os.path.dirname(os.path.abspath(__file__))
         my_filename = os.path.join(p, "urlID.pkl")
         self.f = open(my_filename, "rb+")
         self.id = pickle.load(self.f)
 
-    # maps a word to the shelve file that holds its posting list, bucketed by first character
-    def get_save_file(self, word):
-        word_lower = word.lower()
-        if re.match(r"^[a-d0-1].*", word_lower):
-            return self.save_1
-        elif re.match(r"^[e-k2-3].*", word_lower):
-            return self.save_2
-        elif re.match(r"^[l-q4-7].*", word_lower):
-            return self.save_3
-        elif re.match(r"^[r-z8-9].*", word_lower):
-            return self.save_4
-        else:
-            return self.save_5
-
-    # looks for the shortest list and the longest list
-    def find_extremes(self, q):
-        longest = float('-inf')
-        shortest = float('inf')
-        remaining = []
-        # Careful if there is a word that the indexer doesn't have
-        for word in q:
-            d = self.get_save_file(word)
-            if len(d[word]) > longest:
-                longest = len(d[word])
-                l = word
-            if len(d[word]) < shortest:
-                shortest = len(d[word])
-                s = word
-        for word in q:
-            if word != l and word != s:
-                remaining.append(word)
-        return s, l, remaining
-
-    def merge(self, short, long, r):
-        m = []
-        i = 0
-        j = 0
-        s = self.get_save_file(short)
-        l = self.get_save_file(long)
-        while i < len(s[short]) and j < len(l[long]):
-            if i == len(s[short]) - 1:
-                if s[short][i].url == l[long][j].url:
-                    m.append(s[short][i].url)
-                    j += 1
-                elif s[short][i].url < l[long][j].url:
-                    i += 1
-                else:
-                    j += 1
-            else:
-                if s[short][i].url == l[long][j].url:
-                    m.append(s[short][i].url)
-                    i += 1
-                    j += 1
-                elif s[short][i].url < l[long][j].url:
-                    i += 1
-                else:
-                    j += 1
-        final = []
-        if len(m) > 0:
-            while len(r) > 0:
-                d = self.get_save_file(r[0])
-                for i in d[r[0]]:
-                    if i.url > m[len(m) - 1]:
-                        break
-                    elif i.url in m:
-                        final.append(i.url)
-                if len(final) != len(m):
-                    m = final
-                    final = []
-                    r.pop(0)
-                else:
-                    final = []
-                    r.pop(0)
-            return m
-        else:
-            return -1
-
-    def search(self):
-        query = input("Enter query: ")
-        query = [self.stemmer.stem(i) for i in query.split()]
-        x = self.find_extremes(query)
-        match = self.merge(x[0], x[1], x[2])
-        if match == -1:
-            print("No valid matches")
-        else:
-            for i in match:
-                print(self.id[i])
+    # takes a list of posting lists and returns the indexes of the two shortest lists in it
+    def two_shortest(self, l_posting):
+        short = []
+        location = []
+        for postings in l_posting:
+            short.append(len(postings))
+        for i in range(2):
+            x = short.index(min(short))
+            location.append(x)
+            short[x] = float('inf')
+        return location
+
+    # len(list1) <= len(list2), so the code in this function works with that in mind
+    def merge(self, list1, list2):
+        merged = []
+        i = 0
+        j = 0
+        # TODO: optimize by having a pointer to the current index+4
+        while i < len(list1) or j < len(list2):
+            if j == len(list2):
+                break
+            if i == len(list1):
+                break
+            # Since list1 is shorter it will hit its max index sooner, so in the cases
+            # where it does we still need to go through list2 to see if the last element
+            # of list1 appears anywhere in the rest of list2
+            if i == len(list1) - 1:
+                if list1[i].url == list2[j].url:
+                    merged.append(list1[i])
+                    j += 1
+                elif list1[i].url < list2[j].url:
+                    break
+                else:
+                    j += 1
+            else:
+                if list1[i].url == list2[j].url:
+                    merged.append(list1[i])
+                    i += 1
+                    j += 1
+                elif list1[i].url < list2[j].url:
+                    i += 1
+                else:
+                    j += 1
+        return merged
+
+    # query is a list of stemmed tokens, returns a list of postings (which we'll directly ignore except for the doc id)
+    def search(self, query):
+        temp = []
+        for token in query:
+            temp.append(get_index(token))
+        l = self.two_shortest(temp)
+        m = self.merge(temp[l[0]], temp[l[1]])
+        while len(temp) > 1:
+            # delete the already-merged lists from temp, larger index first so the smaller index stays valid
+            for x in sorted(l, reverse=True):
+                del temp[x]
+            temp.append(m)
+            l = self.two_shortest(temp)
+            m = self.merge(temp[l[0]], temp[l[1]])
+        for p in m:
+            print(p.url)
+
+    # For now going to do a loop through each query's index and match it with the merged list
+    # (can be faster if i implement something during merge/search in order to keep track of the postings)
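
The merge added above is a standard two-pointer intersection of posting lists sorted by doc url. A minimal self-contained sketch of the same technique, for reference: Posting here is a stand-in with only a url field, not the project's posting.Posting, and the integer doc IDs stand in for the shelve-backed posting lists.

from dataclasses import dataclass

@dataclass
class Posting:
    # stand-in for the project's posting.Posting; only the sort key matters here
    url: int

def intersect(list1, list2):
    """Two-pointer intersection of two posting lists sorted by url."""
    if len(list1) > len(list2):
        list1, list2 = list2, list1  # keep the len(list1) <= len(list2) invariant
    merged = []
    i = j = 0
    while i < len(list1) and j < len(list2):
        if list1[i].url == list2[j].url:
            merged.append(list1[i])  # document appears in both lists: keep it
            i += 1
            j += 1
        elif list1[i].url < list2[j].url:
            i += 1  # list1 is behind: advance it
        else:
            j += 1  # list2 is behind: advance it
    return merged

a = [Posting(1), Posting(4), Posting(7)]
b = [Posting(4), Posting(5), Posting(7), Posting(9)]
print([p.url for p in intersect(a, b)])  # [4, 7]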
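The new search() drives that intersection across all query terms by always merging the two shortest remaining lists first, which keeps intermediate results small. A sketch of that reduction under the same assumptions, reusing the intersect helper above; sorting by length each round is the same greedy idea as two_shortest, and for a handful of query terms the extra sort cost is negligible.

def intersect_all(posting_lists):
    """Repeatedly intersect the two shortest lists until one remains."""
    lists = list(posting_lists)
    while len(lists) > 1:
        lists.sort(key=len)  # two shortest lists come first
        rest = lists[2:]
        rest.append(intersect(lists[0], lists[1]))  # replace them with their intersection
        lists = rest
    return lists[0] if lists else []

c = [Posting(2), Posting(4), Posting(7), Posting(8)]
print([p.url for p in intersect_all([a, b, c])])  # [4, 7]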