search functionality to obtain set of documents
parent 95ba16cf2e
commit 60f6eb0df0
@@ -172,7 +172,6 @@ class Indexer():
        for i in range(len(split)):
            if split[i] not in tokens:
                tokens[split[i]] = Posting(self.get_url_id(url), 1, i)
            else:
                tokens[split[i]].rtf += 1
                tokens[split[i]].tf = (1 + math.log(tokens[split[i]].rtf))
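For reference, the tf update above is the standard log-scaled term frequency, tf = 1 + ln(rtf), where rtf is the raw count of the token in the document and math.log is the natural logarithm. A small sketch of how the weight grows with the raw count:

import math

# Log-scaled term frequency as used above: tf = 1 + ln(rtf)
for rtf in (1, 2, 10, 100):
    print(rtf, round(1 + math.log(rtf), 3))
# 1 1.0
# 2 1.693
# 10 3.303
# 100 5.605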
129 search.py
@@ -23,109 +23,84 @@ from posting import Posting
 from worker import Worker


 class Search():

+    # wrote the code for testing in the file searchtesting.py so many of the variables and function calls are wrong.
     def __init__(self):
         self.save_1 = shelve.open("save_1.shelve")
         self.save_2 = shelve.open("save_2.shelve")
         self.save_3 = shelve.open("save_3.shelve")
         self.save_4 = shelve.open("save_4.shelve")
         self.save_5 = shelve.open("save_5.shelve")
         self.stemmer = PorterStemmer()
         p = os.path.dirname(os.path.abspath(__file__))
         my_filename = os.path.join(p, "urlID.pkl")
         self.f = open(my_filename, "rb+")
         self.id = pickle.load(self.f)

-    def get_save_file(self, word):
-        word_lower = word.lower()
-        if re.match(r"^[a-d0-1].*", word_lower):
-            return self.save_1
-        elif re.match(r"^[e-k2-3].*", word_lower):
-            return self.save_2
-        elif re.match(r"^[l-q4-7].*", word_lower):
-            return self.save_3
-        elif re.match(r"^[r-z8-9].*", word_lower):
-            return self.save_4
-        else:
-            return self.save_5
+    # takes a list of posting lists returns a list of indexes that correspond to search temp list
+    def two_shortest(self, l_posting):
+        short = []
+        location = []
+        for postings in l_posting:
+            short.append(len(postings))
+
+        for i in range(2):
+            x = short.index(min(short))
+            location.append(x)
+            short[x] = float('inf')
+
+        return location

-    # looks for the smallest list and largest list
-    def find_extremes(self, q):
-        longest = float('-inf')
-        shortest = float('inf')
-        remaining = []
-        # Careful if there is a word that the indexer doesn't have
-        for word in q:
-            d = self.get_save_file(word)
-            if len(d[word]) > longest:
-                longest = len(d[word])
-                l = word
-            elif len(d[word]) < shortest:
-                shortest = len(d[word])
-                s = word
-        for word in q:
-            if word != l or word != s:
-                remaining.append(word)
-        return s, l, remaining
-
-    def merge(self, short, long, r):
-        m = []
-        i = 0
-        j = 0
-        s = self.get_save_file(short)
-        l = self.get_save_file(long)
-        while i < len(s[short]) or j < len(l[long]):
-            if i == len(d[short])-1:
-                if s[short][i].url == l[long][j].url:
-                    m.append(s[short][i].url)
-                    j += 1
-                elif s[short][i].url < l[long][j].url:
-                    break
-                else:
-                    j += 1
-            else:
-                if s[short][i].url == l[long][j].url:
-                    m.append(d[short][i].url)
-                    i += 1
-                    j += 1
-                elif s[short][i].url < l[long][j].url:
-                    i += 1
-                else:
-                    j += 1
-
-        final = []
-        if len(m) > 0:
-            while len(r) > 0:
-                d = self.get_save_file(r[0])
-                for i in d[r[0]]:
-                    if i.url > m[len(m) - 1]:
-                        break
-                    elif i.url in m:
-                        final.append(i.url)
-                if len(final) != len(m):
-                    m = final
-                    final = []
-                    r.pop(0)
-                else:
-                    final = []
-                    r.pop(0)
-            return m
-        else:
-            return -1
-
-    def search(self):
-        query = input("Enter query: ")
-        query = [self.stemmer.stem(i) for i in query.split()]
-        x = self.find_extremes(query)
-        match = self.merge(x[0], x[1], x[2])
-        if match == -1:
-            print("No valid matches")
-        else:
-            for i in match:
-                print(self.id[i])
+    # len(list1) <= len(list2) So the code in this function works with that in mind
+    def merge(self, list1, list2):
+        merged = []
+        i = 0
+        j = 0
+        # TODO: optimize by having a pointer to the current index+4
+        while i < len(list1) or j < len(list2):
+            if j == len(list2):
+                break
+            if i == len(list1):
+                break
+            # Since list1 is shorter it will hit its max index sooner,
+            # so in the cases where it does we still need to go through list2 to see if the last element of list1 appears anywhere in the rest of list2
+            if i == len(list1)-1:
+                if list1[i].url == list2[j].url:
+                    merged.append(list1[i])
+                    j += 1
+                    i += 1
+                elif list1[i].url < list2[j].url:
+                    break
+                else:
+                    j += 1
+            else:
+                if list1[i].url == list2[j].url:
+                    merged.append(list1[i])
+                    i += 1
+                    j += 1
+                elif list1[i].url < list2[j].url:
+                    break
+                else:
+                    i += 1
+                    j += 1
+        return merged
+
+    # query is a list of stemmed tokens, returns a list of postings (which we'll directly ignore except for the doc id)
+    def search(self, query):
+        temp = []
+        for token in query:
+            temp.append(get_index(token))
+
+        l = two_shortest(temp)
+        m = merge(temp[l[0]], temp[l[1]])
+
+        while len(temp) > 1:
+            # delete from temp the already merged lists
+            del temp[l[0]]
+            del temp[l[1]]
+            temp.append(m)
+
+            l = two_shortest(temp)
+            m = merge(temp[l[0]], temp[l[1]])
+
+        for p in m:
+            print(p.url)
+
+    # For now going to do a loop through each query's index and match it with the merged list (can be faster if i implement something during merge/search in order to keep track of the postings)
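As the comment at the top of the class admits, the new search still calls the test helpers (get_index, and two_shortest/merge without self.). A rough sketch of how the method would presumably be wired up inside the class, shown only as an assumption: it reuses the shelve-backed get_save_file lookup from the previous version of this file (not part of the new code), and the two deletions are done from the highest index down so the second one does not shift.

    # Hypothetical wiring only: assumes a get_save_file(word) helper that returns the
    # shelve holding that word, and that each shelve maps a token to a sorted posting list.
    def search(self, query):
        temp = []
        for token in query:
            d = self.get_save_file(token)
            temp.append(d.get(token, []))          # empty list if the indexer never saw the token

        l = self.two_shortest(temp)
        m = self.merge(temp[l[0]], temp[l[1]])

        while len(temp) > 1:
            # drop the two lists that were just merged (highest index first so positions stay valid)
            for idx in sorted(l, reverse=True):
                del temp[idx]
            temp.append(m)
            if len(temp) > 1:
                l = self.two_shortest(temp)
                m = self.merge(temp[l[0]], temp[l[1]])

        return m                                   # postings whose url appears in every posting list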
117 searchtesting.py Normal file
@@ -0,0 +1,117 @@
import math
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle


# Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

import re


class Posting():
    def __init__(self, url, rtf, position):
        self.url = url
        self.rtf = rtf
        self.tf = 1
        self.tfidf = 0
        self.positions = [position]


d = {
    'a' : [Posting(0, 1, 1), Posting(2, 1, 1), Posting(3, 1, 1), Posting(8, 1, 1)],
    'b' : [Posting(0, 1, 1), Posting(8, 1, 1)],
    'c' : [Posting(0, 1, 1), Posting(1, 1, 1), Posting(2, 1, 1), Posting(8, 1, 1)]
}


def get_index(word):
    for k, v in d.items():
        if k == word:
            return v


# takes a list of posting lists returns a list of indexes that correspond to search temp list
def two_shortest(l_posting):
    short = []
    location = []
    for postings in l_posting:
        short.append(len(postings))

    for i in range(2):
        x = short.index(min(short))
        location.append(x)
        short[x] = float('inf')

    return location


# len(list1) <= len(list2) So the code in this function works with that in mind
def merge(list1, list2):
    merged = []
    i = 0
    j = 0
    # TODO: optimize by having a pointer to the current index+4
    while i < len(list1) or j < len(list2):
        if j == len(list2):
            break
        if i == len(list1):
            break
        # Since list1 is shorter it will hit its max index sooner,
        # so in the cases where it does we still need to go through list2 to see if the last element of list1 appears anywhere in the rest of list2
        if i == len(list1)-1:
            if list1[i].url == list2[j].url:
                merged.append(list1[i])
                j += 1
                i += 1
            elif list1[i].url < list2[j].url:
                break
            else:
                j += 1
        else:
            if list1[i].url == list2[j].url:
                merged.append(list1[i])
                i += 1
                j += 1
            elif list1[i].url < list2[j].url:
                break
            else:
                i += 1
                j += 1
    return merged


# query is a list of stemmed tokens, returns a list of postings (which we'll directly ignore except for the doc id)
def search(query):
    temp = []
    for token in query:
        temp.append(get_index(token))

    l = two_shortest(temp)
    m = merge(temp[l[0]], temp[l[1]])

    while len(temp) > 1:
        # delete from temp the already merged lists
        del temp[l[0]]
        del temp[l[1]]
        temp.append(m)

        l = two_shortest(temp)
        m = merge(temp[l[0]], temp[l[1]])

    for p in m:
        print(p.url)

# For now going to do a loop through each query's index and match it with the merged list (can be faster if i implement something during merge/search in order to keep track of the postings)


search(["a", "b", "c"])
2 tempCodeRunnerFile.py Normal file
@@ -0,0 +1,2 @@
for postings in l_posting:
@@ -54,9 +54,8 @@ class Worker(Thread):
             ngrams = self.indexer.get_tf_idf(x)
             if ngrams != -1:
                 tic = perf_counter()
-                for ngram, tfidf in ngrams.items():
-                    posting = Posting(self.indexer.get_url_id(data["url"]), tfidf[0])
-                    self.indexer.save_index(ngram,posting)
+                for ngram, posting in ngrams.items():
+                    self.indexer.save_index(ngram, posting)
                 toc = perf_counter()
                 print("Took " + str(toc - tic) + " seconds to save ngram")