Search_Engine/search.py
2022-05-27 23:00:45 -07:00

280 lines
9.7 KiB
Python

#Data input
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle
import sys
import math
import numpy as np
sys.path.append('D:/Visual Studio Workspace')
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re
from indexer import Indexer
#Logging postings
from posting import Posting
from worker import Worker
import indexer
class Search():
    """Conjunctive (AND) search with cosine-similarity ranking over an
    inverted index built by Indexer."""

    # NOTE(review): much of this was prototyped in searchtesting.py, so some
    # variable names and call sites may still be out of sync with that file.
    def __init__(self, indexer):
        # Keep the index handle and eagerly load the lookup tables the
        # query path needs (postings locations and document weights).
        self.indexer = indexer
        self.indexer.load_index_index()
        self.indexer.load_weight_index()
        # Queries must be stemmed the same way the index was built.
        self.stemmer = PorterStemmer()
# takes a list of posting lists returns a list of indexes of the querys with the two shortest postings list that corresponds to search temp list
def two_shortest(self, l_posting):
short = []
location = []
for postings in l_posting:
short.append(len(postings))
for i in range(2):
x = short.index(min(short))
location.append(x)
short[x] = float('inf')
return location
    # len(list1) <= len(list2) So the code in this function works with that in mind
    def merge(self, list1, list2):
        """Intersect two postings lists sorted by ascending 'doc_id'.

        Returns (valid1, valid2): the postings from list1 and list2 whose
        doc_id occurs in both lists, in matching order. Two cursors (i, j)
        advance one step at a time; two skip pointers (i4, j4 = cursor + 3)
        jump over runs of non-matching ids. When a skip pointer indexes past
        the end of a list, the bare `except` redoes the comparison with
        plain one-step advancing only.
        """
        max = 0       # number of matches collected so far (shadows builtin `max`)
        valid1 = []   # matching postings taken from list1
        valid2 = []   # matching postings taken from list2
        i = 0         # cursor into list1
        j = 0         # cursor into list2
        # TODO: optimize by having a pointer to the current index+4
        i4 = 3        # skip pointer for list1
        j4 = 3        # skip pointer for list2
        while i < len(list1) or j < len(list2):
            # Either list exhausted -> nothing further can match.
            if j == len(list2):
                break
            if i == len(list1):
                break
            #if max == 40:
            #break
            try:
                if i == len(list1)-1:
                    # i sits on the last element of list1: never skip i forward.
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        # Match: record the pair and advance both sides.
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 +=1
                        i += 1
                        i4 += 1
                        max += 1
                    elif list1[i]['doc_id'] >= list2[j4]['doc_id']:
                        # Even list2's skip target is still behind: jump j ahead.
                        # NOTE(review): list2[j4] may raise IndexError here; the
                        # bare except below is what handles that case.
                        j = j4
                        j4 = j + 3
                    elif list1[i4]['doc_id'] < list2[j]['doc_id'] and i4 < len(list1):
                        # list1's skip target still behind list2: jump i ahead.
                        i = i4
                        i4 = i + 3
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                else:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        # Match: record the pair and advance both sides.
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 +=1
                        i += 1
                        i4 += 1
                        max += 1
                    elif list1[i]['doc_id'] >= list2[j4]['doc_id'] and j4 < len(list2):
                        j = j4
                        j4 = j + 3
                    elif list1[i4]['doc_id'] < list2[j]['doc_id'] and i4 < len(list1):
                        # NOTE(review): list1[i4]/list2[j4] are evaluated before
                        # the bounds guard; an IndexError falls to the except.
                        i = i4
                        i4 = i + 3
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        # Fallback: advance both cursors.
                        j += 1
                        j4 +=1
                        i += 1
                        i4 += 1
            except:
                # A skip pointer ran past the end of a list; redo the same
                # comparison using only one-step advancing (no skip pointers).
                if i == len(list1)-1:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 +=1
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                else:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 +=1
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 +=1
                        i += 1
                        i4 += 1
        # Since list1 is shorter it will hit its max index sooner,
        # so in the cases were it does we still need to go through list2 to see if the last element of list1 appears anywhere in the rest of list2
        return valid1, valid2
# query is a list of stemmed tokens, returns a list of postings (which we'll directly ignore except for the doc id)
def search(self, query):
tokens = word_tokenize(query)
stemmed_tokens = list()
for token in tokens:
token = self.stemmer.stem(token)
stemmed_tokens.append(token)
query_valid_postings = dict()
temp = []
for token in stemmed_tokens:
temp.append(self.indexer.get_postings(token))
query_valid_postings[token] = []
tic = perf_counter()
l = self.two_shortest(temp)
m = self.merge(temp[l[0]], temp[l[1]])
if len(m[0]) == 0:
return -1
# Keep track of the valid postings for each query as we do merge
first = stemmed_tokens[l[0]]
query_valid_postings[first] = m[0]
query_valid_postings[stemmed_tokens[l[1]]] = m[1]
toc = perf_counter()
print("first merge", toc-tic)
tic = perf_counter()
while len(temp) > 1:
# delete from temp the already merged lists
temp.pop(l[0])
# Try and except since temp length changes
try:
temp.pop(l[1])
except:
temp.pop(l[1]-1)
temp.append(m[0])
# Delete and append to query to make it consistent with temp
stemmed_tokens.pop(l[0])
try:
stemmed_tokens.pop(l[1])
except:
stemmed_tokens.pop(l[1]-1)
stemmed_tokens.append(None)
l = self.two_shortest(temp)
# Checks if contents in l are the same
if len(set(l)) == 1:
break
else:
m = self.merge(temp[l[0]], temp[l[1]])
print(len(m[0]), len(m[1]))
query_valid_postings[first] = m[0]
query_valid_postings[stemmed_tokens[l[1]]] = m[1]
toc = perf_counter()
print("while loop", toc-tic)
tic = perf_counter()
# Create list of doc ids of correct merged postings for cross checking
merge = []
for posting in query_valid_postings[first]:
merge.append(posting['doc_id'])
# Cross checking each query's valid postings list with correct merged set which we donated as being first
for token, postings in query_valid_postings.items():
if token == first:
continue
else:
print(token)
for p in postings:
if p['doc_id'] not in merge:
postings.remove(p)
toc = perf_counter()
print(toc-tic)
for token, postings in query_valid_postings.items():
print(token, len(postings))
tic = perf_counter()
results = []
for i in range(len(query_valid_postings[first])):
q_denom = 0
norm_q = []
norm_d = []
for q in query_valid_postings.keys():
q_denom += (query_valid_postings[q][i]['tf_idf']/(1 + math.log(query_valid_postings[q][i]['tf_raw'])))**2
q_denom = math.sqrt(q_denom)
for q in query_valid_postings.keys():
x = query_valid_postings[q][i]['tf_idf']/(1 + math.log(query_valid_postings[q][i]['tf_raw']))/q_denom
norm_q.append(x)
y = (1 + math.log(query_valid_postings[q][i]['tf_raw']))/self.indexer.get_weight(query_valid_postings[q][i]['doc_id'])
norm_d.append(y)
results.append({'url' :query_valid_postings[first][i]['url'], 'cosine' : np.dot(norm_q, norm_d)})
results = sorted(results, key = lambda x: x['cosine'], reverse = True)
finalresults = []
for i in range(20):
finalresults.append(results[i]['url'])
print(finalresults)
return finalresults