# Data input
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle
import sys
import math

sys.path.append('D:/Visual Studio Workspace')

# Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

import re
from indexer import Indexer

# Logging postings
from posting import Posting
from worker import Worker
import indexer


class Search():
    # NOTE: the code for testing was written in searchtesting.py, so many of the
    # variables and function calls there are wrong.

    def __init__(self, indexer):
        self.indexer = indexer
        self.indexer.load_index_index()
        self.indexer.load_weight_index()
        self.stemmer = PorterStemmer()

    # Takes a list of posting lists and returns the indexes (into the temp list
    # built in search()) of the two query terms with the shortest posting lists.
    def two_shortest(self, l_posting):
        short = []
        location = []
        for postings in l_posting:
            short.append(len(postings))

        # Find the two smallest lengths; mark each chosen slot as used with infinity
        for i in range(2):
            x = short.index(min(short))
            location.append(x)
            short[x] = float('inf')

        return location

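    # A small illustration with hypothetical posting-list lengths: given lists
    # of lengths [5, 2, 9, 3], two_shortest returns [1, 3] -- the index of the
    # shortest list first, then the index of the second shortest.
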
    # Assumes len(list1) <= len(list2); the code in this function is written with that in mind.
    def merge(self, list1, list2):
        match_count = 0
        valid1 = []
        valid2 = []
        i = 0
        j = 0
        # TODO: optimize by having a pointer to the current index+4
        # i4 and j4 are look-ahead (skip) pointers three positions ahead of i and j
        i4 = 3
        j4 = 3
        while i < len(list1) or j < len(list2):
            if j == len(list2):
                break
            if i == len(list1):
                break
            #if match_count == 40:
            #    break
            try:
                if i == len(list1)-1:
                    # i is at the last posting of list1
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                        match_count += 1
                    elif list1[i]['doc_id'] >= list2[j4]['doc_id']:
                        # Skip ahead in list2
                        j = j4
                        j4 = j + 3
                    elif list1[i4]['doc_id'] < list2[j]['doc_id'] and i4 < len(list1):
                        # Skip ahead in list1
                        i = i4
                        i4 = i + 3
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                else:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                        match_count += 1
                    elif list1[i]['doc_id'] >= list2[j4]['doc_id'] and j4 < len(list2):
                        # Skip ahead in list2
                        j = j4
                        j4 = j + 3
                    elif list1[i4]['doc_id'] < list2[j]['doc_id'] and i4 < len(list1):
                        # Skip ahead in list1
                        i = i4
                        i4 = i + 3
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
            except IndexError:
                # A look-ahead pointer ran past the end of its list, so fall
                # back to the plain one-step merge
                if i == len(list1)-1:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                else:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
        # Since list1 is shorter it will hit its max index sooner, so in the
        # cases where it does we still need to go through list2 to see if the
        # last element of list1 appears anywhere in the rest of list2.

        return valid1, valid2

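    # A small illustration of merge with hypothetical postings (each posting is
    # assumed to be a dict with a 'doc_id' key, as used above):
    #   list1 = [{'doc_id': 1}, {'doc_id': 4}, {'doc_id': 7}]
    #   list2 = [{'doc_id': 1}, {'doc_id': 2}, {'doc_id': 4}, {'doc_id': 9}]
    #   merge(list1, list2) -> ([{'doc_id': 1}, {'doc_id': 4}],
    #                           [{'doc_id': 1}, {'doc_id': 4}])
    # i.e. both returned lists are restricted to the doc ids the inputs share.
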
    # query is the raw query string; it is tokenized and stemmed below.
    # Returns a list of the top result URLs, or -1 if no document matches every term.
    def search(self, query):
        tokens = word_tokenize(query)
        stemmed_tokens = list()
        for token in tokens:
            token = self.stemmer.stem(token)
            stemmed_tokens.append(token)

        # One postings list per stemmed token, in the same order as stemmed_tokens
        query_valid_postings = dict()
        temp = []
        for token in stemmed_tokens:
            temp.append(self.indexer.get_postings(token))
            query_valid_postings[token] = []

        tic = perf_counter()
        l = self.two_shortest(temp)
        m = self.merge(temp[l[0]], temp[l[1]])
        if len(m[0]) == 0:
            return -1
        # Keep track of the valid postings for each query term as we merge
        first = stemmed_tokens[l[0]]
        query_valid_postings[first] = m[0]
        query_valid_postings[stemmed_tokens[l[1]]] = m[1]
        toc = perf_counter()
        print("first merge", toc-tic)

        tic = perf_counter()
        while len(temp) > 1:
            # Delete from temp the two lists that were just merged. Popping l[0]
            # shifts every later index down by one, so adjust l[1] when it comes
            # after l[0].
            temp.pop(l[0])
            if l[1] > l[0]:
                temp.pop(l[1] - 1)
            else:
                temp.pop(l[1])

            temp.append(m[0])

            # Delete and append on stemmed_tokens as well to keep it consistent with temp
            stemmed_tokens.pop(l[0])
            if l[1] > l[0]:
                stemmed_tokens.pop(l[1] - 1)
            else:
                stemmed_tokens.pop(l[1])

            stemmed_tokens.append(None)

            l = self.two_shortest(temp)
            # If both entries of l are the same index, only one list is left to merge
            if len(set(l)) == 1:
                break
            else:
                m = self.merge(temp[l[0]], temp[l[1]])
                print(len(m[0]), len(m[1]))
                query_valid_postings[first] = m[0]
                query_valid_postings[stemmed_tokens[l[1]]] = m[1]

        toc = perf_counter()
        print("while loop", toc-tic)

        tic = perf_counter()
        # Create a set of doc ids from the fully merged postings for cross checking
        merged_ids = set()
        for posting in query_valid_postings[first]:
            merged_ids.add(posting['doc_id'])

        # Cross check each query term's valid postings list against the merged
        # set, which we designated as belonging to the term `first`
        for token, postings in query_valid_postings.items():
            if token == first:
                continue
            else:
                print(token)
                # Filter in place; removing items while iterating over the same
                # list would skip entries
                postings[:] = [p for p in postings if p['doc_id'] in merged_ids]

        toc = perf_counter()
        print(toc-tic)

        for token, postings in query_valid_postings.items():
            print(token, len(postings))

        tic = perf_counter()
        results = []
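
        # Scoring sketch: for every candidate document we build a normalized
        # query vector and a document vector and rank by their dot product
        # (cosine similarity). The query-side weight tf_idf / (1 + log(tf_raw))
        # recovers the idf component if the stored tf_idf was computed as
        # (1 + log(tf_raw)) * idf -- an assumption about how indexer.py fills
        # the posting fields. The document-side weight is the log-scaled term
        # frequency divided by the per-document value from the weight index,
        # which is assumed to be that document's vector length.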
        for i in range(len(query_valid_postings[first])):
            q_denom = 0
            norm_q = []
            norm_d = []

            # Denominator for normalizing the query-side weights
            for q in query_valid_postings.keys():
                q_denom += (query_valid_postings[q][i]['tf_idf'] / (1 + math.log(query_valid_postings[q][i]['tf_raw'])))**2
            q_denom = math.sqrt(q_denom)

            for q in query_valid_postings.keys():
                x = query_valid_postings[q][i]['tf_idf'] / (1 + math.log(query_valid_postings[q][i]['tf_raw'])) / q_denom
                norm_q.append(x)
                y = (1 + math.log(query_valid_postings[q][i]['tf_raw'])) / self.indexer.get_weight(query_valid_postings[q][i]['doc_id'])
                norm_d.append(y)
            results.append({'url': query_valid_postings[first][i]['url'], 'cosine': np.dot(norm_q, norm_d)})

        # Rank by cosine similarity and return up to the top 20 URLs
        results = sorted(results, key=lambda x: x['cosine'], reverse=True)
        finalresults = []
        for i in range(min(20, len(results))):
            finalresults.append(results[i]['url'])
        print(finalresults)
        return finalresults
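

# Hypothetical usage sketch: the Indexer constructor signature is not shown in
# this file, so the no-argument call below is an assumption; adjust it to match
# indexer.py. The query string is only an example.
if __name__ == "__main__":
    search_engine = Search(Indexer())  # assumption: Indexer() takes no arguments
    result = search_engine.search("software engineering courses")
    if result == -1:
        print("no documents contained every query term")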