Search_Engine/search.py

#Data input
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle


#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

import re

#Logging postings
from posting import Posting
from worker import Worker

class Search():
    # wrote the code for testing in the file searchtesting.py so many of the variables and function calls are wrong.
    def __init__(self):
        self.stemmer = PorterStemmer()
        p = os.path.dirname(os.path.abspath(__file__))
        my_filename = os.path.join(p, "urlID.pkl")
        self.f = open(my_filename, "rb+")
        self.id = pickle.load(self.f)

    # takes a list of posting lists returns a list of indexes that correspond to search temp list
    def two_shortest(self, l_posting):
        short = []
        location = []
        for postings in l_posting:
            short.append(len(postings))

        for i in range(2):
            x = short.index(min(short))
            location.append(x)
            short[x] = float('inf')

        return location

    # len(list1) <= len(list2) So the code in this function works with that in mind
    def merge(self, list1, list2):
        merged = []
        i = 0
        j = 0
        # TODO: optimize by having a pointer to the current index+4
        while i < len(list1) or j < len(list2):
            if j == len(list2):
                break
            if i == len(list1):
                break
            # Since list1 is shorter it will hit its max index sooner,
            #   so in the cases were it does we still need to go through list2 to see if the last element of list1 appears anywhere in the rest of list2
            if i == len(list1)-1:
                if list1[i].url == list2[j].url:
                    merged.append(list1[i])
                    j += 1
                    i += 1
                elif list1[i].url < list2[j].url:
                    break
                else:
                    j += 1
            else:
                if list1[i].url == list2[j].url:
                    merged.append(list1[i])
                    i += 1
                    j += 1
                elif list1[i].url < list2[j].url:
                    break
                else:
                    i += 1
                    j += 1
        return merged

    # query is a list of stemmed tokens, returns a list of postings (which we'll directly ignore except for the doc id)
    def search(self, query):
        temp = []
        for token in query:
            temp.append(get_index(token))

        l = two_shortest(temp)
        m = merge(temp[l[0]], temp[l[1]])

        while len(temp) > 1:
            # delete from temp the already merged lists
            del temp[l[0]]
            del temp[l[1]]
            temp.append(m)

            l = two_shortest(temp)
            m = merge(temp[l[0]], temp[l[1]])

        for p in m:
            print(p.url)

        # For now going to do a loop through each query's index and match it with the merged list (can be faster if i implement something during merge/search in order to keep track of the postings)