# Toy inverted-index search demo: Posting objects, sorted posting-list
# intersection (merge), and a boolean AND search over a small hard-coded index.
# Standard library
import json
import math
import os
import pickle
import re
import shelve
import threading
import time
from time import perf_counter

# Third-party
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Data processing
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
class Posting:
	"""One occurrence record of a term: the document it appears in plus stats."""

	def __init__(self, url, rtf, position):
		# Document identifier this posting belongs to.
		self.url = url
		# Raw term frequency supplied by the indexer.
		self.rtf = rtf
		# Occurrence counter; every new posting starts at one.
		self.tf = 1
		# TF-IDF weight, computed later (starts unset).
		self.tfidf = 0
		# Token positions of the term inside the document.
		self.positions = [position]
# Toy inverted index: each term maps to a posting list sorted by doc id.
d = {
    'a': [Posting(0, 1, 1), Posting(2, 1, 1), Posting(3, 1, 1), Posting(8, 1, 1)],
    'b': [Posting(0, 1, 1), Posting(8, 1, 1)],
    'c': [Posting(0, 1, 1), Posting(1, 1, 1), Posting(2, 1, 1), Posting(8, 1, 1)],
}
def get_index(word):
    """Return the posting list for *word*, or None if the term is unknown.

    The original scanned every (key, value) pair of the index dict; a
    direct dict lookup is O(1) and returns the same None on a miss.
    """
    return d.get(word)
# takes a list of posting lists, returns the indexes of the two shortest lists
def two_shortest(l_posting):
    """Return the indices of the two shortest posting lists in *l_posting*.

    Picks the minimum length twice, masking each pick with infinity so
    the second pass finds the next-shortest list. With a single input
    list this yields [0, 0], matching the original behavior.
    """
    lengths = [len(postings) for postings in l_posting]
    picked = []
    for _ in range(2):
        idx = lengths.index(min(lengths))
        picked.append(idx)
        lengths[idx] = float('inf')
    return picked
# len(list1) <= len(list2) is the caller's convention, but the standard
# two-pointer intersection below is correct regardless of which is shorter.
def merge(list1, list2):
    """Intersect two posting lists sorted ascending by doc id (.url).

    Returns the postings from *list1* whose url also occurs in *list2*.

    Fixes three defects in the original:
      * ``return merged,`` — a stray trailing comma made it return a
        1-tuple containing the list instead of the list itself;
      * ``break`` when list1[i].url < list2[j].url abandoned the scan
        and dropped any later matches (e.g. [1,5] ∩ [2,5] lost 5);
      * on a mismatch it advanced BOTH pointers, skipping matches —
        only the pointer at the smaller url may advance.
    """
    merged = []
    i = 0
    j = 0
    while i < len(list1) and j < len(list2):
        if list1[i].url == list2[j].url:
            merged.append(list1[i])
            i += 1
            j += 1
        elif list1[i].url < list2[j].url:
            # list1's current doc id cannot appear later in list2.
            i += 1
        else:
            # list2's current doc id cannot appear later in list1.
            j += 1
    return merged
# query is a list of stemmed tokens; prints the doc ids common to every
# token's posting list (boolean AND).
def search(query):
    """Intersect the posting lists of all *query* tokens and print doc ids.

    Repeatedly merges the two shortest remaining posting lists (cheapest
    intersection first) until a single result list remains.

    Fixes a deletion bug in the original: it ran ``del temp[l[0]]`` then
    ``del temp[l[1]]``, but the first delete shifts indices, so the
    second delete removed the wrong list (or raised IndexError). The
    higher index must be deleted first.
    """
    temp = []
    for token in query:
        temp.append(get_index(token))

    def _merge_pair(pair):
        # One intersection step; tolerate merge historically returning a
        # 1-tuple (stray trailing comma) as well as a plain list.
        m = merge(temp[pair[0]], temp[pair[1]])
        if isinstance(m, tuple):
            m = m[0]
        return m

    l = two_shortest(temp)
    m = _merge_pair(l)

    while len(temp) > 1:
        # Remove the two just-merged lists; delete the higher index
        # first so the lower index is still valid afterwards.
        for idx in sorted(l[:2], reverse=True):
            del temp[idx]
        temp.append(m)

        l = two_shortest(temp)
        m = _merge_pair(l)

    for p in m:
        print(p.url)

    # NOTE(review): ranking/posting-merge bookkeeping could be tracked
    # during the merge itself to avoid a second pass over the query terms.
# Guard the demo invocation so importing this module has no side effects.
if __name__ == "__main__":
    search(["a", "b", "c"])