import math
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle

# Data processing
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

import re

# A posting records one term's occurrences in one document: url is the doc id,
# rtf is presumably the raw term frequency, and positions lists where the term
# appears in the document.
class Posting():
    def __init__(self, url, rtf, position):
        self.url = url
        self.rtf = rtf
        self.tf = 1
        self.tfidf = 0
        self.positions = [position]


# A small hand-built inverted index for testing; integer doc ids stand in for
# urls, and each posting list is sorted by doc id.
d = {
    'a': [Posting(0, 1, 1), Posting(2, 1, 1), Posting(3, 1, 1), Posting(8, 1, 1)],
    'b': [Posting(0, 1, 1), Posting(8, 1, 1)],
    'c': [Posting(0, 1, 1), Posting(1, 1, 1), Posting(2, 1, 1), Posting(8, 1, 1)]
}

# Look up a term's posting list; returns None when the term is not indexed.
def get_index(word):
    return d.get(word)
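
# Quick sanity check, added for illustration (not part of the original script):
# 'b' was indexed in docs 0 and 8.
assert [p.url for p in get_index('b')] == [0, 8]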
# Takes a list of posting lists and returns the indexes (into that list) of the
# two shortest posting lists.
def two_shortest(l_posting):
    short = []
    location = []
    for postings in l_posting:
        short.append(len(postings))

    # Find the minimum twice, masking each winner with infinity so the second
    # pass picks the next-shortest list.
    for i in range(2):
        x = short.index(min(short))
        location.append(x)
        short[x] = float('inf')

    return location
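
# Quick sanity check, added for illustration: the toy posting lists above have
# lengths 4, 2, and 4 for 'a', 'b', 'c', so the two shortest sit at indexes 1 and 0.
assert two_shortest([d['a'], d['b'], d['c']]) == [1, 0]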

# Intersects two posting lists sorted by url (doc id). The caller passes the
# shorter list first, but the two-pointer walk below does not depend on that.
def merge(list1, list2):
    merged = []
    i = 0
    j = 0
    # TODO: optimize with a skip pointer (e.g. a pointer to the current index+4)
    while i < len(list1) and j < len(list2):
        if list1[i].url == list2[j].url:
            # Same document contains both terms: keep it and advance both lists.
            merged.append(list1[i])
            i += 1
            j += 1
        elif list1[i].url < list2[j].url:
            i += 1
        else:
            j += 1
    return merged
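
# Quick sanity check, added for illustration: doc ids 0 and 8 are the only ones
# present in both the 'b' and 'a' posting lists of the toy index above.
assert [p.url for p in merge(d['b'], d['a'])] == [0, 8]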

# query is a list of stemmed tokens; prints the doc ids (urls) of the postings
# shared by every token's posting list.
def search(query):
    temp = []
    for token in query:
        temp.append(get_index(token))

    # Repeatedly merge the two shortest posting lists until one list remains;
    # merging short lists first keeps the intermediate results small.
    while len(temp) > 1:
        l = two_shortest(temp)
        m = merge(temp[l[0]], temp[l[1]])

        # Delete the already-merged lists, highest index first, so the first
        # deletion does not shift the position of the second.
        for index in sorted(l, reverse=True):
            del temp[index]
        temp.append(m)

    for p in temp[0]:
        print(p.url)

# For now, loop through each query term's index and match it against the merged
# list (this could be faster if the postings were tracked during merge/search).


search(["a", "b", "c"])
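# With the toy index above, the call prints 0 and 8: the only doc ids present
# in all three posting lists.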