From 60f6eb0df044105e4184544aaf98fe043940de27 Mon Sep 17 00:00:00 2001
From: unknown
Date: Thu, 26 May 2022 23:34:29 -0700
Subject: [PATCH] Add search functionality to obtain the set of matching documents

---
 indexer.py            |   1 -
 search.py             | 135 +++++++++++++++++-------------------------
 searchtesting.py      | 117 ++++++++++++++++++++++++++++++++++++
 tempCodeRunnerFile.py |   2 +
 worker.py             |   5 +-
 5 files changed, 176 insertions(+), 84 deletions(-)
 create mode 100644 searchtesting.py
 create mode 100644 tempCodeRunnerFile.py

diff --git a/indexer.py b/indexer.py
index 6a17b5f..9369c4b 100644
--- a/indexer.py
+++ b/indexer.py
@@ -172,7 +172,6 @@ class Indexer():
             for i in range(len(split)):
                 if split[i] not in tokens:
                     tokens[split[i]] = Posting(self.get_url_id(url), 1, i)
-                else:
                     tokens[split[i]].rtf += 1
                     tokens[split[i]].tf = (1 + math.log(tokens[split[i]].rtf))
diff --git a/search.py b/search.py
index 9c83b28..c991576 100644
--- a/search.py
+++ b/search.py
@@ -23,109 +23,84 @@ from posting import Posting
 from worker import Worker
 class Search():
-    
+    # The search logic below was prototyped in searchtesting.py; get_index in particular
+    # still refers to that test setup and needs to be wired up to the on-disk index.
     def __init__(self):
-        self.save_1 = shelve.open("save_1.shelve")
-        self.save_2 = shelve.open("save_2.shelve")
-        self.save_3 = shelve.open("save_3.shelve")
-        self.save_4 = shelve.open("save_4.shelve")
-        self.save_5 = shelve.open("save_5.shelve")
         self.stemmer = PorterStemmer()
         p = os.path.dirname(os.path.abspath(__file__))
         my_filename = os.path.join(p, "urlID.pkl")
         self.f = open(my_filename, "rb+")
         self.id = pickle.load(self.f)
-    def get_save_file(self, word):
-        word_lower = word.lower()
+    # Takes a list of posting lists and returns the indexes (into that list) of the two shortest ones.
+    def two_shortest(self, l_posting):
+        short = []
+        location = []
+        for postings in l_posting:
+            short.append(len(postings))
+
+        for i in range(2):
+            x = short.index(min(short))
+            location.append(x)
+            short[x] = float('inf')
+
+        return location
 
-        if re.match(r"^[a-d0-1].*", word_lower):
-            return self.save_1
-        elif re.match(r"^[e-k2-3].*", word_lower):
-            return self.save_2
-        elif re.match(r"^[l-q4-7].*", word_lower):
-            return self.save_3
-        elif re.match(r"^[r-z8-9].*", word_lower):
-            return self.save_4
-        else:
-            return self.save_5
-
-    # looks for the smallest list and largest list
-    def find_extremes(self, q):
-        longest = float('-inf')
-        shortest = float('inf')
-        remaining = []
-        # Careful if there is a word that the indexer doesn't have
-        for word in q:
-            d = self.get_save_file(word)
-            if len(d[word]) > longest:
-                longest = len(d[word])
-                l = word
-            elif len(d[word]) < shortest:
-                shortest = len(d[word])
-                s = word
-        for word in q:
-            if word != l or word != s:
-                remaining.append(word)
-        return s, l, remaining
-
-    def merge(self, short, long, r):
-        m = []
+    # Assumes len(list1) <= len(list2); the code below relies on that ordering.
+    def merge(self, list1, list2):
+        merged = []
         i = 0
         j = 0
-        s = self.get_save_file(short)
-        l = self.get_save_file(long)
-        while i < len(s[short]) or j < len(l[long]):
-            if i == len(d[short])-1:
-                if s[short][i].url == l[long][j].url:
-                    m.append(s[short][i].url)
+        # TODO: optimize with a skip pointer (e.g. to the current index+4) so runs of
+        # non-matching doc ids can be jumped over
+        while i < len(list1) or j < len(list2):
+            if j == len(list2):
+                break
+            if i == len(list1):
+                break
+            # Since list1 is shorter it will hit its last index sooner; when it does, we still
+            # need to walk the rest of list2 to see if the last element of list1 appears there
+            if i == len(list1)-1:
+                if list1[i].url == list2[j].url:
+                    merged.append(list1[i])
                     j += 1
-            elif s[short][i].url < l[long][j].url:
+                    i += 1
+                elif list1[i].url < list2[j].url:
                     break
                 else:
                     j += 1
             else:
-                if s[short][i].url == l[long][j].url:
-                    m.append(d[short][i].url)
+                if list1[i].url == list2[j].url:
+                    merged.append(list1[i])
                     i += 1
                     j += 1
-                elif s[short][i].url < l[long][j].url:
-                    break
+                elif list1[i].url < list2[j].url:
+                    # list1's doc id is smaller: advance list1 (breaking here would drop later matches)
+                    i += 1
                 else:
-                    i += 1
                     j += 1
+        return merged
 
-        final = []
-        if len(m) > 0:
-            while len(r) > 0:
-                d = self.get_save_file(r[0])
-                for i in d[r[0]]:
-                    if i.url > m[len(m) -1]:
-                        break
-                    elif i.url in m:
-                        final.append(i.url)
-                if len(final) != len(m):
-                    m = final
-                    final = []
-                    r.pop(0)
-                else:
-                    final = []
-                    r.pop(0)
-
-        return m
-        else:
-            return -1
+    # query is a list of stemmed tokens; prints the doc ids of documents that contain
+    # every token (for now the postings are ignored except for the doc id)
+    def search(self, query):
+        temp = []
+        for token in query:
+            # get_index comes from the searchtesting.py prototype and still needs a
+            # replacement that loads the token's posting list from the saved index
+            temp.append(get_index(token))
 
-    def search(self):
-        query = input("Enter query: ")
-        query = [self.stemmer.stem(i) for i in query.split()]
-        x = self.find_extremes(query)
-        match = self.merge(x[0], x[1], x[2])
-        if match == -1:
-            print("No valid matches")
-        else:
-            for i in match:
-                print(self.id[i])
+        # repeatedly merge the two shortest remaining lists until one merged list is left
+        while len(temp) > 1:
+            l = self.two_shortest(temp)
+            m = self.merge(temp[l[0]], temp[l[1]])
+            # delete the two just-merged lists from temp, highest index first so the
+            # second delete is not shifted by the first
+            for x in sorted(l, reverse=True):
+                del temp[x]
+            temp.append(m)
+
+        for p in temp[0]:
+            print(p.url)
+
+        # For now, ranking will loop through each query term's posting list and match it
+        # against the merged doc ids (sketched after this diff); this could be faster if
+        # the postings were tracked during merge/search itself
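
The closing comment above describes a ranking pass that is not implemented in this patch. A minimal sketch of what that loop could look like, assuming the merged result and a mapping from each query term to its posting list are available; the name rank_candidates and the use of the stored tfidf weight are illustrative assumptions, not part of this commit:

# Illustrative sketch only, not part of this commit. Assumes `merged` holds Posting
# objects for documents containing every query term, and `query_postings` maps each
# query term to its full posting list.
def rank_candidates(merged, query_postings):
    matched = {p.url for p in merged}   # doc ids that survived the merge
    scores = {}                         # doc id -> accumulated weight
    for term, postings in query_postings.items():
        for posting in postings:
            if posting.url in matched:
                # accumulate whatever weight the index stored; tfidf is assumed here
                scores[posting.url] = scores.get(posting.url, 0) + posting.tfidf
    # highest-scoring doc ids first
    return sorted(scores, key=scores.get, reverse=True)
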
diff --git a/searchtesting.py b/searchtesting.py
new file mode 100644
index 0000000..b7379e9
--- /dev/null
+++ b/searchtesting.py
@@ -0,0 +1,117 @@
+import math
+import json
+import os
+import shelve
+from bs4 import BeautifulSoup
+from time import perf_counter
+import time
+import threading
+import pickle
+
+
+# Data processing
+from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+from sklearn.feature_extraction.text import TfidfVectorizer
+import pandas as pd
+import numpy as np
+
+import re
+
+# Minimal stand-in for posting.Posting so the search logic can be exercised in isolation
+class Posting():
+    def __init__(self, url, rtf, position):
+        self.url = url
+        self.rtf = rtf
+        self.tf = 1
+        self.tfidf = 0
+        self.positions = [position]
+
+
+# Toy inverted index: term -> posting list, sorted by doc id
+d = {
+    'a' : [Posting(0, 1, 1), Posting(2, 1, 1), Posting(3, 1, 1), Posting(8, 1, 1)],
+    'b' : [Posting(0, 1, 1), Posting(8, 1, 1)],
+    'c' : [Posting(0, 1, 1), Posting(1, 1, 1), Posting(2, 1, 1), Posting(8, 1, 1)]
+    }
+
+def get_index(word):
+    for k, v in d.items():
+        if k == word:
+            return v
+
+# Takes a list of posting lists and returns the indexes (into that list) of the two shortest ones.
+def two_shortest(l_posting):
+    short = []
+    location = []
+    for postings in l_posting:
+        short.append(len(postings))
+
+    for i in range(2):
+        x = short.index(min(short))
+        location.append(x)
+        short[x] = float('inf')
+
+    return location
+
+# Assumes len(list1) <= len(list2); the code below relies on that ordering.
+def merge(list1, list2):
+    merged = []
+    i = 0
+    j = 0
+    # TODO: optimize with a skip pointer (e.g. to the current index+4) so runs of
+    # non-matching doc ids can be jumped over
+    while i < len(list1) or j < len(list2):
+        if j == len(list2):
+            break
+        if i == len(list1):
+            break
+        # Since list1 is shorter it will hit its last index sooner; when it does, we still
+        # need to walk the rest of list2 to see if the last element of list1 appears there
+        if i == len(list1)-1:
+            if list1[i].url == list2[j].url:
+                merged.append(list1[i])
+                j += 1
+                i += 1
+            elif list1[i].url < list2[j].url:
+                break
+            else:
+                j += 1
+        else:
+            if list1[i].url == list2[j].url:
+                merged.append(list1[i])
+                i += 1
+                j += 1
+            elif list1[i].url < list2[j].url:
+                # list1's doc id is smaller: advance list1 (breaking here would drop later matches)
+                i += 1
+            else:
+                j += 1
+    return merged
+
+# query is a list of stemmed tokens; prints the doc ids of documents that contain
+# every token (for now the postings are ignored except for the doc id)
+def search(query):
+    temp = []
+    for token in query:
+        temp.append(get_index(token))
+
+    # repeatedly merge the two shortest remaining lists until one merged list is left
+    while len(temp) > 1:
+        l = two_shortest(temp)
+        m = merge(temp[l[0]], temp[l[1]])
+        # delete the two just-merged lists from temp, highest index first so the
+        # second delete is not shifted by the first
+        for x in sorted(l, reverse=True):
+            del temp[x]
+        temp.append(m)
+
+    for p in temp[0]:
+        print(p.url)
+
+    # For now, ranking will loop through each query term's posting list and match it
+    # against the merged doc ids; this could be faster if the postings were tracked
+    # during merge/search itself
+
+
+search(["a", "b", "c"])
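
With the toy dictionary above, the only doc ids shared by 'a' (0, 2, 3, 8), 'b' (0, 8) and 'c' (0, 1, 2, 8) are 0 and 8, so search(["a", "b", "c"]) should print exactly those two ids. A quick set-based cross-check that could be dropped at the bottom of searchtesting.py; the set intersection is only for the test, the real index keeps sorted posting lists:

# Sanity check for the toy data: a plain set intersection should agree with merge()
expected = set.intersection(*({p.url for p in plist} for plist in d.values()))
print(sorted(expected))  # expected output: [0, 8]
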
diff --git a/tempCodeRunnerFile.py b/tempCodeRunnerFile.py
new file mode 100644
index 0000000..02987ef
--- /dev/null
+++ b/tempCodeRunnerFile.py
@@ -0,0 +1,2 @@
+
+    for postings in l_posting:
\ No newline at end of file
diff --git a/worker.py b/worker.py
index c73d5c1..fe37356 100644
--- a/worker.py
+++ b/worker.py
@@ -54,9 +54,8 @@ class Worker(Thread):
             ngrams = self.indexer.get_tf_idf(x)
             if ngrams != -1:
                 tic = perf_counter()
-                for ngram, tfidf in ngrams.items():
-                    posting = Posting(self.indexer.get_url_id(data["url"]), tfidf[0])
-                    self.indexer.save_index(ngram,posting)
+                for ngram, posting in ngrams.items():
+                    self.indexer.save_index(ngram, posting)
                 toc = perf_counter()
                 print("Took " + str(toc - tic) + " seconds to save ngram")
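
The old Search.search() read the query from stdin, stemmed it, and printed the matching URLs; the new search(query) expects the caller to do the stemming. A minimal driver under that assumption; the __main__ guard below mirrors the removed interactive loop and is not added by this patch:

# Illustrative driver, not part of this commit.
if __name__ == "__main__":
    searcher = Search()
    query = input("Enter query: ")
    tokens = [searcher.stemmer.stem(t) for t in query.split()]
    searcher.search(tokens)  # prints the doc ids of documents containing every token
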