diff --git a/README.md b/README.md
index fbf04a7..043994a 100644
--- a/README.md
+++ b/README.md
@@ -1,39 +1,14 @@
 # Search_Engine
 Developing a mini search-engine in python using reverse-indexed stemming and other SEOs implementations
-## Part 1: The Reversed-Index
+Start the program by running python3 launcher.py
+A flask webpage will start.
+If you do not have any index files, the webpage will show you an error.
+There is a button at the top of the page called Run Indexer.
+THIS IS EXTREMELY TIME CONSUMING AND DANGEROUS. IT WILL DELETE THE INDEX IF YOU ALREADY HAVE ONE!
+So to safeguard this, you have to click the button five times in a row in five different refreshes of the page.
-### Create an inverted index for the corpus with data structures designed by you.
+You can also create the index by running python3 indexer.py
-- Tokens: all alphanumeric sequences in the dataset.
+After the indices are created you can go ahead and search through them.
-- Stop words: do not use stopping while indexing, i.e. use all words, even
-the frequently occurring ones.
-
-- Stemming: use stemming for better textual matches. Suggestion: Porter
-stemming, but it is up to you to choose.
-
-- Important text: text in bold (b, strong), in headings (h1, h2, h3), and
-in titles should be treated as more important than the in other places.
-
-Verify which are the relevant HTML tags to select the important words.
-
-### Building the inverted index
-Now that you have been provided the HTML files to index, you may build your
-inverted index off of them. The inverted index is simply a map with the token
-as a key and a list of its corresponding postings. A posting is the representation
-of the token’s occurrence in a document. The posting typically (not limited to)
-contains the following info (you are encouraged to think of other attributes that
-you could add to the index):
-- The document name/id the token was found in.
-- Its tf-idf score for that document (for MS1, add only the term frequency).
-
-### Some tips:
-- When designing your inverted index, you will think about the structure
-of your posting first.
-- You would normally begin by implementing the code to calculate/fetch
-the elements which will constitute your posting.
-- Modularize. Use scripts/classes that will perform a function or a set of
-closely related functions. This helps in keeping track of your progress,
-debugging, and also dividing work amongst teammates if you’re in a group.
-- We recommend you use GitHub as a mechanism to work with your team
-members on this project, but you are not required to do so.
+Notably
\ No newline at end of file
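Reviewer note: the rewritten README drops the old section describing the posting layout, but search.py (added below) still depends on that structure. Judging from the fields it reads, each posting is a dict with at least `doc_id`, `tf_raw`, `tf_idf`, and `url`. A minimal sketch of the implied shape; the field names come from search.py's accesses, while the token and all values are made up for illustration:

```python
# Hypothetical snapshot of the inverted index that indexer.py produces.
inverted_index = {
    "crista": [  # token -> postings, sorted by doc_id as merge() assumes
        {"doc_id": 12, "tf_raw": 3, "tf_idf": 1.87, "url": "https://example.edu/a"},
        {"doc_id": 57, "tf_raw": 1, "tf_idf": 0.62, "url": "https://example.edu/b"},
    ],
}

def get_postings(token):
    # Stand-in for Indexer.get_postings, which seeks into merged_index.index
    # via index_index; this sketch just reads the dict above.
    return inverted_index.get(token, [])
```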
diff --git a/indexer.py b/indexer.py
index 973d888..9fc91c6 100644
--- a/indexer.py
+++ b/indexer.py
@@ -19,7 +19,6 @@ from os.path import exists
 #Data process
 from nltk.tokenize import word_tokenize
 from nltk.stem import PorterStemmer
-from sklearn.feature_extraction.text import TfidfVectorizer
 import pandas as pd
 import numpy as np
 import re
@@ -60,7 +59,7 @@ class Indexer():
         self.list_partials_lock = Lock()
 
         #Loading index_index into memory
-        if exists("merged_index_index"):
+        if exists("merged_index.index"):
             merged_index_index = open("merged_index.index",'r')
             merged_index_index.seek(0,0)
             json_value = merged_index_index.readline()
@@ -79,6 +78,19 @@ class Indexer():
             json_value = merged_index_index.readline()
             data = json.loads(json_value)
             self.index_index = dict(data['index'])
+            return self.index_index
+        else:
+            print("Index files do not exist, please run the indexer first")
+            return None
+
+    def load_weight_index(self):
+        if exists("docs.weight"):
+            weight_file = open("docs.weight",'r')
+            weight_file.seek(0,0)
+            json_value = weight_file.readline()
+            data = json.loads(json_value)
+            self.weight = data
+            return self.weight
         else:
             print("Index files do not exists, please run the indexer first")
             return None
@@ -118,15 +130,7 @@ class Indexer():
         weight_file.close()
     def get_weight(self,doc_id):
-        if exists('docs.weight'):
-            weight = open('docs.weight','r')
-            weight.seek(0,0)
-            json_value = weight.readline()
-            data = json.loads(json_value)
-            return data[doc_id]
-        else:
-            print("Index files do not exists, please run the indexer first")
-            return None
+        return self.weight[doc_id]
     def get_data_path(self):
         for directory in os.listdir(self.path):
             for file in os.listdir(self.path + "/" + directory + "/"):
@@ -239,3 +243,7 @@ class Indexer():
         print("Finished merging into 1 big happy family")
         self.set_weight()
         print("I AM DONE INDEXING !")
+
+if __name__ == "__main__":
+    indexer = Indexer(list(),dict(),list())
+    indexer.create_index()
\ No newline at end of file
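Reviewer note: dropping the per-call file read in get_weight() is the important fix in this file. The old version re-opened and re-parsed docs.weight on every call, and search() calls it once per candidate posting; now the file is parsed once by load_weight_index() and get_weight() becomes a dict lookup. A sketch of the pattern in isolation, assuming docs.weight holds a flat JSON object mapping doc ids to precomputed document vector lengths (the exact layout is not shown in this diff):

```python
import json
from os.path import exists

class WeightStore:
    """Parse docs.weight once, then serve every later lookup from memory."""
    def __init__(self, path="docs.weight"):
        self.weight = None
        if exists(path):
            with open(path) as f:
                # Assumed layout: {"<doc_id>": <precomputed vector length>, ...}
                self.weight = json.load(f)

    def get(self, doc_id):
        # O(1) dict lookup instead of re-reading the file on every call
        return self.weight[doc_id]
```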
diff --git a/launcher.py b/launcher.py
index ecaca4c..d7265cb 100644
--- a/launcher.py
+++ b/launcher.py
@@ -1,37 +1,70 @@
 from indexer import Indexer
+from search import Search
 import time
 from flask import Flask
 from flask import render_template
 from flask import request
 
 app = Flask(__name__)
-errors = list()
-indexer = Indexer(list(),dict(),list())
+
+errors = None
+indexer = None
 search = None
+safe_guard = 1
+
+def get_data():
+    global indexer
+    indexer = Indexer(list(),dict(),list())
+
+    global search
+    search = Search(indexer)
+
+    global safe_guard
+    safe_guard = 1
+
+    global errors
+    errors = list()
+    if not indexer.load_index_index():
+        errors.append("Index of index is missing, probably should run the indexer")
+    if not indexer.load_weight_index():
+        errors.append("Weight index is missing, probably should run the indexer")
+
+
 @app.route('/',methods=['POST','GET'])
 def index():
-    errors = list()
-    if not indexer:
-        errors.append("Error in creating indexer module")
-    elif not indexer.load_index_index():
-        errors.append("Indexer does not exists, please run it first")
-    if not search:
-        errors.append("Error in creating search module")
+    global errors
+    global search
+    global indexer
+    global safe_guard
+    local_errors = errors
     if request.method == 'POST':
         if request.form.get('start-index') == "start":
             print("Making the indexer")
-            indexer.create_index()
-            return render_template('index.html',ips="Thanks for waiting you are ready to search.")
+            if safe_guard == 5:
+                safe_guard = 1
+                indexer.create_index()
+                indexer.load_index_index()
+                return render_template('index.html',ips="Thanks for waiting, you are ready to search.")
+            safe_guard = safe_guard + 1
+            return render_template('index.html',ips=str(safe_guard) + " DANGER! PROCEED ONLY IF YOU KNOW WHAT YOU ARE DOING, OTHERWISE STOP, INDEX MIGHT GET YEETED")
         if request.form.get('search_query') != "":
             search_query = request.form['search_query']
-            result = [['lorem','ipsi'],['lores','dolores']]
-            return render_template('index.html',results=result,errors=errors)
-        return render_template('index.html',errors=errors)
+            result = search.search(search_query)
+            safe_guard = 1
+            errors = list()
+            return render_template('index.html',results=result,errors=local_errors)
+        safe_guard = 1
+        errors = list()
+        return render_template('index.html',errors=local_errors)
     else:
-        return render_template('index.html',errors=errors)
+        safe_guard = 1
+        errors = list()
+        return render_template('index.html',errors=local_errors)
 
 if __name__ == "__main__":
-    app.run(debug=True)
-    
\ No newline at end of file
+    get_data()
+
+    app.run(debug=False)
+    
\ No newline at end of file
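Reviewer note on safe_guard: it is a module-level integer, so the five confirming clicks are counted globally across all visitors, not per session, and any other request resets the count to 1. That matches the README's "five refreshes" description but is worth knowing. Isolated from Flask, the confirm-before-destructive-action pattern the route implements looks roughly like this (a sketch, not part of the codebase):

```python
class ConfirmGate:
    """Require n consecutive confirmations before running a destructive action."""
    def __init__(self, n=5):
        self.n = n
        self.count = 1

    def confirm(self, action):
        if self.count == self.n:
            self.count = 1
            action()          # e.g. indexer.create_index()
            return True
        self.count += 1       # one step closer; any reset() starts over
        return False

    def reset(self):
        self.count = 1
```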
diff --git a/search.py b/search.py
new file mode 100644
index 0000000..4c8ae6f
--- /dev/null
+++ b/search.py
@@ -0,0 +1,279 @@
+#Data input
+import json
+import os
+import shelve
+from bs4 import BeautifulSoup
+from time import perf_counter
+import time
+import threading
+import pickle
+import sys
+import math
+import numpy as np
+
+sys.path.append('D:/Visual Studio Workspace')
+
+#Data process
+from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+from sklearn.feature_extraction.text import TfidfVectorizer
+import pandas as pd
+import numpy as np
+
+import re
+from indexer import Indexer
+
+#Logging postings
+from posting import Posting
+from worker import Worker
+import indexer
+
+class Search():
+    # Note: this code was first written for testing in searchtesting.py, so some variable names and function calls may still be wrong.
+    def __init__(self, indexer):
+        self.indexer = indexer
+        self.indexer.load_index_index()
+        self.indexer.load_weight_index()
+        self.stemmer = PorterStemmer()
+
+    # Takes a list of posting lists; returns the indexes (into the temp list built by search) of the two queries with the shortest posting lists.
+    def two_shortest(self, l_posting):
+        short = []
+        location = []
+        for postings in l_posting:
+            short.append(len(postings))
+
+        for i in range(2):
+            x = short.index(min(short))
+            location.append(x)
+            short[x] = float('inf')
+
+        return location
+
+    # Assumes len(list1) <= len(list2); the code below relies on that invariant.
+    def merge(self, list1, list2):
+        max = 0
+        valid1 = []
+        valid2 = []
+        i = 0
+        j = 0
+        # TODO: optimize by having a pointer to the current index+4
+        i4 = 3
+        j4 = 3
+        while i < len(list1) or j < len(list2):
+            if j == len(list2):
+                break
+            if i == len(list1):
+                break
+            #if max == 40:
+                #break
+            try:
+                if i == len(list1)-1:
+                    if list1[i]['doc_id'] == list2[j]['doc_id']:
+                        valid1.append(list1[i])
+                        valid2.append(list2[j])
+                        j += 1
+                        j4 += 1
+                        i += 1
+                        i4 += 1
+                        max += 1
+                    elif list1[i]['doc_id'] >= list2[j4]['doc_id']:
+                        j = j4
+                        j4 = j + 3
+                    elif list1[i4]['doc_id'] < list2[j]['doc_id'] and i4 < len(list1):
+                        i = i4
+                        i4 = i + 3
+                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
+                        i += 1
+                        i4 += 1
+                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
+                        j += 1
+                        j4 += 1
+                    else:
+                        j += 1
+                        j4 += 1
+                else:
+                    if list1[i]['doc_id'] == list2[j]['doc_id']:
+                        valid1.append(list1[i])
+                        valid2.append(list2[j])
+                        j += 1
+                        j4 += 1
+                        i += 1
+                        i4 += 1
+                        max += 1
+                    elif list1[i]['doc_id'] >= list2[j4]['doc_id'] and j4 < len(list2):
+                        j = j4
+                        j4 = j + 3
+
+                    elif list1[i4]['doc_id'] < list2[j]['doc_id'] and i4 < len(list1):
+                        i = i4
+                        i4 = i + 3
+                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
+                        i += 1
+                        i4 += 1
+                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
+                        j += 1
+                        j4 += 1
+                    else:
+                        j += 1
+                        j4 += 1
+                        i += 1
+                        i4 += 1
+            except:
+                if i == len(list1)-1:
+                    if list1[i]['doc_id'] == list2[j]['doc_id']:
+                        valid1.append(list1[i])
+                        valid2.append(list2[j])
+                        j += 1
+                        j4 += 1
+                        i += 1
+                        i4 += 1
+                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
+                        i += 1
+                        i4 += 1
+                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
+                        j += 1
+                        j4 += 1
+                    else:
+                        j += 1
+                        j4 += 1
+                else:
+                    if list1[i]['doc_id'] == list2[j]['doc_id']:
+                        valid1.append(list1[i])
+                        valid2.append(list2[j])
+                        j += 1
+                        j4 += 1
+                        i += 1
+                        i4 += 1
+                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
+                        i += 1
+                        i4 += 1
+                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
+                        j += 1
+                        j4 += 1
+                    else:
+                        j += 1
+                        j4 += 1
+                        i += 1
+                        i4 += 1
+        # Since list1 is shorter it will hit its max index sooner,
+        # so in the cases where it does we still need to go through list2 to see if the last element of list1 appears anywhere in the rest of list2.
+        return valid1, valid2
+
+    # query is the raw query string; returns a ranked list of result URLs (or -1 when the first merge finds nothing).
+    def search(self, query):
+        tokens = word_tokenize(query)
+        stemmed_tokens = list()
+        for token in tokens:
+            token = self.stemmer.stem(token)
+            stemmed_tokens.append(token)
+
+        query_valid_postings = dict()
+        temp = []
+        for token in stemmed_tokens:
+            temp.append(self.indexer.get_postings(token))
+            query_valid_postings[token] = []
+
+        tic = perf_counter()
+        l = self.two_shortest(temp)
+        m = self.merge(temp[l[0]], temp[l[1]])
+        if len(m[0]) == 0:
+            return -1
+        # Keep track of the valid postings for each query as we do merge
+        first = stemmed_tokens[l[0]]
+        query_valid_postings[first] = m[0]
+        query_valid_postings[stemmed_tokens[l[1]]] = m[1]
+        toc = perf_counter()
+        print("first merge", toc-tic)
+
+        tic = perf_counter()
+        while len(temp) > 1:
+            # delete from temp the already merged lists
+            temp.pop(l[0])
+            # Try and except since temp length changes after the first pop
+            try:
+                temp.pop(l[1])
+            except:
+                temp.pop(l[1]-1)
+
+            temp.append(m[0])
+
+            # Delete and append to stemmed_tokens to keep it consistent with temp
+            stemmed_tokens.pop(l[0])
+            try:
+                stemmed_tokens.pop(l[1])
+            except:
+                stemmed_tokens.pop(l[1]-1)
+
+            stemmed_tokens.append(None)
+
+            l = self.two_shortest(temp)
+            # Checks if the contents of l are the same
+            if len(set(l)) == 1:
+                break
+            else:
+                m = self.merge(temp[l[0]], temp[l[1]])
+                print(len(m[0]), len(m[1]))
+                query_valid_postings[first] = m[0]
+                query_valid_postings[stemmed_tokens[l[1]]] = m[1]
+        toc = perf_counter()
+        print("while loop", toc-tic)
+
+        tic = perf_counter()
+        # Create list of doc ids of correctly merged postings for cross checking
+        merge = []
+        for posting in query_valid_postings[first]:
+            merge.append(posting['doc_id'])
+
+        # Cross checking each query's valid postings list against the correct merged set, which we denoted as first
+        for token, postings in query_valid_postings.items():
+            if token == first:
+                continue
+            else:
+                print(token)
+                for p in postings:
+                    if p['doc_id'] not in merge:
+                        postings.remove(p)
+
+        toc = perf_counter()
+        print(toc-tic)
+
+        for token, postings in query_valid_postings.items():
+            print(token, len(postings))
+
+        tic = perf_counter()
+        results = []
+
+        for i in range(len(query_valid_postings[first])):
+            q_denom = 0
+            norm_q = []
+            norm_d = []
+
+            for q in query_valid_postings.keys():
+                q_denom += (query_valid_postings[q][i]['tf_idf']/(1 + math.log(query_valid_postings[q][i]['tf_raw'])))**2
+            q_denom = math.sqrt(q_denom)
+
+            for q in query_valid_postings.keys():
+                x = query_valid_postings[q][i]['tf_idf']/(1 + math.log(query_valid_postings[q][i]['tf_raw']))/q_denom
+                norm_q.append(x)
+                y = (1 + math.log(query_valid_postings[q][i]['tf_raw']))/self.indexer.get_weight(query_valid_postings[q][i]['doc_id'])
+                norm_d.append(y)
+            results.append({'url' : query_valid_postings[first][i]['url'], 'cosine' : np.dot(norm_q, norm_d)})
+
+        results = sorted(results, key = lambda x: x['cosine'], reverse = True)
+        finalresults = []
+        for i in range(20):
+            finalresults.append(results[i]['url'])
+        print(finalresults)
+        return finalresults
+
+
+
+
+
+
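Reviewer note on merge(): this is a posting-list intersection with hand-rolled skip pointers. i4 and j4 trail three entries ahead of i and j, the >=/< comparisons decide when it is safe to jump, and the try/except absorbs the out-of-range probes near the list ends. The textbook formulation of the same idea is considerably smaller; a hedged reference sketch, assuming (as this file does) that postings are dicts sorted by ascending doc_id:

```python
def intersect_with_skips(list1, list2, skip=3):
    """Intersect two posting lists sorted by doc_id, skipping ahead when safe."""
    valid1, valid2 = [], []
    i = j = 0
    while i < len(list1) and j < len(list2):
        d1, d2 = list1[i]['doc_id'], list2[j]['doc_id']
        if d1 == d2:
            valid1.append(list1[i])
            valid2.append(list2[j])
            i += 1
            j += 1
        elif d1 < d2:
            # take the skip only if the landing entry cannot overshoot a match
            if i + skip < len(list1) and list1[i + skip]['doc_id'] <= d2:
                i += skip
            else:
                i += 1
        else:
            if j + skip < len(list2) and list2[j + skip]['doc_id'] <= d1:
                j += skip
            else:
                j += 1
    return valid1, valid2
```

Because a skip is taken only while the skipped-to doc_id is still at most the other list's current doc_id, no match can be jumped over, which removes the need for exception handling at the boundaries.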
diff --git a/templates/index.html b/templates/index.html
index 43565db..c24d75e 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -19,8 +19,11 @@
+
+
 {{ips}}
+
 {% for result in results %}
-{{result[0]}} at {{result[1]}}
+
 {% endfor %}
 {% for error in errors %}
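Stepping back to the scoring loop at the end of search.py: for each document that survives the merges, the query-side weight of each term is taken as tf_idf divided by (1 + log tf) and the vector is length-normalized, while the document side uses (1 + log tf) divided by the precomputed length from docs.weight; the dot product of the two gives the cosine score. A condensed sketch of that per-document computation, with field names from search.py and the data shapes assumed:

```python
import math
import numpy as np

def cosine_score(postings_by_term, doc_length):
    """Score one document. postings_by_term maps each query term to that
    document's posting dict; doc_length is the value from docs.weight."""
    # Query side: recover the idf-like weight, then length-normalize.
    q = [p['tf_idf'] / (1 + math.log(p['tf_raw']))
         for p in postings_by_term.values()]
    q_len = math.sqrt(sum(w * w for w in q))
    norm_q = [w / q_len for w in q]
    # Document side: log-scaled tf divided by the precomputed vector length.
    norm_d = [(1 + math.log(p['tf_raw'])) / doc_length
              for p in postings_by_term.values()]
    return float(np.dot(norm_q, norm_d))
```

Ranking then sorts documents by this score and keeps the top 20, as the diff does. One caveat worth a follow-up: the final for i in range(20) loop will raise an IndexError when fewer than 20 documents match.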