Everything done and ready to test

inocturnis 2022-05-27 23:00:45 -07:00
parent 63c9bbee6f
commit 5fd5319ffb
5 changed files with 361 additions and 63 deletions

README.md

@@ -1,39 +1,14 @@
 # Search_Engine
 Developing a mini search-engine in python using reverse-indexed stemming and other SEOs implementations
-## Part 1: The Reversed-Index
-### Create an inverted index for the corpus with data structures designed by you.
-- Tokens: all alphanumeric sequences in the dataset.
-- Stop words: do not use stopping while indexing, i.e. use all words, even
-the frequently occurring ones.
-- Stemming: use stemming for better textual matches. Suggestion: Porter
-stemming, but it is up to you to choose.
-- Important text: text in bold (b, strong), in headings (h1, h2, h3), and
-in titles should be treated as more important than the in other places.
-Verify which are the relevant HTML tags to select the important words.
-### Building the inverted index
-Now that you have been provided the HTML files to index, you may build your
-inverted index off of them. The inverted index is simply a map with the token
-as a key and a list of its corresponding postings. A posting is the representation
-of the tokens occurrence in a document. The posting typically (not limited to)
-contains the following info (you are encouraged to think of other attributes that
-you could add to the index):
-- The document name/id the token was found in.
-- Its tf-idf score for that document (for MS1, add only the term frequency).
-### Some tips:
-- When designing your inverted index, you will think about the structure
-of your posting first.
-- You would normally begin by implementing the code to calculate/fetch
-the elements which will constitute your posting.
-- Modularize. Use scripts/classes that will perform a function or a set of
-closely related functions. This helps in keeping track of your progress,
-debugging, and also dividing work amongst teammates if youre in a group.
-- We recommend you use GitHub as a mechanism to work with your team
-members on this project, but you are not required to do so.
+Start the program by running python3 launcher.py
+A flask webpage will start.
+If you do not have any indexes files, the webpage will show you an error
+There is a button at the top of the page called Run Indexer
+THIS IS EXTREMELY TIME CONSUMING AND DANGEROUS. IT WILL DELETE THE INDEX IF YOU ALREADY HAVE ONE !
+So to safeguard this, you have to click the button five times in a row in five different refreshes of the page
+You can also create the index by running python3 indexer.py
+After the indices are created you can go ahead and search through them
+Notably

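The removed README text above still documents the data model this commit builds on: an inverted index is a map from token to a list of postings, and a posting records at least the document id and a term-frequency (later tf-idf) score. A minimal sketch of that structure, using the posting keys that search.py reads (doc_id, tf_raw, tf_idf, url); the helper name and sample values are hypothetical:

```python
# Minimal sketch of the inverted-index/posting model described in the README.
# Keys mirror what search.py reads; the repo's real Posting class may differ.
from collections import defaultdict

inverted_index = defaultdict(list)   # token -> list of postings

def add_occurrence(token, doc_id, url, count):
    # One posting per (token, document) pair.
    inverted_index[token].append({
        'doc_id': doc_id,   # document the token was found in
        'url': url,         # where that document lives
        'tf_raw': count,    # raw term frequency (MS1: frequency only)
        'tf_idf': 0.0,      # filled in once collection statistics are known
    })

add_occurrence('engin', '0/42', 'https://example.edu/page.html', 3)  # stemmed token
print(inverted_index['engin'][0]['doc_id'])   # -> 0/42
```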
indexer.py

@@ -19,7 +19,6 @@ from os.path import exists
 #Data process
 from nltk.tokenize import word_tokenize
 from nltk.stem import PorterStemmer
-from sklearn.feature_extraction.text import TfidfVectorizer
 import pandas as pd
 import numpy as np
 import re
@@ -60,7 +59,7 @@ class Indexer():
         self.list_partials_lock = Lock()
         #Loading index_index into memory
-        if exists("merged_index_index"):
+        if exists("merged_index.index"):
             merged_index_index = open("merged_index.index",'r')
             merged_index_index.seek(0,0)
             json_value = merged_index_index.readline()
@@ -79,6 +78,19 @@ class Indexer():
             json_value = merged_index_index.readline()
             data = json.loads(json_value)
             self.index_index = dict(data['index'])
+            return self.index_index
+        else:
+            print("Index files do not exists, please run the indexer first")
+            return None
+    def load_weight_index(self):
+        if exists("docs.weight"):
+            weight_file = open("docs.weight",'r')
+            weight_file.seek(0,0)
+            json_value = weight_file.readline()
+            data = json.loads(json_value)
+            self.weight = data
+            return self.weight
         else:
             print("Index files do not exists, please run the indexer first")
             return None
@@ -118,15 +130,7 @@ class Indexer():
         weight_file.close()
     def get_weight(self,doc_id):
-        if exists('docs.weight'):
-            weight = open('docs.weight','r')
-            weight.seek(0,0)
-            json_value = weight.readline()
-            data = json.loads(json_value)
-            return data[doc_id]
-        else:
-            print("Index files do not exists, please run the indexer first")
-            return None
+        return self.weight[doc_id]
     def get_data_path(self):
         for directory in os.listdir(self.path):
             for file in os.listdir(self.path + "/" + directory + "/"):
@@ -239,3 +243,7 @@ class Indexer():
         print("Finished merging into 1 big happy family")
         self.set_weight()
         print("I AM DONE INDEXING !")
+if __name__ == "__main__":
+    indexer = Indexer(list(),dict(),list())
+    indexer.create_index()

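The get_weight rewrite above replaces a file open per lookup with a single dict lookup against the weights loaded once by the new load_weight_index. The load code implies docs.weight holds one JSON object on a single line, mapping document ids to precomputed document weights; a small round-trip sketch under that assumption (the file name is from the diff, the sample ids and values are hypothetical):

```python
# Round trip for docs.weight as implied by load_weight_index/get_weight:
# a single JSON line mapping doc_id -> precomputed document weight.
import json

def write_weights(weights, path="docs.weight"):
    with open(path, "w") as f:
        f.write(json.dumps(weights))      # one JSON object, one line

def load_weights(path="docs.weight"):
    with open(path, "r") as f:
        return json.loads(f.readline())   # what load_weight_index does

write_weights({"0/0": 12.7, "0/1": 9.3})  # hypothetical ids and weights
weights = load_weights()
print(weights["0/0"])                     # get_weight is now just this lookup
```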
launcher.py

@@ -1,37 +1,70 @@
 from indexer import Indexer
+from search import Search
 import time
 from flask import Flask
 from flask import render_template
 from flask import request
 app = Flask(__name__)
-errors = list()
-indexer = Indexer(list(),dict(),list())
+errors = None
+indexer = None
 search = None
+safe_guard = 1
+def get_data():
+    global indexer
+    indexer = Indexer(list(),dict(),list())
+    global search
+    search = Search(indexer)
+    global safe_guard
+    safe_guard = 1
+    global errors
+    errors = list()
+    if not indexer.load_index_index():
+        errors.append("Index of index is missing, probably should run the indexer")
+    if not indexer.load_weight_index():
+        errors.append("Index of index is missing, probably should run the indexer")
 @app.route('/',methods=['POST','GET'])
 def index():
-    errors = list()
-    if not indexer:
-        errors.append("Error in creating indexer module")
-    elif not indexer.load_index_index():
-        errors.append("Indexer does not exists, please run it first")
-    if not search:
-        errors.append("Error in creating search module")
+    global errors
+    global search
+    global indexer
+    global safe_guard
+    local_errors = errors
     if request.method == 'POST':
         if request.form.get('start-index') == "start":
             print("Making the indexer")
+            if safe_guard == 5:
+                safe_guard = 1
                 indexer.create_index()
+                indexer.load_index_index()
                 return render_template('index.html',ips="Thanks for waiting you are ready to search.")
+            safe_guard = safe_guard + 1
+            return render_template('index.html',ips=str(safe_guard) + " DANGER ! PROCEED IF YOU ARE KNOWING WHAT YOU DOING, OTHERWISE STOP, INDEX MIGHT GET YEETED")
         if request.form.get('search_query') != "":
             search_query = request.form['search_query']
-            result = [['lorem','ipsi'],['lores','dolores']]
-            return render_template('index.html',results=result,errors=errors)
-        return render_template('index.html',errors=errors)
+            result = search.search(search_query)
+            safe_guard = 1
+            errors = list()
+            return render_template('index.html',results=result,errors=local_errors)
+        safe_guard = 1
+        errors = list()
+        return render_template('index.html',errors=local_errors)
     else:
-        return render_template('index.html',errors=errors)
+        safe_guard = 1
+        errors = list()
+        return render_template('index.html',errors=local_errors)
 if __name__ == "__main__":
-    app.run(debug=True)
+    get_data()
+    app.run(debug=False)

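Since get_data() now wires up the indexer, search, and safe_guard globals before app.run, the route can be smoke-tested without a browser using Flask's test client. A sketch, assuming the index files already exist on disk (form field names are taken from the diff above):

```python
# Smoke-test sketch for launcher.py using Flask's built-in test client.
# Assumes merged_index.index and docs.weight already exist.
from launcher import app, get_data

get_data()                       # populate indexer/search/safe_guard globals
client = app.test_client()

# A search query goes through Search.search and renders results.
resp = client.post('/', data={'search_query': 'computer science'})
print(resp.status_code)          # expect 200

# One click on Run Indexer only bumps safe_guard and warns;
# the index is rebuilt (and overwritten!) only on the fifth press.
resp = client.post('/', data={'start-index': 'start'})
print(b'DANGER' in resp.data)    # expect True on presses 1 through 4
```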
search.py (new file)

@@ -0,0 +1,279 @@
#Data input
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle
import sys
import math
import numpy as np
sys.path.append('D:/Visual Studio Workspace')
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re
from indexer import Indexer
#Logging postings
from posting import Posting
from worker import Worker
import indexer
class Search():
    # This code was first written for testing in searchtesting.py, so some variable names and function calls may be off.
    def __init__(self, indexer):
        self.indexer = indexer
        self.indexer.load_index_index()
        self.indexer.load_weight_index()
        self.stemmer = PorterStemmer()
    # Takes a list of postings lists; returns the indexes (into that list) of the two shortest postings lists.
    def two_shortest(self, l_posting):
        short = []
        location = []
        for postings in l_posting:
            short.append(len(postings))
        for i in range(2):
            x = short.index(min(short))
            location.append(x)
            short[x] = float('inf')
        return location
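    # Illustrative example: if the postings lists for a three-term query have
    # lengths [1200, 8, 45], two_shortest returns [1, 2], so merging starts
    # with the 8- and 45-entry lists. Intersecting the rarest terms first keeps
    # every subsequent merge (and the final result) as small as possible.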
    # len(list1) <= len(list2), so the code in this function works with that in mind.
    def merge(self, list1, list2):
        max = 0
        valid1 = []
        valid2 = []
        i = 0
        j = 0
        # TODO: optimize by having a pointer to the current index+4
        i4 = 3
        j4 = 3
        while i < len(list1) or j < len(list2):
            if j == len(list2):
                break
            if i == len(list1):
                break
            #if max == 40:
            #    break
            try:
                if i == len(list1)-1:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                        max += 1
                    elif list1[i]['doc_id'] >= list2[j4]['doc_id']:
                        j = j4
                        j4 = j + 3
                    elif list1[i4]['doc_id'] < list2[j]['doc_id'] and i4 < len(list1):
                        i = i4
                        i4 = i + 3
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                else:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                        max += 1
                    elif list1[i]['doc_id'] >= list2[j4]['doc_id'] and j4 < len(list2):
                        j = j4
                        j4 = j + 3
                    elif list1[i4]['doc_id'] < list2[j]['doc_id'] and i4 < len(list1):
                        i = i4
                        i4 = i + 3
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
            except:
                # A skip pointer (i4/j4) ran past the end of a list; fall back to plain stepping.
                if i == len(list1)-1:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                else:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
        # Since list1 is shorter it will hit its max index sooner,
        # so in the cases where it does we still need to go through list2 to see
        # if the last element of list1 appears anywhere in the rest of list2.
        return valid1, valid2
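    # For reference, merge() above is a skip-pointer variant of the classic
    # two-pointer postings intersection, which in its plain form would be:
    #
    #   def intersect(list1, list2):
    #       valid1, valid2, i, j = [], [], 0, 0
    #       while i < len(list1) and j < len(list2):
    #           if list1[i]['doc_id'] == list2[j]['doc_id']:
    #               valid1.append(list1[i]); valid2.append(list2[j])
    #               i += 1; j += 1
    #           elif list1[i]['doc_id'] < list2[j]['doc_id']:
    #               i += 1
    #           else:
    #               j += 1
    #       return valid1, valid2
    #
    # The i4/j4 pointers let whichever side is far behind jump three entries at
    # a time, at the price of the bounds bookkeeping handled by the try/except.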
    # Takes the raw query string; returns the top-ranked result URLs (or -1 if the terms never co-occur).
    def search(self, query):
        tokens = word_tokenize(query)
        stemmed_tokens = list()
        for token in tokens:
            token = self.stemmer.stem(token)
            stemmed_tokens.append(token)
        query_valid_postings = dict()
        temp = []
        for token in stemmed_tokens:
            temp.append(self.indexer.get_postings(token))
            query_valid_postings[token] = []
        tic = perf_counter()
        l = self.two_shortest(temp)
        m = self.merge(temp[l[0]], temp[l[1]])
        if len(m[0]) == 0:
            return -1
        # Keep track of the valid postings for each query term as we merge
        first = stemmed_tokens[l[0]]
        query_valid_postings[first] = m[0]
        query_valid_postings[stemmed_tokens[l[1]]] = m[1]
        toc = perf_counter()
        print("first merge", toc-tic)
        tic = perf_counter()
        while len(temp) > 1:
            # delete from temp the already merged lists
            temp.pop(l[0])
            # Try and except since temp length changes
            try:
                temp.pop(l[1])
            except:
                temp.pop(l[1]-1)
            temp.append(m[0])
            # Delete and append to query to make it consistent with temp
            stemmed_tokens.pop(l[0])
            try:
                stemmed_tokens.pop(l[1])
            except:
                stemmed_tokens.pop(l[1]-1)
            stemmed_tokens.append(None)
            l = self.two_shortest(temp)
            # Checks if contents in l are the same
            if len(set(l)) == 1:
                break
            else:
                m = self.merge(temp[l[0]], temp[l[1]])
                print(len(m[0]), len(m[1]))
                query_valid_postings[first] = m[0]
                query_valid_postings[stemmed_tokens[l[1]]] = m[1]
        toc = perf_counter()
        print("while loop", toc-tic)
        tic = perf_counter()
        # Create list of doc ids of correct merged postings for cross checking
        merge = []
        for posting in query_valid_postings[first]:
            merge.append(posting['doc_id'])
        # Cross check each query term's valid postings list against the correct merged set, which we denoted as first
        for token, postings in query_valid_postings.items():
            if token == first:
                continue
            else:
                print(token)
                for p in list(postings):  # iterate over a copy so remove() is safe
                    if p['doc_id'] not in merge:
                        postings.remove(p)
        toc = perf_counter()
        print(toc-tic)
        for token, postings in query_valid_postings.items():
            print(token, len(postings))
        tic = perf_counter()
        results = []
        for i in range(len(query_valid_postings[first])):
            q_denom = 0
            norm_q = []
            norm_d = []
            for q in query_valid_postings.keys():
                q_denom += (query_valid_postings[q][i]['tf_idf']/(1 + math.log(query_valid_postings[q][i]['tf_raw'])))**2
            q_denom = math.sqrt(q_denom)
            for q in query_valid_postings.keys():
                x = query_valid_postings[q][i]['tf_idf']/(1 + math.log(query_valid_postings[q][i]['tf_raw']))/q_denom
                norm_q.append(x)
                y = (1 + math.log(query_valid_postings[q][i]['tf_raw']))/self.indexer.get_weight(query_valid_postings[q][i]['doc_id'])
                norm_d.append(y)
            results.append({'url': query_valid_postings[first][i]['url'], 'cosine': np.dot(norm_q, norm_d)})
        results = sorted(results, key=lambda x: x['cosine'], reverse=True)
        finalresults = []
        for i in range(min(20, len(results))):  # guard against fewer than 20 hits
            finalresults.append(results[i]['url'])
        print(finalresults)
        return finalresults

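The scoring loop at the end of search() is a cosine ranking in the lnc/ltc spirit: each query-side weight is the posting's tf-idf deflated by (1 + log tf) and length-normalized by q_denom; each document-side weight is (1 + log tf) divided by the document's precomputed length from docs.weight; the score is their dot product. A condensed restatement of that arithmetic, with hypothetical numbers for one candidate document and two query terms:

```python
# Condensed restatement of the cosine scoring in Search.search,
# with hypothetical tf/tf-idf values for a single candidate document.
import math

postings = [                        # one posting per query term, same doc
    {'tf_idf': 3.2, 'tf_raw': 4},
    {'tf_idf': 1.7, 'tf_raw': 2},
]
doc_weight = 12.5                   # precomputed length from docs.weight

# Query side: tf-idf deflated by (1 + log tf), then length-normalized.
raw_q = [p['tf_idf'] / (1 + math.log(p['tf_raw'])) for p in postings]
q_denom = math.sqrt(sum(w ** 2 for w in raw_q))
norm_q = [w / q_denom for w in raw_q]

# Document side: log-scaled tf divided by the stored document weight.
norm_d = [(1 + math.log(p['tf_raw'])) / doc_weight for p in postings]

cosine = sum(q * d for q, d in zip(norm_q, norm_d))   # np.dot equivalent
print(round(cosine, 4))   # higher cosine -> better-ranked URL
```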
templates/index.html

@@ -19,8 +19,11 @@
     </form>
     </div>
+    <p>{{ips}}</p>
     {% for result in results %}
-    <p>{{result[0]}} at <a href="{{result[1]}}">{{result[1]}}</a></p>
+    <p> <a href="{{result}}">{{result}}</a></p>
     {% endfor %}
     {% for error in errors %}