Everything done and ready to test
This commit is contained in:
parent 63c9bbee6f
commit 5fd5319ffb
README.md (43 lines changed)
@@ -1,39 +1,14 @@
# Search_Engine

Developing a mini search engine in Python using a reversed index, stemming, and other SEO implementations.

## Part 1: The Reversed-Index

Start the program by running `python3 launcher.py`.

A Flask web page will start.

If you do not have any index files, the web page will show you an error.

There is a button at the top of the page called Run Indexer.

THIS IS EXTREMELY TIME CONSUMING AND DANGEROUS. IT WILL DELETE THE INDEX IF YOU ALREADY HAVE ONE!

As a safeguard, you have to click the button five times in a row, across five different refreshes of the page.
### Create an inverted index for the corpus with data structures designed by you.

You can also create the index by running `python3 indexer.py`.

- Tokens: all alphanumeric sequences in the dataset.

After the indices are created, you can go ahead and search through them.
- Stop words: do not use stopping while indexing, i.e. use all words, even the frequently occurring ones.
- Stemming: use stemming for better textual matches. Suggestion: Porter stemming, but it is up to you to choose.
- Important text: text in bold (b, strong), in headings (h1, h2, h3), and in titles should be treated as more important than text in other places.

Verify which HTML tags are relevant for selecting the important words.
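A minimal sketch of the tokenize-then-stem treatment these bullets describe (alphanumeric tokens, no stop-word removal, Porter stemming; nltk assumed available):

```python
import re
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def tokenize_and_stem(text):
    # All alphanumeric runs count as tokens; stop words are kept on purpose.
    tokens = re.findall(r'[A-Za-z0-9]+', text.lower())
    return [stemmer.stem(t) for t in tokens]

# tokenize_and_stem("Running the indexers") -> ['run', 'the', 'index']
```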
### Building the inverted index

Now that you have been provided the HTML files to index, you may build your inverted index from them. The inverted index is simply a map with the token as a key and a list of its corresponding postings. A posting is the representation of the token's occurrence in a document. The posting typically (though not limited to) contains the following info (you are encouraged to think of other attributes that you could add to the index):

- The document name/id the token was found in.
- Its tf-idf score for that document (for MS1, add only the term frequency).
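For orientation, here is a minimal sketch of one way to lay out such an index in Python. The field names mirror the keys the search code in this commit reads (doc_id, url, tf_raw, tf_idf), but the exact attributes are up to you:

```python
from collections import defaultdict

def build_inverted_index(docs):
    """docs: iterable of (doc_id, url, tokens). Returns token -> list of postings."""
    index = defaultdict(list)
    for doc_id, url, tokens in docs:
        counts = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
        for token, tf in counts.items():
            # For MS1 only the raw term frequency is stored; tf_idf can be
            # filled in once document frequencies are known.
            index[token].append({'doc_id': doc_id, 'url': url,
                                 'tf_raw': tf, 'tf_idf': 0.0})
    return index

# Example:
# idx = build_inverted_index([(0, 'a.html', ['apple', 'pie', 'apple'])])
# idx['apple'] -> [{'doc_id': 0, 'url': 'a.html', 'tf_raw': 2, 'tf_idf': 0.0}]
```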
### Some tips:

- When designing your inverted index, you will think about the structure of your posting first.
- You would normally begin by implementing the code to calculate/fetch the elements which will constitute your posting.
- Modularize. Use scripts/classes that will perform a function or a set of closely related functions. This helps in keeping track of your progress, debugging, and also dividing work amongst teammates if you're in a group.
- We recommend you use GitHub as a mechanism to work with your team members on this project, but you are not required to do so.

Notably
indexer.py (30 lines changed)
@@ -19,7 +19,6 @@ from os.path import exists
# Data processing
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re
@@ -60,7 +59,7 @@ class Indexer():
        self.list_partials_lock = Lock()

        # Loading index_index into memory
-        if exists("merged_index_index"):
+        if exists("merged_index.index"):
            merged_index_index = open("merged_index.index", 'r')
            merged_index_index.seek(0, 0)
            json_value = merged_index_index.readline()
@@ -79,6 +78,19 @@ class Indexer():
                json_value = merged_index_index.readline()
            data = json.loads(json_value)
            self.index_index = dict(data['index'])
            return self.index_index
        else:
            print("Index files do not exist, please run the indexer first")
            return None

+    def load_weight_index(self):
+        if exists("docs.weight"):
+            weight_file = open("docs.weight", 'r')
+            weight_file.seek(0, 0)
+            json_value = weight_file.readline()
+            data = json.loads(json_value)
+            self.weight = data
+            return self.weight
+        else:
+            print("Index files do not exist, please run the indexer first")
+            return None
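load_weight_index and the get_weight change below replace the old read-the-file-per-lookup pattern: the weight table is parsed once into self.weight, and each lookup becomes a dict access. A minimal sketch of the same loading step with a context manager, assuming (as the diff does) that docs.weight holds a single JSON object on its first line mapping doc_id to document weight:

```python
import json
from os.path import exists

def load_weight_index(path="docs.weight"):
    if not exists(path):
        print("Index files do not exist, please run the indexer first")
        return None
    with open(path, 'r') as weight_file:  # context manager closes the handle
        return json.loads(weight_file.readline())
```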
@@ -118,15 +130,7 @@ class Indexer():
        weight_file.close()

    def get_weight(self, doc_id):
-        if exists('docs.weight'):
-            weight = open('docs.weight', 'r')
-            weight.seek(0, 0)
-            json_value = weight.readline()
-            data = json.loads(json_value)
-            return data[doc_id]
-        else:
-            print("Index files do not exist, please run the indexer first")
-            return None
+        return self.weight[doc_id]

    def get_data_path(self):
        for directory in os.listdir(self.path):
            for file in os.listdir(self.path + "/" + directory + "/"):
@@ -239,3 +243,7 @@ class Indexer():
        print("Finished merging into 1 big happy family")
        self.set_weight()
        print("I AM DONE INDEXING !")

+if __name__ == "__main__":
+    indexer = Indexer(list(), dict(), list())
+    indexer.create_index()
launcher.py (65 lines changed)
@@ -1,37 +1,70 @@
from indexer import Indexer
from search import Search
import time
from flask import Flask
from flask import render_template
from flask import request

app = Flask(__name__)
-errors = list()
-indexer = Indexer(list(), dict(), list())
+errors = None
+indexer = None
+search = None
+safe_guard = 1

+def get_data():
+    global indexer
+    indexer = Indexer(list(), dict(), list())
+
+    global search
+    search = Search(indexer)
+
+    global safe_guard
+    safe_guard = 1
+
+    global errors
+    errors = list()
+    if not indexer.load_index_index():
+        errors.append("Index of index is missing, probably should run the indexer")
+    if not indexer.load_weight_index():
+        errors.append("Weight index is missing, probably should run the indexer")
@app.route('/', methods=['POST', 'GET'])
def index():
-    errors = list()
-    if not indexer:
-        errors.append("Error in creating indexer module")
-    elif not indexer.load_index_index():
-        errors.append("Indexer does not exist, please run it first")
-    if not search:
-        errors.append("Error in creating search module")
+    global errors
+    global search
+    global indexer
+    global safe_guard
+    local_errors = errors

    if request.method == 'POST':
        if request.form.get('start-index') == "start":
-            print("Making the indexer")
-            indexer.create_index()
-            return render_template('index.html', ips="Thanks for waiting, you are ready to search.")
+            if safe_guard == 5:
+                safe_guard = 1
+                indexer.create_index()
+                indexer.load_index_index()
+                return render_template('index.html', ips="Thanks for waiting, you are ready to search.")
+            safe_guard = safe_guard + 1
+            return render_template('index.html', ips=str(safe_guard) + " DANGER! PROCEED ONLY IF YOU KNOW WHAT YOU ARE DOING; OTHERWISE STOP, THE INDEX MIGHT GET YEETED")
        if request.form.get('search_query') != "":
            search_query = request.form['search_query']
-            result = [['lorem', 'ipsi'], ['lores', 'dolores']]
-            return render_template('index.html', results=result, errors=errors)
-        return render_template('index.html', errors=errors)
+            result = search.search(search_query)
+            safe_guard = 1
+            errors = list()
+            return render_template('index.html', results=result, errors=local_errors)
+        safe_guard = 1
+        errors = list()
+        return render_template('index.html', errors=local_errors)
-    else:
-        return render_template('index.html', errors=errors)
+    safe_guard = 1
+    errors = list()
+    return render_template('index.html', errors=local_errors)
if __name__ == "__main__":
-    app.run(debug=True)
+    get_data()
+
+    app.run(debug=False)
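The Run Indexer button is guarded by the module-level safe_guard counter: the index is only rebuilt on the fifth consecutive confirmation, and any search resets the count. A standalone sketch of the same confirmation pattern (names here are illustrative, not from the repo):

```python
safe_guard = 1

def confirm_dangerous_action(run_action):
    """Run run_action only on the fifth consecutive confirmation."""
    global safe_guard
    if safe_guard == 5:
        safe_guard = 1  # reset: the next rebuild needs five clicks again
        run_action()
        return "Action completed."
    safe_guard += 1
    return str(safe_guard) + " of 5 - click again to proceed."
```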
search.py (new file, 279 lines)
@@ -0,0 +1,279 @@
# Data input
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle
import sys
import math
import numpy as np

sys.path.append('D:/Visual Studio Workspace')

# Data processing
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

import re
from indexer import Indexer

# Logging postings
from posting import Posting
from worker import Worker
import indexer
class Search():
    # Wrote the code for testing in the file searchtesting.py, so many of the variables and function calls are wrong.
    def __init__(self, indexer):
        self.indexer = indexer
        self.indexer.load_index_index()
        self.indexer.load_weight_index()
        self.stemmer = PorterStemmer()

    # Takes a list of posting lists and returns the indexes of the two queries
    # with the shortest posting lists, corresponding to the search temp list.
    def two_shortest(self, l_posting):
        short = []
        location = []
        for postings in l_posting:
            short.append(len(postings))

        for i in range(2):
            x = short.index(min(short))
            location.append(x)
            short[x] = float('inf')

        return location
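two_shortest exists because intersecting the two shortest posting lists first keeps every intermediate result small: an intersection can never be longer than its shorter input. For example:

```python
# Three posting lists of different lengths. two_shortest picks indexes 1 and 2
# (lengths 1 and 2), so the first merge touches at most 3 postings instead of
# starting from the 5-element list.
postings = [
    [{'doc_id': d} for d in (1, 2, 3, 5, 8)],  # len 5
    [{'doc_id': d} for d in (2,)],             # len 1
    [{'doc_id': d} for d in (2, 3)],           # len 2
]
# search.two_shortest(postings) -> [1, 2]
```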
    # len(list1) <= len(list2), so the code in this function works with that in mind.
    def merge(self, list1, list2):
        max = 0
        valid1 = []
        valid2 = []
        i = 0
        j = 0
        # TODO: optimize by having a pointer to the current index+4
        i4 = 3
        j4 = 3
        while i < len(list1) or j < len(list2):
            if j == len(list2):
                break
            if i == len(list1):
                break
            #if max == 40:
            #    break
            try:
                if i == len(list1)-1:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                        max += 1
                    elif list1[i]['doc_id'] >= list2[j4]['doc_id']:
                        j = j4
                        j4 = j + 3
                    elif list1[i4]['doc_id'] < list2[j]['doc_id'] and i4 < len(list1):
                        i = i4
                        i4 = i + 3
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                else:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                        max += 1
                    elif list1[i]['doc_id'] >= list2[j4]['doc_id'] and j4 < len(list2):
                        j = j4
                        j4 = j + 3
                    elif list1[i4]['doc_id'] < list2[j]['doc_id'] and i4 < len(list1):
                        i = i4
                        i4 = i + 3
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
            except:
                if i == len(list1)-1:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                else:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
        # Since list1 is shorter it will hit its max index sooner, so in the cases
        # where it does we still need to go through list2 to see if the last element
        # of list1 appears anywhere in the rest of list2.

        return valid1, valid2
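merge above is a skip-pointer intersection: i4 and j4 peek three positions ahead so runs of non-matching doc ids can be skipped in blocks rather than one at a time. A compact sketch of the same idea, written fresh here rather than taken from the commit (assumes each list is sorted by doc_id with at most one posting per doc):

```python
def intersect_with_skips(list1, list2, step=3):
    """list1 and list2 are postings sorted by 'doc_id'; returns matching pairs."""
    valid1, valid2 = [], []
    i = j = 0
    while i < len(list1) and j < len(list2):
        d1, d2 = list1[i]['doc_id'], list2[j]['doc_id']
        if d1 == d2:
            valid1.append(list1[i])
            valid2.append(list2[j])
            i += 1
            j += 1
        elif d1 < d2:
            # Jump only if even the posting `step` ahead is still <= d2,
            # so no potential match can be skipped over.
            if i + step < len(list1) and list1[i + step]['doc_id'] <= d2:
                i += step
            else:
                i += 1
        else:
            if j + step < len(list2) and list2[j + step]['doc_id'] <= d1:
                j += step
            else:
                j += 1
    return valid1, valid2
```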
    # query is a list of stemmed tokens; returns a list of postings
    # (which we'll directly ignore except for the doc id).
    def search(self, query):
        tokens = word_tokenize(query)
        stemmed_tokens = list()
        for token in tokens:
            token = self.stemmer.stem(token)
            stemmed_tokens.append(token)

        query_valid_postings = dict()
        temp = []
        for token in stemmed_tokens:
            temp.append(self.indexer.get_postings(token))
            query_valid_postings[token] = []

        tic = perf_counter()
        l = self.two_shortest(temp)
        m = self.merge(temp[l[0]], temp[l[1]])
        if len(m[0]) == 0:
            return -1
        # Keep track of the valid postings for each query as we do the merge.
        first = stemmed_tokens[l[0]]
        query_valid_postings[first] = m[0]
        query_valid_postings[stemmed_tokens[l[1]]] = m[1]
        toc = perf_counter()
        print("first merge", toc-tic)
        tic = perf_counter()
        while len(temp) > 1:
            # Delete from temp the already merged lists.
            temp.pop(l[0])
            # Try and except since the temp length changes.
            try:
                temp.pop(l[1])
            except:
                temp.pop(l[1]-1)

            temp.append(m[0])

            # Delete and append on stemmed_tokens as well to keep it consistent with temp.
            stemmed_tokens.pop(l[0])
            try:
                stemmed_tokens.pop(l[1])
            except:
                stemmed_tokens.pop(l[1]-1)

            stemmed_tokens.append(None)

            l = self.two_shortest(temp)
            # Check whether the two indexes in l are the same.
            if len(set(l)) == 1:
                break
            else:
                m = self.merge(temp[l[0]], temp[l[1]])
                print(len(m[0]), len(m[1]))
                query_valid_postings[first] = m[0]
                query_valid_postings[stemmed_tokens[l[1]]] = m[1]
        toc = perf_counter()
        print("while loop", toc-tic)
        tic = perf_counter()
        # Create a list of doc ids from the correctly merged postings for cross checking.
        merge = []
        for posting in query_valid_postings[first]:
            merge.append(posting['doc_id'])

        # Cross check each query's valid postings list against the correct merged set,
        # which we denoted as first.
        for token, postings in query_valid_postings.items():
            if token == first:
                continue
            else:
                print(token)
                for p in list(postings):  # iterate over a copy, since we remove from postings below
                    if p['doc_id'] not in merge:
                        postings.remove(p)

        toc = perf_counter()
        print(toc-tic)

        for token, postings in query_valid_postings.items():
            print(token, len(postings))
        tic = perf_counter()
        results = []

        for i in range(len(query_valid_postings[first])):
            q_denom = 0
            norm_q = []
            norm_d = []

            for q in query_valid_postings.keys():
                q_denom += (query_valid_postings[q][i]['tf_idf']/(1 + math.log(query_valid_postings[q][i]['tf_raw'])))**2
            q_denom = math.sqrt(q_denom)

            for q in query_valid_postings.keys():
                x = query_valid_postings[q][i]['tf_idf']/(1 + math.log(query_valid_postings[q][i]['tf_raw']))/q_denom
                norm_q.append(x)
                y = (1 + math.log(query_valid_postings[q][i]['tf_raw']))/self.indexer.get_weight(query_valid_postings[q][i]['doc_id'])
                norm_d.append(y)
            results.append({'url': query_valid_postings[first][i]['url'], 'cosine': np.dot(norm_q, norm_d)})

        results = sorted(results, key=lambda x: x['cosine'], reverse=True)
        finalresults = []
        for i in range(min(20, len(results))):  # guard against fewer than 20 hits
            finalresults.append(results[i]['url'])
        print(finalresults)
        return finalresults
templates/index.html
@@ -19,8 +19,11 @@
</form>
</div>

+<p>{{ips}}</p>

{% for result in results %}
-<p>{{result[0]}} at <a href="{{result[1]}}">{{result[1]}}</a></p>
+<p> <a href="{{result}}">{{result}}</a></p>
{% endfor %}

{% for error in errors %}