Everything done and ready to test
parent 63c9bbee6f
commit 5fd5319ffb

README.md (43 lines changed)
@@ -1,39 +1,14 @@
 # Search_Engine
 
 Developing a mini search-engine in python using reverse-indexed stemming and other SEOs implementations
 
-## Part 1: The Reversed-Index
+Start the program by running python3 launcher.py
+
+A flask webpage will start.
+
+If you do not have any indexes files, the webpage will show you an error
+
+There is a button at the top of the page called Run Indexer
+
+THIS IS EXTREMELY TIME CONSUMING AND DANGEROUS. IT WILL DELETE THE INDEX IF YOU ALREADY HAVE ONE !
+
+So to safeguard this, you have to click the button five times in a row in five different refreshes of the page
 
-### Create an inverted index for the corpus with data structures designed by you.
+You can also create the index by running python3 indexer.py
 
-- Tokens: all alphanumeric sequences in the dataset.
+After the indices are created you can go ahead and search through them
 
-- Stop words: do not use stopping while indexing, i.e. use all words, even
-the frequently occurring ones.
+Notably
-- Stemming: use stemming for better textual matches. Suggestion: Porter
-stemming, but it is up to you to choose.
-- Important text: text in bold (b, strong), in headings (h1, h2, h3), and
-in titles should be treated as more important than the in other places.
-Verify which are the relevant HTML tags to select the important words.
-
-### Building the inverted index
-
-Now that you have been provided the HTML files to index, you may build your
-inverted index off of them. The inverted index is simply a map with the token
-as a key and a list of its corresponding postings. A posting is the representation
-of the token's occurrence in a document. The posting typically (not limited to)
-contains the following info (you are encouraged to think of other attributes that
-you could add to the index):
-- The document name/id the token was found in.
-- Its tf-idf score for that document (for MS1, add only the term frequency).
-
-### Some tips:
-- When designing your inverted index, you will think about the structure
-of your posting first.
-- You would normally begin by implementing the code to calculate/fetch
-the elements which will constitute your posting.
-- Modularize. Use scripts/classes that will perform a function or a set of
-closely related functions. This helps in keeping track of your progress,
-debugging, and also dividing work amongst teammates if you're in a group.
-- We recommend you use GitHub as a mechanism to work with your team
-members on this project, but you are not required to do so.
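The removed README text above describes the core data structure: an inverted index mapping each token to a list of postings, where a posting records at least the document id and a term-frequency / tf-idf score, with Porter stemming applied and stop words kept. A minimal sketch of that structure, for orientation only — the field names doc_id, tf_raw and tf_idf are chosen to match the postings used in search.py further down; everything else, including the add_document helper, is hypothetical:

```python
from collections import defaultdict
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize   # requires the NLTK 'punkt' tokenizer data

stemmer = PorterStemmer()
inverted_index = defaultdict(list)        # token -> list of postings

def add_document(doc_id, text):
    counts = defaultdict(int)
    for token in word_tokenize(text):
        if token.isalnum():               # tokens are alphanumeric sequences; stop words are kept
            counts[stemmer.stem(token.lower())] += 1
    for token, tf in counts.items():
        # tf-idf can only be filled in once collection-wide statistics are known
        inverted_index[token].append({'doc_id': doc_id, 'tf_raw': tf, 'tf_idf': None})

add_document(0, "A mini search engine built around an inverted index")
print(inverted_index[stemmer.stem("index")])
```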
indexer.py (30 lines changed)
@@ -19,7 +19,6 @@ from os.path import exists
 #Data process
 from nltk.tokenize import word_tokenize
 from nltk.stem import PorterStemmer
-from sklearn.feature_extraction.text import TfidfVectorizer
 import pandas as pd
 import numpy as np
 import re
@@ -60,7 +59,7 @@ class Indexer():
         self.list_partials_lock = Lock()
 
         #Loading index_index into memory
-        if exists("merged_index_index"):
+        if exists("merged_index.index"):
             merged_index_index = open("merged_index.index",'r')
             merged_index_index.seek(0,0)
             json_value = merged_index_index.readline()
@@ -79,6 +78,19 @@ class Indexer():
             json_value = merged_index_index.readline()
             data = json.loads(json_value)
             self.index_index = dict(data['index'])
+            return self.index_index
+        else:
+            print("Index files do not exists, please run the indexer first")
+            return None
+
+    def load_weight_index(self):
+        if exists("docs.weight"):
+            weight_file = open("docs.weight",'r')
+            weight_file.seek(0,0)
+            json_value = weight_file.readline()
+            data = json.loads(json_value)
+            self.weight = data
+            return self.weight
         else:
             print("Index files do not exists, please run the indexer first")
             return None
@@ -118,15 +130,7 @@ class Indexer():
         weight_file.close()
 
     def get_weight(self,doc_id):
-        if exists('docs.weight'):
-            weight = open('docs.weight','r')
-            weight.seek(0,0)
-            json_value = weight.readline()
-            data = json.loads(json_value)
-            return data[doc_id]
-        else:
-            print("Index files do not exists, please run the indexer first")
-            return None
+        return self.weight[doc_id]
     def get_data_path(self):
         for directory in os.listdir(self.path):
             for file in os.listdir(self.path + "/" + directory + "/"):
@@ -239,3 +243,7 @@ class Indexer():
         print("Finished merging into 1 big happy family")
         self.set_weight()
         print("I AM DONE INDEXING !")
+
+if __name__ == "__main__":
+    indexer = Indexer(list(),dict(),list())
+    indexer.create_index()
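The hunks above add load_weight_index() and collapse get_weight() into a dictionary lookup, so docs.weight (a single JSON object mapping doc id to document weight, per the diff) is parsed once instead of the file being re-opened on every call. A standalone sketch of that load-once / lookup pattern, with the WeightIndex class name and structure being purely illustrative:

```python
import json
from os.path import exists

class WeightIndex:
    """Illustrative stand-in for the weight-loading side of Indexer."""

    def __init__(self, path="docs.weight"):
        self.path = path
        self.weight = None

    def load_weight_index(self):
        if exists(self.path):
            with open(self.path, 'r') as weight_file:
                # the whole weight map is assumed to sit on the first line as JSON
                self.weight = json.loads(weight_file.readline())
            return self.weight
        print("Index files do not exist, please run the indexer first")
        return None

    def get_weight(self, doc_id):
        # one dict lookup per call instead of one file read per call
        return self.weight[doc_id]
```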
launcher.py (61 lines changed)
@@ -1,37 +1,70 @@
 from indexer import Indexer
+from search import Search
 import time
 from flask import Flask
 from flask import render_template
 from flask import request
 
 app = Flask(__name__)
-errors = list()
-indexer = Indexer(list(),dict(),list())
+errors = None
+indexer = None
 search = None
+safe_guard = 1
+
+def get_data():
+    global indexer
+    indexer = Indexer(list(),dict(),list())
+
+    global search
+    search = Search(indexer)
+
+    global safe_guard
+    safe_guard = 1
+
+    global errors
+    errors = list()
+    if not indexer.load_index_index():
+        errors.append("Index of index is missing, probably should run the indexer")
+    if not indexer.load_weight_index():
+        errors.append("Index of index is missing, probably should run the indexer")
+
+
 
 @app.route('/',methods=['POST','GET'])
 def index():
-    errors = list()
-    if not indexer:
-        errors.append("Error in creating indexer module")
-    elif not indexer.load_index_index():
-        errors.append("Indexer does not exists, please run it first")
-    if not search:
-        errors.append("Error in creating search module")
+    global errors
+    global search
+    global indexer
+    global safe_guard
+    local_errors = errors
 
     if request.method == 'POST':
        if request.form.get('start-index') == "start":
            print("Making the indexer")
+           if safe_guard == 5:
+               safe_guard = 1
               indexer.create_index()
+              indexer.load_index_index()
              return render_template('index.html',ips="Thanks for waiting you are ready to search.")
+           safe_guard = safe_guard + 1
+           return render_template('index.html',ips=str(safe_guard) + " DANGER ! PROCEED IF YOU ARE KNOWING WHAT YOU DOING, OTHERWISE STOP, INDEX MIGHT GET YEETED")
        if request.form.get('search_query') != "":
            search_query = request.form['search_query']
-           result = [['lorem','ipsi'],['lores','dolores']]
-           return render_template('index.html',results=result,errors=errors)
-       return render_template('index.html',errors=errors)
+           result = search.search(search_query)
+           safe_guard = 1
+           errors = list()
+           return render_template('index.html',results=result,errors=local_errors)
+       safe_guard = 1
+       errors = list()
+       return render_template('index.html',errors=local_errors)
    else:
-       return render_template('index.html',errors=errors)
+       safe_guard = 1
+       errors = list()
+       return render_template('index.html',errors=local_errors)
 
 if __name__ == "__main__":
-    app.run(debug=True)
+    get_data()
+
+    app.run(debug=False)
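launcher.py now gates re-indexing behind a counter: each POST of the start-index form increments safe_guard, only when it reaches 5 does create_index() actually run, and any other request resets the counter. A minimal, self-contained sketch of just that safeguard — the route, form field name and threshold follow the diff above, while the response strings and the commented-out indexer call are placeholders:

```python
from flask import Flask, request

app = Flask(__name__)
safe_guard = 1

@app.route('/', methods=['POST', 'GET'])
def index():
    global safe_guard
    if request.method == 'POST' and request.form.get('start-index') == "start":
        if safe_guard == 5:
            safe_guard = 1
            # indexer.create_index() would be triggered here
            return "Re-index finished, ready to search."
        safe_guard += 1
        return f"Confirmation {safe_guard}/5 - submit again to really rebuild the index."
    safe_guard = 1          # any other request resets the counter, as in the diff
    return "Search page."

if __name__ == "__main__":
    app.run(debug=False)
```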
search.py (new file, 279 lines)
@@ -0,0 +1,279 @@
#Data input
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle
import sys
import math
import numpy as np

sys.path.append('D:/Visual Studio Workspace')

#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

import re
from indexer import Indexer

#Logging postings
from posting import Posting
from worker import Worker
import indexer

class Search():
    # wrote the code for testing in the file searchtesting.py so many of the variables and function calls are wrong.
    def __init__(self, indexer):
        self.indexer = indexer
        self.indexer.load_index_index()
        self.indexer.load_weight_index()
        self.stemmer = PorterStemmer()

    # takes a list of posting lists returns a list of indexes of the querys with the two shortest postings list that corresponds to search temp list
    def two_shortest(self, l_posting):
        short = []
        location = []
        for postings in l_posting:
            short.append(len(postings))

        for i in range(2):
            x = short.index(min(short))
            location.append(x)
            short[x] = float('inf')

        return location

    # len(list1) <= len(list2) So the code in this function works with that in mind
    def merge(self, list1, list2):
        max = 0
        valid1 = []
        valid2 = []
        i = 0
        j = 0
        # TODO: optimize by having a pointer to the current index+4
        i4 = 3
        j4 = 3
        while i < len(list1) or j < len(list2):
            if j == len(list2):
                break
            if i == len(list1):
                break
            #if max == 40:
                #break
            try:
                if i == len(list1)-1:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 +=1
                        i += 1
                        i4 += 1
                        max += 1
                    elif list1[i]['doc_id'] >= list2[j4]['doc_id']:
                        j = j4
                        j4 = j + 3
                    elif list1[i4]['doc_id'] < list2[j]['doc_id'] and i4 < len(list1):
                        i = i4
                        i4 = i + 3
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1

                else:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 +=1
                        i += 1
                        i4 += 1
                        max += 1
                    elif list1[i]['doc_id'] >= list2[j4]['doc_id'] and j4 < len(list2):
                        j = j4
                        j4 = j + 3

                    elif list1[i4]['doc_id'] < list2[j]['doc_id'] and i4 < len(list1):
                        i = i4
                        i4 = i + 3
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 +=1
                        i += 1
                        i4 += 1
            except:
                if i == len(list1)-1:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 +=1
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                else:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 +=1
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 +=1
                        i += 1
                        i4 += 1
        # Since list1 is shorter it will hit its max index sooner,
        # so in the cases were it does we still need to go through list2 to see if the last element of list1 appears anywhere in the rest of list2

        return valid1, valid2

    # query is a list of stemmed tokens, returns a list of postings (which we'll directly ignore except for the doc id)
    def search(self, query):
        tokens = word_tokenize(query)
        stemmed_tokens = list()
        for token in tokens:
            token = self.stemmer.stem(token)
            stemmed_tokens.append(token)

        query_valid_postings = dict()
        temp = []
        for token in stemmed_tokens:
            temp.append(self.indexer.get_postings(token))
            query_valid_postings[token] = []

        tic = perf_counter()
        l = self.two_shortest(temp)
        m = self.merge(temp[l[0]], temp[l[1]])
        if len(m[0]) == 0:
            return -1
        # Keep track of the valid postings for each query as we do merge
        first = stemmed_tokens[l[0]]
        query_valid_postings[first] = m[0]
        query_valid_postings[stemmed_tokens[l[1]]] = m[1]
        toc = perf_counter()
        print("first merge", toc-tic)
        tic = perf_counter()
        while len(temp) > 1:
            # delete from temp the already merged lists
            temp.pop(l[0])
            # Try and except since temp length changes
            try:
                temp.pop(l[1])
            except:
                temp.pop(l[1]-1)

            temp.append(m[0])

            # Delete and append to query to make it consistent with temp
            stemmed_tokens.pop(l[0])
            try:
                stemmed_tokens.pop(l[1])
            except:
                stemmed_tokens.pop(l[1]-1)

            stemmed_tokens.append(None)

            l = self.two_shortest(temp)
            # Checks if contents in l are the same
            if len(set(l)) == 1:
                break
            else:
                m = self.merge(temp[l[0]], temp[l[1]])
                print(len(m[0]), len(m[1]))
                query_valid_postings[first] = m[0]
                query_valid_postings[stemmed_tokens[l[1]]] = m[1]
        toc = perf_counter()
        print("while loop", toc-tic)
        tic = perf_counter()
        # Create list of doc ids of correct merged postings for cross checking

        merge = []
        for posting in query_valid_postings[first]:
            merge.append(posting['doc_id'])

        # Cross checking each query's valid postings list with correct merged set which we donated as being first
        for token, postings in query_valid_postings.items():
            if token == first:
                continue
            else:
                print(token)
                for p in postings:
                    if p['doc_id'] not in merge:
                        postings.remove(p)

        toc = perf_counter()
        print(toc-tic)

        for token, postings in query_valid_postings.items():
            print(token, len(postings))

        tic = perf_counter()
        results = []

        for i in range(len(query_valid_postings[first])):
            q_denom = 0
            norm_q = []
            norm_d = []

            for q in query_valid_postings.keys():
                q_denom += (query_valid_postings[q][i]['tf_idf']/(1 + math.log(query_valid_postings[q][i]['tf_raw'])))**2
            q_denom = math.sqrt(q_denom)

            for q in query_valid_postings.keys():
                x = query_valid_postings[q][i]['tf_idf']/(1 + math.log(query_valid_postings[q][i]['tf_raw']))/q_denom
                norm_q.append(x)
                y = (1 + math.log(query_valid_postings[q][i]['tf_raw']))/self.indexer.get_weight(query_valid_postings[q][i]['doc_id'])
                norm_d.append(y)
            results.append({'url' :query_valid_postings[first][i]['url'], 'cosine' : np.dot(norm_q, norm_d)})

        results = sorted(results, key = lambda x: x['cosine'], reverse = True)
        finalresults = []
        for i in range(20):
            finalresults.append(results[i]['url'])
        print(finalresults)
        return finalresults
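Search.search() ends by ranking the documents that survived the posting-list merge: for each such document it builds a normalized query vector from tf_idf / (1 + log tf) components and a document vector from (1 + log tf) divided by a precomputed per-document weight, then sorts by their dot product. A condensed, standalone sketch of that scoring step — the posting fields tf_raw, tf_idf, url and the doc-weight map follow the new file above, while cosine_rank and the sample data are illustrative only:

```python
import math
import numpy as np

def cosine_rank(matched, doc_weight, top_k=20):
    """matched: {term: [posting, ...]}, postings assumed aligned by document index."""
    results = []
    n_docs = len(next(iter(matched.values())))
    for i in range(n_docs):
        # query side: idf-like weight per term, then length-normalized
        idf = [p[i]['tf_idf'] / (1 + math.log(p[i]['tf_raw'])) for p in matched.values()]
        q_norm = math.sqrt(sum(w * w for w in idf)) or 1.0   # guard against a zero vector
        norm_q = [w / q_norm for w in idf]
        # document side: log tf scaled by the precomputed document weight
        norm_d = [(1 + math.log(p[i]['tf_raw'])) / doc_weight[p[i]['doc_id']]
                  for p in matched.values()]
        url = next(iter(matched.values()))[i]['url']
        results.append({'url': url, 'cosine': float(np.dot(norm_q, norm_d))})
    results.sort(key=lambda r: r['cosine'], reverse=True)
    return [r['url'] for r in results[:top_k]]

# Tiny made-up example: one document matching both query terms.
matched = {
    'search': [{'doc_id': 0, 'tf_raw': 3, 'tf_idf': 1.2, 'url': 'a.html'}],
    'engin':  [{'doc_id': 0, 'tf_raw': 1, 'tf_idf': 2.0, 'url': 'a.html'}],
}
print(cosine_rank(matched, {0: 5.0}))
```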
@@ -19,8 +19,11 @@
     </form>
 </div>
 
+
+<p>{{ips}}</p>
+
 {% for result in results %}
-<p>{{result[0]}} at <a href="{{result[1]}}">{{result[1]}}</a></p>
+<p> <a href="{{result}}">{{result}}</a></p>
 {% endfor %}
 
 {% for error in errors %}