Everything done and ready to test

inocturnis 2022-05-27 23:00:45 -07:00
parent 63c9bbee6f
commit 5fd5319ffb
5 changed files with 361 additions and 63 deletions

README.md

@@ -1,39 +1,14 @@
 # Search_Engine
 Developing a mini search-engine in python using reverse-indexed stemming and other SEOs implementations
-## Part 1: The Reversed-Index
-### Create an inverted index for the corpus with data structures designed by you.
-- Tokens: all alphanumeric sequences in the dataset.
-- Stop words: do not use stopping while indexing, i.e. use all words, even
-the frequently occurring ones.
-- Stemming: use stemming for better textual matches. Suggestion: Porter
-stemming, but it is up to you to choose.
-- Important text: text in bold (b, strong), in headings (h1, h2, h3), and
-in titles should be treated as more important than the in other places.
-Verify which are the relevant HTML tags to select the important words.
-### Building the inverted index
-Now that you have been provided the HTML files to index, you may build your
-inverted index off of them. The inverted index is simply a map with the token
-as a key and a list of its corresponding postings. A posting is the representation
-of the tokens occurrence in a document. The posting typically (not limited to)
-contains the following info (you are encouraged to think of other attributes that
-you could add to the index):
-- The document name/id the token was found in.
-- Its tf-idf score for that document (for MS1, add only the term frequency).
-### Some tips:
-- When designing your inverted index, you will think about the structure
-of your posting first.
-- You would normally begin by implementing the code to calculate/fetch
-the elements which will constitute your posting.
-- Modularize. Use scripts/classes that will perform a function or a set of
-closely related functions. This helps in keeping track of your progress,
-debugging, and also dividing work amongst teammates if youre in a group.
-- We recommend you use GitHub as a mechanism to work with your team
-members on this project, but you are not required to do so.
+Start the program by running python3 launcher.py
+A flask webpage will start.
+If you do not have any indexes files, the webpage will show you an error
+There is a button at the top of the page called Run Indexer
+THIS IS EXTREMELY TIME CONSUMING AND DANGEROUS. IT WILL DELETE THE INDEX IF YOU ALREADY HAVE ONE !
+So to safeguard this, you have to click the button five times in a row in five different refreshes of the page
+You can also create the index by running python3 indexer.py
+After the indices are created you can go ahead and search through them
+Notably

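The removed README text above still documents the data model this commit builds on: an inverted index is a map from token to a list of postings, and a posting records at least the document id and a term-frequency (later tf-idf) score. A minimal sketch of that structure, using the posting keys that search.py reads (doc_id, tf_raw, tf_idf, url); the helper name and sample values are hypothetical:

```python
# Minimal sketch of the inverted-index/posting model described in the README.
# Keys mirror what search.py reads; the repo's real Posting class may differ.
from collections import defaultdict

inverted_index = defaultdict(list)   # token -> list of postings

def add_occurrence(token, doc_id, url, count):
    # One posting per (token, document) pair.
    inverted_index[token].append({
        'doc_id': doc_id,   # document the token was found in
        'url': url,         # where that document lives
        'tf_raw': count,    # raw term frequency (MS1: frequency only)
        'tf_idf': 0.0,      # filled in once collection statistics are known
    })

add_occurrence('engin', '0/42', 'https://example.edu/page.html', 3)  # stemmed token
print(inverted_index['engin'][0]['doc_id'])   # -> 0/42
```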
indexer.py

@@ -19,7 +19,6 @@ from os.path import exists
 #Data process
 from nltk.tokenize import word_tokenize
 from nltk.stem import PorterStemmer
-from sklearn.feature_extraction.text import TfidfVectorizer
 import pandas as pd
 import numpy as np
 import re
@@ -60,7 +59,7 @@ class Indexer():
         self.list_partials_lock = Lock()
         #Loading index_index into memory
-        if exists("merged_index_index"):
+        if exists("merged_index.index"):
             merged_index_index = open("merged_index.index",'r')
             merged_index_index.seek(0,0)
             json_value = merged_index_index.readline()
@@ -79,6 +78,19 @@ class Indexer():
             json_value = merged_index_index.readline()
             data = json.loads(json_value)
             self.index_index = dict(data['index'])
+            return self.index_index
+        else:
+            print("Index files do not exists, please run the indexer first")
+            return None
+    def load_weight_index(self):
+        if exists("docs.weight"):
+            weight_file = open("docs.weight",'r')
+            weight_file.seek(0,0)
+            json_value = weight_file.readline()
+            data = json.loads(json_value)
+            self.weight = data
+            return self.weight
         else:
             print("Index files do not exists, please run the indexer first")
             return None
@@ -118,15 +130,7 @@ class Indexer():
         weight_file.close()
     def get_weight(self,doc_id):
-        if exists('docs.weight'):
-            weight = open('docs.weight','r')
-            weight.seek(0,0)
-            json_value = weight.readline()
-            data = json.loads(json_value)
-            return data[doc_id]
-        else:
-            print("Index files do not exists, please run the indexer first")
-            return None
+        return self.weight[doc_id]
     def get_data_path(self):
         for directory in os.listdir(self.path):
             for file in os.listdir(self.path + "/" + directory + "/"):
@@ -239,3 +243,7 @@ class Indexer():
         print("Finished merging into 1 big happy family")
         self.set_weight()
         print("I AM DONE INDEXING !")
+if __name__ == "__main__":
+    indexer = Indexer(list(),dict(),list())
+    indexer.create_index()

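The get_weight rewrite above replaces a file open per lookup with a single dict lookup against the weights loaded once by the new load_weight_index. The load code implies docs.weight holds one JSON object on a single line, mapping document ids to precomputed document weights; a small round-trip sketch under that assumption (the file name is from the diff, the sample ids and values are hypothetical):

```python
# Round trip for docs.weight as implied by load_weight_index/get_weight:
# a single JSON line mapping doc_id -> precomputed document weight.
import json

def write_weights(weights, path="docs.weight"):
    with open(path, "w") as f:
        f.write(json.dumps(weights))      # one JSON object, one line

def load_weights(path="docs.weight"):
    with open(path, "r") as f:
        return json.loads(f.readline())   # what load_weight_index does

write_weights({"0/0": 12.7, "0/1": 9.3})  # hypothetical ids and weights
weights = load_weights()
print(weights["0/0"])                     # get_weight is now just this lookup
```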
launcher.py

@@ -1,37 +1,70 @@
 from indexer import Indexer
+from search import Search
 import time
 from flask import Flask
 from flask import render_template
 from flask import request
 app = Flask(__name__)
-errors = list()
-indexer = Indexer(list(),dict(),list())
+errors = None
+indexer = None
 search = None
+safe_guard = 1
+def get_data():
+    global indexer
+    indexer = Indexer(list(),dict(),list())
+    global search
+    search = Search(indexer)
+    global safe_guard
+    safe_guard = 1
+    global errors
+    errors = list()
+    if not indexer.load_index_index():
+        errors.append("Index of index is missing, probably should run the indexer")
+    if not indexer.load_weight_index():
+        errors.append("Index of index is missing, probably should run the indexer")
 @app.route('/',methods=['POST','GET'])
 def index():
-    errors = list()
-    if not indexer:
-        errors.append("Error in creating indexer module")
-    elif not indexer.load_index_index():
-        errors.append("Indexer does not exists, please run it first")
-    if not search:
-        errors.append("Error in creating search module")
+    global errors
+    global search
+    global indexer
+    global safe_guard
+    local_errors = errors
     if request.method == 'POST':
         if request.form.get('start-index') == "start":
             print("Making the indexer")
+            if safe_guard == 5:
+                safe_guard = 1
                 indexer.create_index()
+                indexer.load_index_index()
                 return render_template('index.html',ips="Thanks for waiting you are ready to search.")
+            safe_guard = safe_guard + 1
+            return render_template('index.html',ips=str(safe_guard) + " DANGER ! PROCEED IF YOU ARE KNOWING WHAT YOU DOING, OTHERWISE STOP, INDEX MIGHT GET YEETED")
         if request.form.get('search_query') != "":
             search_query = request.form['search_query']
-            result = [['lorem','ipsi'],['lores','dolores']]
-            return render_template('index.html',results=result,errors=errors)
-        return render_template('index.html',errors=errors)
+            result = search.search(search_query)
+            safe_guard = 1
+            errors = list()
+            return render_template('index.html',results=result,errors=local_errors)
+        safe_guard = 1
+        errors = list()
+        return render_template('index.html',errors=local_errors)
     else:
-        return render_template('index.html',errors=errors)
+        safe_guard = 1
+        errors = list()
+        return render_template('index.html',errors=local_errors)
 if __name__ == "__main__":
-    app.run(debug=True)
+    get_data()
+    app.run(debug=False)

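Since get_data() now wires up the indexer, search, and safe_guard globals before app.run, the route can be smoke-tested without a browser using Flask's test client. A sketch, assuming the index files already exist on disk (form field names are taken from the diff above):

```python
# Smoke-test sketch for launcher.py using Flask's built-in test client.
# Assumes merged_index.index and docs.weight already exist.
from launcher import app, get_data

get_data()                       # populate indexer/search/safe_guard globals
client = app.test_client()

# A search query goes through Search.search and renders results.
resp = client.post('/', data={'search_query': 'computer science'})
print(resp.status_code)          # expect 200

# One click on Run Indexer only bumps safe_guard and warns;
# the index is rebuilt (and overwritten!) only on the fifth press.
resp = client.post('/', data={'start-index': 'start'})
print(b'DANGER' in resp.data)    # expect True on presses 1 through 4
```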
search.py (new file)

@@ -0,0 +1,279 @@
#Data input
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle
import sys
import math
import numpy as np
sys.path.append('D:/Visual Studio Workspace')
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re
from indexer import Indexer
#Logging postings
from posting import Posting
from worker import Worker
import indexer
class Search():
    # This code was first written for testing in searchtesting.py, so some variable names and function calls may be off.
    def __init__(self, indexer):
        self.indexer = indexer
        self.indexer.load_index_index()
        self.indexer.load_weight_index()
        self.stemmer = PorterStemmer()
    # Takes a list of postings lists; returns the indexes (into that list) of the two shortest postings lists.
    def two_shortest(self, l_posting):
        short = []
        location = []
        for postings in l_posting:
            short.append(len(postings))
        for i in range(2):
            x = short.index(min(short))
            location.append(x)
            short[x] = float('inf')
        return location
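    # Illustrative example: if the postings lists for a three-term query have
    # lengths [1200, 8, 45], two_shortest returns [1, 2], so merging starts
    # with the 8- and 45-entry lists. Intersecting the rarest terms first keeps
    # every subsequent merge (and the final result) as small as possible.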
    # len(list1) <= len(list2), so the code in this function works with that in mind.
    def merge(self, list1, list2):
        max = 0
        valid1 = []
        valid2 = []
        i = 0
        j = 0
        # TODO: optimize by having a pointer to the current index+4
        i4 = 3
        j4 = 3
        while i < len(list1) or j < len(list2):
            if j == len(list2):
                break
            if i == len(list1):
                break
            #if max == 40:
            #    break
            try:
                if i == len(list1)-1:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                        max += 1
                    elif list1[i]['doc_id'] >= list2[j4]['doc_id']:
                        j = j4
                        j4 = j + 3
                    elif list1[i4]['doc_id'] < list2[j]['doc_id'] and i4 < len(list1):
                        i = i4
                        i4 = i + 3
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                else:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                        max += 1
                    elif list1[i]['doc_id'] >= list2[j4]['doc_id'] and j4 < len(list2):
                        j = j4
                        j4 = j + 3
                    elif list1[i4]['doc_id'] < list2[j]['doc_id'] and i4 < len(list1):
                        i = i4
                        i4 = i + 3
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
            except:
                # A skip pointer (i4/j4) ran past the end of a list; fall back to plain stepping.
                if i == len(list1)-1:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                else:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
        # Since list1 is shorter it will hit its max index sooner,
        # so in the cases where it does we still need to go through list2 to see
        # if the last element of list1 appears anywhere in the rest of list2.
        return valid1, valid2
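    # For reference, merge() above is a skip-pointer variant of the classic
    # two-pointer postings intersection, which in its plain form would be:
    #
    #   def intersect(list1, list2):
    #       valid1, valid2, i, j = [], [], 0, 0
    #       while i < len(list1) and j < len(list2):
    #           if list1[i]['doc_id'] == list2[j]['doc_id']:
    #               valid1.append(list1[i]); valid2.append(list2[j])
    #               i += 1; j += 1
    #           elif list1[i]['doc_id'] < list2[j]['doc_id']:
    #               i += 1
    #           else:
    #               j += 1
    #       return valid1, valid2
    #
    # The i4/j4 pointers let whichever side is far behind jump three entries at
    # a time, at the price of the bounds bookkeeping handled by the try/except.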
    # Takes the raw query string; returns the top-ranked result URLs (or -1 if the terms never co-occur).
    def search(self, query):
        tokens = word_tokenize(query)
        stemmed_tokens = list()
        for token in tokens:
            token = self.stemmer.stem(token)
            stemmed_tokens.append(token)
        query_valid_postings = dict()
        temp = []
        for token in stemmed_tokens:
            temp.append(self.indexer.get_postings(token))
            query_valid_postings[token] = []
        tic = perf_counter()
        l = self.two_shortest(temp)
        m = self.merge(temp[l[0]], temp[l[1]])
        if len(m[0]) == 0:
            return -1
        # Keep track of the valid postings for each query term as we merge
        first = stemmed_tokens[l[0]]
        query_valid_postings[first] = m[0]
        query_valid_postings[stemmed_tokens[l[1]]] = m[1]
        toc = perf_counter()
        print("first merge", toc-tic)
        tic = perf_counter()
        while len(temp) > 1:
            # delete from temp the already merged lists
            temp.pop(l[0])
            # Try and except since temp length changes
            try:
                temp.pop(l[1])
            except:
                temp.pop(l[1]-1)
            temp.append(m[0])
            # Delete and append to query to make it consistent with temp
            stemmed_tokens.pop(l[0])
            try:
                stemmed_tokens.pop(l[1])
            except:
                stemmed_tokens.pop(l[1]-1)
            stemmed_tokens.append(None)
            l = self.two_shortest(temp)
            # Checks if contents in l are the same
            if len(set(l)) == 1:
                break
            else:
                m = self.merge(temp[l[0]], temp[l[1]])
                print(len(m[0]), len(m[1]))
                query_valid_postings[first] = m[0]
                query_valid_postings[stemmed_tokens[l[1]]] = m[1]
        toc = perf_counter()
        print("while loop", toc-tic)
        tic = perf_counter()
        # Create list of doc ids of correct merged postings for cross checking
        merge = []
        for posting in query_valid_postings[first]:
            merge.append(posting['doc_id'])
        # Cross check each query term's valid postings list against the correct merged set, which we denoted as first
        for token, postings in query_valid_postings.items():
            if token == first:
                continue
            else:
                print(token)
                for p in list(postings):  # iterate over a copy so remove() is safe
                    if p['doc_id'] not in merge:
                        postings.remove(p)
        toc = perf_counter()
        print(toc-tic)
        for token, postings in query_valid_postings.items():
            print(token, len(postings))
        tic = perf_counter()
        results = []
        for i in range(len(query_valid_postings[first])):
            q_denom = 0
            norm_q = []
            norm_d = []
            for q in query_valid_postings.keys():
                q_denom += (query_valid_postings[q][i]['tf_idf']/(1 + math.log(query_valid_postings[q][i]['tf_raw'])))**2
            q_denom = math.sqrt(q_denom)
            for q in query_valid_postings.keys():
                x = query_valid_postings[q][i]['tf_idf']/(1 + math.log(query_valid_postings[q][i]['tf_raw']))/q_denom
                norm_q.append(x)
                y = (1 + math.log(query_valid_postings[q][i]['tf_raw']))/self.indexer.get_weight(query_valid_postings[q][i]['doc_id'])
                norm_d.append(y)
            results.append({'url': query_valid_postings[first][i]['url'], 'cosine': np.dot(norm_q, norm_d)})
        results = sorted(results, key=lambda x: x['cosine'], reverse=True)
        finalresults = []
        for i in range(min(20, len(results))):  # guard against fewer than 20 hits
            finalresults.append(results[i]['url'])
        print(finalresults)
        return finalresults

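The scoring loop at the end of search() is a cosine ranking in the lnc/ltc spirit: each query-side weight is the posting's tf-idf deflated by (1 + log tf) and length-normalized by q_denom; each document-side weight is (1 + log tf) divided by the document's precomputed length from docs.weight; the score is their dot product. A condensed restatement of that arithmetic, with hypothetical numbers for one candidate document and two query terms:

```python
# Condensed restatement of the cosine scoring in Search.search,
# with hypothetical tf/tf-idf values for a single candidate document.
import math

postings = [                        # one posting per query term, same doc
    {'tf_idf': 3.2, 'tf_raw': 4},
    {'tf_idf': 1.7, 'tf_raw': 2},
]
doc_weight = 12.5                   # precomputed length from docs.weight

# Query side: tf-idf deflated by (1 + log tf), then length-normalized.
raw_q = [p['tf_idf'] / (1 + math.log(p['tf_raw'])) for p in postings]
q_denom = math.sqrt(sum(w ** 2 for w in raw_q))
norm_q = [w / q_denom for w in raw_q]

# Document side: log-scaled tf divided by the stored document weight.
norm_d = [(1 + math.log(p['tf_raw'])) / doc_weight for p in postings]

cosine = sum(q * d for q, d in zip(norm_q, norm_d))   # np.dot equivalent
print(round(cosine, 4))   # higher cosine -> better-ranked URL
```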
templates/index.html

@@ -19,8 +19,11 @@
     </form>
     </div>
+    <p>{{ips}}</p>
     {% for result in results %}
-    <p>{{result[0]}} at <a href="{{result[1]}}">{{result[1]}}</a></p>
+    <p> <a href="{{result}}">{{result}}</a></p>
     {% endfor %}
     {% for error in errors %}