Everything done and ready to test
This commit is contained in:
parent 63c9bbee6f
commit 5fd5319ffb
README.md (43 lines changed)
@@ -1,39 +1,14 @@
# Search_Engine

Developing a mini search engine in Python using a reversed index, stemming, and other SEO implementations.

## Part 1: The Reversed-Index

Start the program by running `python3 launcher.py`.

A Flask web page will start.

If you do not have any index files, the web page will show you an error.

There is a button at the top of the page called Run Indexer.

THIS IS EXTREMELY TIME CONSUMING AND DANGEROUS. IT WILL DELETE THE INDEX IF YOU ALREADY HAVE ONE!

As a safeguard, you have to click the button five times in a row, across five different refreshes of the page.
### Create an inverted index for the corpus with data structures designed by you.

You can also create the index by running `python3 indexer.py`.

- Tokens: all alphanumeric sequences in the dataset.

After the indices are created, you can go ahead and search through them.
- Stop words: do not use stopping while indexing, i.e. use all words, even the frequently occurring ones.
- Stemming: use stemming for better textual matches. Suggestion: Porter stemming, but it is up to you to choose.
- Important text: text in bold (b, strong), in headings (h1, h2, h3), and in titles should be treated as more important than text in other places.

Verify which HTML tags are relevant for selecting the important words.
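A minimal sketch of the tokenize-then-stem treatment these bullets describe (alphanumeric tokens, no stop-word removal, Porter stemming; nltk assumed available):

```python
import re
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def tokenize_and_stem(text):
    # All alphanumeric runs count as tokens; stop words are kept on purpose.
    tokens = re.findall(r'[A-Za-z0-9]+', text.lower())
    return [stemmer.stem(t) for t in tokens]

# tokenize_and_stem("Running the indexers") -> ['run', 'the', 'index']
```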
### Building the inverted index

Now that you have been provided the HTML files to index, you may build your inverted index from them. The inverted index is simply a map with the token as a key and a list of its corresponding postings. A posting is the representation of the token's occurrence in a document. The posting typically (though not limited to) contains the following info (you are encouraged to think of other attributes that you could add to the index):

- The document name/id the token was found in.
- Its tf-idf score for that document (for MS1, add only the term frequency).
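For orientation, here is a minimal sketch of one way to lay out such an index in Python. The field names mirror the keys the search code in this commit reads (doc_id, url, tf_raw, tf_idf), but the exact attributes are up to you:

```python
from collections import defaultdict

def build_inverted_index(docs):
    """docs: iterable of (doc_id, url, tokens). Returns token -> list of postings."""
    index = defaultdict(list)
    for doc_id, url, tokens in docs:
        counts = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
        for token, tf in counts.items():
            # For MS1 only the raw term frequency is stored; tf_idf can be
            # filled in once document frequencies are known.
            index[token].append({'doc_id': doc_id, 'url': url,
                                 'tf_raw': tf, 'tf_idf': 0.0})
    return index

# Example:
# idx = build_inverted_index([(0, 'a.html', ['apple', 'pie', 'apple'])])
# idx['apple'] -> [{'doc_id': 0, 'url': 'a.html', 'tf_raw': 2, 'tf_idf': 0.0}]
```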
### Some tips:

- When designing your inverted index, you will think about the structure of your posting first.
- You would normally begin by implementing the code to calculate/fetch the elements which will constitute your posting.
- Modularize. Use scripts/classes that will perform a function or a set of closely related functions. This helps in keeping track of your progress, debugging, and also dividing work amongst teammates if you're in a group.
- We recommend you use GitHub as a mechanism to work with your team members on this project, but you are not required to do so.

Notably
indexer.py (30 lines changed)
@@ -19,7 +19,6 @@ from os.path import exists
# Data processing
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re
@@ -60,7 +59,7 @@ class Indexer():
        self.list_partials_lock = Lock()

        # Loading index_index into memory
-        if exists("merged_index_index"):
+        if exists("merged_index.index"):
            merged_index_index = open("merged_index.index", 'r')
            merged_index_index.seek(0, 0)
            json_value = merged_index_index.readline()
@@ -79,6 +78,19 @@ class Indexer():
                json_value = merged_index_index.readline()
            data = json.loads(json_value)
            self.index_index = dict(data['index'])
            return self.index_index
        else:
            print("Index files do not exist, please run the indexer first")
            return None

+    def load_weight_index(self):
+        if exists("docs.weight"):
+            weight_file = open("docs.weight", 'r')
+            weight_file.seek(0, 0)
+            json_value = weight_file.readline()
+            data = json.loads(json_value)
+            self.weight = data
+            return self.weight
+        else:
+            print("Index files do not exist, please run the indexer first")
+            return None
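load_weight_index and the get_weight change below replace the old read-the-file-per-lookup pattern: the weight table is parsed once into self.weight, and each lookup becomes a dict access. A minimal sketch of the same loading step with a context manager, assuming (as the diff does) that docs.weight holds a single JSON object on its first line mapping doc_id to document weight:

```python
import json
from os.path import exists

def load_weight_index(path="docs.weight"):
    if not exists(path):
        print("Index files do not exist, please run the indexer first")
        return None
    with open(path, 'r') as weight_file:  # context manager closes the handle
        return json.loads(weight_file.readline())
```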
@@ -118,15 +130,7 @@ class Indexer():
        weight_file.close()

    def get_weight(self, doc_id):
-        if exists('docs.weight'):
-            weight = open('docs.weight', 'r')
-            weight.seek(0, 0)
-            json_value = weight.readline()
-            data = json.loads(json_value)
-            return data[doc_id]
-        else:
-            print("Index files do not exist, please run the indexer first")
-            return None
+        return self.weight[doc_id]

    def get_data_path(self):
        for directory in os.listdir(self.path):
            for file in os.listdir(self.path + "/" + directory + "/"):
@@ -239,3 +243,7 @@ class Indexer():
        print("Finished merging into 1 big happy family")
        self.set_weight()
        print("I AM DONE INDEXING !")

+if __name__ == "__main__":
+    indexer = Indexer(list(), dict(), list())
+    indexer.create_index()
launcher.py (65 lines changed)
@@ -1,37 +1,70 @@
from indexer import Indexer
from search import Search
import time
from flask import Flask
from flask import render_template
from flask import request

app = Flask(__name__)
-errors = list()
-indexer = Indexer(list(), dict(), list())
+errors = None
+indexer = None
+search = None
+safe_guard = 1

+def get_data():
+    global indexer
+    indexer = Indexer(list(), dict(), list())
+
+    global search
+    search = Search(indexer)
+
+    global safe_guard
+    safe_guard = 1
+
+    global errors
+    errors = list()
+    if not indexer.load_index_index():
+        errors.append("Index of index is missing, probably should run the indexer")
+    if not indexer.load_weight_index():
+        errors.append("Weight index is missing, probably should run the indexer")
@app.route('/', methods=['POST', 'GET'])
def index():
-    errors = list()
-    if not indexer:
-        errors.append("Error in creating indexer module")
-    elif not indexer.load_index_index():
-        errors.append("Indexer does not exist, please run it first")
-    if not search:
-        errors.append("Error in creating search module")
+    global errors
+    global search
+    global indexer
+    global safe_guard
+    local_errors = errors

    if request.method == 'POST':
        if request.form.get('start-index') == "start":
-            print("Making the indexer")
-            indexer.create_index()
-            return render_template('index.html', ips="Thanks for waiting, you are ready to search.")
+            if safe_guard == 5:
+                safe_guard = 1
+                indexer.create_index()
+                indexer.load_index_index()
+                return render_template('index.html', ips="Thanks for waiting, you are ready to search.")
+            safe_guard = safe_guard + 1
+            return render_template('index.html', ips=str(safe_guard) + " DANGER! PROCEED ONLY IF YOU KNOW WHAT YOU ARE DOING; OTHERWISE STOP, THE INDEX MIGHT GET YEETED")
        if request.form.get('search_query') != "":
            search_query = request.form['search_query']
-            result = [['lorem', 'ipsi'], ['lores', 'dolores']]
-            return render_template('index.html', results=result, errors=errors)
-        return render_template('index.html', errors=errors)
+            result = search.search(search_query)
+            safe_guard = 1
+            errors = list()
+            return render_template('index.html', results=result, errors=local_errors)
+        safe_guard = 1
+        errors = list()
+        return render_template('index.html', errors=local_errors)
-    else:
-        return render_template('index.html', errors=errors)
+    safe_guard = 1
+    errors = list()
+    return render_template('index.html', errors=local_errors)
if __name__ == "__main__":
-    app.run(debug=True)
+    get_data()
+
+    app.run(debug=False)
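The Run Indexer button is guarded by the module-level safe_guard counter: the index is only rebuilt on the fifth consecutive confirmation, and any search resets the count. A standalone sketch of the same confirmation pattern (names here are illustrative, not from the repo):

```python
safe_guard = 1

def confirm_dangerous_action(run_action):
    """Run run_action only on the fifth consecutive confirmation."""
    global safe_guard
    if safe_guard == 5:
        safe_guard = 1  # reset: the next rebuild needs five clicks again
        run_action()
        return "Action completed."
    safe_guard += 1
    return str(safe_guard) + " of 5 - click again to proceed."
```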
search.py (new file, 279 lines)
@@ -0,0 +1,279 @@
# Data input
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle
import sys
import math
import numpy as np

sys.path.append('D:/Visual Studio Workspace')

# Data processing
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

import re
from indexer import Indexer

# Logging postings
from posting import Posting
from worker import Worker
import indexer
class Search():
    # Wrote the code for testing in the file searchtesting.py, so many of the variables and function calls are wrong.
    def __init__(self, indexer):
        self.indexer = indexer
        self.indexer.load_index_index()
        self.indexer.load_weight_index()
        self.stemmer = PorterStemmer()

    # Takes a list of posting lists and returns the indexes of the two queries
    # with the shortest posting lists, corresponding to the search temp list.
    def two_shortest(self, l_posting):
        short = []
        location = []
        for postings in l_posting:
            short.append(len(postings))

        for i in range(2):
            x = short.index(min(short))
            location.append(x)
            short[x] = float('inf')

        return location
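two_shortest exists because intersecting the two shortest posting lists first keeps every intermediate result small: an intersection can never be longer than its shorter input. For example:

```python
# Three posting lists of different lengths. two_shortest picks indexes 1 and 2
# (lengths 1 and 2), so the first merge touches at most 3 postings instead of
# starting from the 5-element list.
postings = [
    [{'doc_id': d} for d in (1, 2, 3, 5, 8)],  # len 5
    [{'doc_id': d} for d in (2,)],             # len 1
    [{'doc_id': d} for d in (2, 3)],           # len 2
]
# search.two_shortest(postings) -> [1, 2]
```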
    # len(list1) <= len(list2), so the code in this function works with that in mind.
    def merge(self, list1, list2):
        max = 0
        valid1 = []
        valid2 = []
        i = 0
        j = 0
        # TODO: optimize by having a pointer to the current index+4
        i4 = 3
        j4 = 3
        while i < len(list1) or j < len(list2):
            if j == len(list2):
                break
            if i == len(list1):
                break
            #if max == 40:
            #    break
            try:
                if i == len(list1)-1:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                        max += 1
                    elif list1[i]['doc_id'] >= list2[j4]['doc_id']:
                        j = j4
                        j4 = j + 3
                    elif list1[i4]['doc_id'] < list2[j]['doc_id'] and i4 < len(list1):
                        i = i4
                        i4 = i + 3
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                else:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                        max += 1
                    elif list1[i]['doc_id'] >= list2[j4]['doc_id'] and j4 < len(list2):
                        j = j4
                        j4 = j + 3
                    elif list1[i4]['doc_id'] < list2[j]['doc_id'] and i4 < len(list1):
                        i = i4
                        i4 = i + 3
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
            except:
                if i == len(list1)-1:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                else:
                    if list1[i]['doc_id'] == list2[j]['doc_id']:
                        valid1.append(list1[i])
                        valid2.append(list2[j])
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] < list2[j]['doc_id']:
                        i += 1
                        i4 += 1
                    elif list1[i]['doc_id'] > list2[j]['doc_id']:
                        j += 1
                        j4 += 1
                    else:
                        j += 1
                        j4 += 1
                        i += 1
                        i4 += 1
        # Since list1 is shorter it will hit its max index sooner, so in the cases
        # where it does we still need to go through list2 to see if the last element
        # of list1 appears anywhere in the rest of list2.

        return valid1, valid2
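merge above is a skip-pointer intersection: i4 and j4 peek three positions ahead so runs of non-matching doc ids can be skipped in blocks rather than one at a time. A compact sketch of the same idea, written fresh here rather than taken from the commit (assumes each list is sorted by doc_id with at most one posting per doc):

```python
def intersect_with_skips(list1, list2, step=3):
    """list1 and list2 are postings sorted by 'doc_id'; returns matching pairs."""
    valid1, valid2 = [], []
    i = j = 0
    while i < len(list1) and j < len(list2):
        d1, d2 = list1[i]['doc_id'], list2[j]['doc_id']
        if d1 == d2:
            valid1.append(list1[i])
            valid2.append(list2[j])
            i += 1
            j += 1
        elif d1 < d2:
            # Jump only if even the posting `step` ahead is still <= d2,
            # so no potential match can be skipped over.
            if i + step < len(list1) and list1[i + step]['doc_id'] <= d2:
                i += step
            else:
                i += 1
        else:
            if j + step < len(list2) and list2[j + step]['doc_id'] <= d1:
                j += step
            else:
                j += 1
    return valid1, valid2
```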
    # query is a list of stemmed tokens; returns a list of postings
    # (which we'll directly ignore except for the doc id).
    def search(self, query):
        tokens = word_tokenize(query)
        stemmed_tokens = list()
        for token in tokens:
            token = self.stemmer.stem(token)
            stemmed_tokens.append(token)

        query_valid_postings = dict()
        temp = []
        for token in stemmed_tokens:
            temp.append(self.indexer.get_postings(token))
            query_valid_postings[token] = []

        tic = perf_counter()
        l = self.two_shortest(temp)
        m = self.merge(temp[l[0]], temp[l[1]])
        if len(m[0]) == 0:
            return -1
        # Keep track of the valid postings for each query as we do the merge.
        first = stemmed_tokens[l[0]]
        query_valid_postings[first] = m[0]
        query_valid_postings[stemmed_tokens[l[1]]] = m[1]
        toc = perf_counter()
        print("first merge", toc-tic)
        tic = perf_counter()
        while len(temp) > 1:
            # Delete from temp the already merged lists.
            temp.pop(l[0])
            # Try and except since the temp length changes.
            try:
                temp.pop(l[1])
            except:
                temp.pop(l[1]-1)

            temp.append(m[0])

            # Delete and append on stemmed_tokens as well to keep it consistent with temp.
            stemmed_tokens.pop(l[0])
            try:
                stemmed_tokens.pop(l[1])
            except:
                stemmed_tokens.pop(l[1]-1)

            stemmed_tokens.append(None)

            l = self.two_shortest(temp)
            # Check whether the two indexes in l are the same.
            if len(set(l)) == 1:
                break
            else:
                m = self.merge(temp[l[0]], temp[l[1]])
                print(len(m[0]), len(m[1]))
                query_valid_postings[first] = m[0]
                query_valid_postings[stemmed_tokens[l[1]]] = m[1]
        toc = perf_counter()
        print("while loop", toc-tic)
        tic = perf_counter()
        # Create a list of doc ids from the correctly merged postings for cross checking.
        merge = []
        for posting in query_valid_postings[first]:
            merge.append(posting['doc_id'])

        # Cross check each query's valid postings list against the correct merged set,
        # which we denoted as first.
        for token, postings in query_valid_postings.items():
            if token == first:
                continue
            else:
                print(token)
                for p in list(postings):  # iterate over a copy, since we remove from postings below
                    if p['doc_id'] not in merge:
                        postings.remove(p)

        toc = perf_counter()
        print(toc-tic)

        for token, postings in query_valid_postings.items():
            print(token, len(postings))
        tic = perf_counter()
        results = []

        for i in range(len(query_valid_postings[first])):
            q_denom = 0
            norm_q = []
            norm_d = []

            for q in query_valid_postings.keys():
                q_denom += (query_valid_postings[q][i]['tf_idf']/(1 + math.log(query_valid_postings[q][i]['tf_raw'])))**2
            q_denom = math.sqrt(q_denom)

            for q in query_valid_postings.keys():
                x = query_valid_postings[q][i]['tf_idf']/(1 + math.log(query_valid_postings[q][i]['tf_raw']))/q_denom
                norm_q.append(x)
                y = (1 + math.log(query_valid_postings[q][i]['tf_raw']))/self.indexer.get_weight(query_valid_postings[q][i]['doc_id'])
                norm_d.append(y)
            results.append({'url': query_valid_postings[first][i]['url'], 'cosine': np.dot(norm_q, norm_d)})

        results = sorted(results, key=lambda x: x['cosine'], reverse=True)
        finalresults = []
        for i in range(min(20, len(results))):  # guard against fewer than 20 hits
            finalresults.append(results[i]['url'])
        print(finalresults)
        return finalresults
templates/index.html
@@ -19,8 +19,11 @@
</form>
</div>

+<p>{{ips}}</p>

{% for result in results %}
-<p>{{result[0]}} at <a href="{{result[1]}}">{{result[1]}}</a></p>
+<p> <a href="{{result}}">{{result}}</a></p>
{% endfor %}

{% for error in errors %}