Basic web-gui

This commit is contained in:
inocturnis
2022-05-27 17:01:35 -07:00
parent 0b127af5a9
commit 107d1b2a46
5 changed files with 134 additions and 50 deletions

File diff suppressed because one or more lines are too long

View File

@@ -14,12 +14,7 @@ import json
import os import os
import shelve import shelve
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from time import perf_counter from os.path import exists
import time
import threading
from threading import Lock
import math
#Data process #Data process
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
@@ -27,13 +22,23 @@ from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import re import re
import math
#Logging postings #Logging postings
from posting import Posting from posting import Posting
from worker import Worker from worker import Worker
#Multi-threading
import threading
from threading import Lock
#Performance
from time import perf_counter
import time
class Node(): class Node():
index_value = '' index_value = ''
postings = list() postings = list()
@@ -53,10 +58,30 @@ class Indexer():
self.stemmer = PorterStemmer() self.stemmer = PorterStemmer()
self.data_paths_lock = Lock() self.data_paths_lock = Lock()
self.list_partials_lock = Lock() self.list_partials_lock = Lock()
#Loading index_index into memory
if exists("merged_index_index"):
merged_index_index = open("merged_index.index",'r')
merged_index_index.seek(0,0)
json_value = merged_index_index.readline()
data = json.loads(json_value)
self.index_index = dict(data['index'])
else:
self.index_index = dict()
self.workers = list() self.workers = list()
self.worker_factory = worker_factory self.worker_factory = worker_factory
def load_index_index(self):
    """Load the index lookup table from ``merged_index.index`` into memory.

    Reads the first line of the file, parses it as JSON and stores the
    value found under its ``'index'`` key in ``self.index_index`` as a dict.

    Returns:
        None. When the file does not exist a message is printed and
        ``self.index_index`` is left untouched.
    """
    try:
        # The context manager guarantees the handle is closed; the previous
        # version opened the file and never released it.
        with open("merged_index.index", 'r') as merged_index_index:
            json_value = merged_index_index.readline()
    except FileNotFoundError:
        print("Index files do not exists, please run the indexer first")
        return None
    data = json.loads(json_value)
    self.index_index = dict(data['index'])
def start_async(self): def start_async(self):
self.workers = [ self.workers = [
@@ -75,17 +100,16 @@ class Indexer():
def get_postings(self,index):
    """Return the postings stored for *index* in ``merged_index.full``.

    Args:
        index: Key looked up in ``self.index_index`` to obtain the position
            to seek to inside ``merged_index.full``.

    Returns:
        The ``'postings'`` value of the JSON line read at that position, or
        None (after printing a message) when ``merged_index.full`` does not
        exist.

    Raises:
        KeyError: If *index* is not present in ``self.index_index``.
    """
    try:
        # Open first, then look the key up, so a missing file is still
        # reported before a missing key; ``with`` closes the handle that the
        # previous version leaked on every call.
        with open("merged_index.full", 'r') as merged_index:
            to_seek = self.index_index[index]
            merged_index.seek(to_seek, 0)
            json_value = merged_index.readline()
    except FileNotFoundError:
        print("Index files do not exists, please run the indexer first")
        return None
    data = json.loads(json_value)
    return data['postings']
def set_weight(self): def set_weight(self):
weight_file = open('docs.weight','w') weight_file = open('docs.weight','w')
@@ -94,12 +118,15 @@ class Indexer():
weight_file.close() weight_file.close()
def get_weight(self,doc_id):
    """Return the weight recorded for *doc_id* in ``docs.weight``.

    Args:
        doc_id: Key to look up in the JSON object stored on the first line
            of ``docs.weight``.

    Returns:
        The value stored under *doc_id*, or None (after printing a message)
        when ``docs.weight`` does not exist.

    Raises:
        KeyError: If *doc_id* is not a key of the stored object.
    """
    try:
        # ``with`` closes the handle; the previous version never did.
        with open('docs.weight', 'r') as weight:
            json_value = weight.readline()
    except FileNotFoundError:
        print("Index files do not exists, please run the indexer first")
        return None
    data = json.loads(json_value)
    return data[doc_id]
def get_data_path(self): def get_data_path(self):
for directory in os.listdir(self.path): for directory in os.listdir(self.path):
for file in os.listdir(self.path + "/" + directory + "/"): for file in os.listdir(self.path + "/" + directory + "/"):
@@ -202,28 +229,13 @@ class Indexer():
merged_index_index.close() merged_index_index.close()
merged_index.close() merged_index.close()
load_index_index()
def create_index(self):
    """Run the indexing pipeline on this indexer instance.

    Calls, in order, ``get_data_path``, ``start``, ``merge`` and
    ``set_weight`` on ``self``, printing progress messages along the way.

    Returns:
        None.
    """
    # The steps must go through ``self``: the module-level ``indexer``
    # variable only existed inside the removed ``main`` function, so
    # referring to it here would raise NameError.
    self.get_data_path()
    print("We have " + str(len(self.data_paths)) + " documents to go through !" )
    self.start()
    self.merge()
    print("Finished merging into 1 big happy family")
    self.set_weight()
    print("I AM DONE INDEXING !")

31
launcher.py Normal file
View File

@@ -0,0 +1,31 @@
from indexer import Indexer
import time
from flask import Flask
from flask import render_template
from flask import request
app = Flask(__name__)
def main():
    """Build an Indexer, load its index lookup table and create a Search.

    NOTE(review): ``Search`` is not among the imports visible in this file;
    confirm where it is meant to come from. Both objects are kept only in
    local variables of this function.
    """
    engine = Indexer(False, [], {}, [])
    engine.load_index_index()
    searcher = Search()
@app.route('/',methods=['POST','GET'])
def index():
    """Serve the search page and handle its two forms.

    GET renders the bare page. A POST whose ``start-index`` field equals
    ``"start"`` renders the page with the "ready to search" message; a POST
    with a non-empty ``search_query`` renders the page with (currently
    placeholder) results; any other POST renders the bare page.
    """
    if request.method != 'POST':
        return render_template('index.html')

    if request.form.get('start-index') == "start":
        print("make the indexer")
        return render_template('index.html',ips="Thanks for waiting you are ready to search.")

    # ``.get()`` returns None when the field is absent. The previous check
    # (``!= ""``) let None through and then indexed ``request.form`` directly,
    # which fails for a POST that carries no ``search_query`` field at all.
    query = request.form.get('search_query')
    if query:
        # Placeholder rows until the real search backend is wired in.
        result = [['lorem','ipsi'],['lores','dolores']]
        return render_template('index.html',results=result)

    return render_template('index.html')
# Script entry point: start the Flask development server, then call main().
# NOTE(review): app.run() normally blocks until the server stops, so main()
# would only execute after shutdown -- confirm this ordering is intended.
if __name__ == "__main__":
    app.run(debug=True)
    main()

16
templates/base.html Normal file
View File

@@ -0,0 +1,16 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width,initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content = "ie=edge">
<title> Scruffy Search Engine </title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.0-beta1/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-0evHe/X+R7YkIZDRvuzKMRqM+OrBnVFBL6DOitfPri4tjfHxaWutUpFmBp4vmVor" crossorigin="anonymous">
</head>
<body>
{% block content %}
{% endblock %}
</body>
</html>

25
templates/index.html Normal file
View File

@@ -0,0 +1,25 @@
{% extends 'base.html' %}
{% block content %}
<h1 align="center">Scruffy Search Engine</h1>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.2.0-beta1/dist/js/bootstrap.bundle.min.js" integrity="sha384-pprn3073KE6tl6bjs2QrFaJGz5/SUsLqktiwsUTF55Jfv3qYSDhgCecCxMW52nD2" crossorigin="anonymous"></script>
<div class="d-flex justify-content-center">
<form method="post" action="/">
<button align="center" type="submit" value ="start" name="start-index" class="btn btn-outline-primary">Run Indexer</button>
</form>
</div>
<div class="d-flex justify-content-center">
<img src="https://theinfosphere.org/images/9/96/Scruffy_promo.jpg" class="img-fluid">
</div>
<div class="d-flex justify-content-center input-group mb-3">
<form method='POST' action=" {{ url_for('index') }} " enctype="multipart/form-data">
<input type="text" class="form-control" placeholder="Search Query" aria-label="Search Query" aria-describedby="basic-addon1" name='search_query'>
<button type="submit" name="start-index" class="btn btn-primary">Search</button>
</form>
</div>
{% for result in results %}
<p>{{result[0]}} at <a href="{{result[1]}}">{{result[1]}}</a></p>
{% endfor %}
{% endblock %}