Basic web-gui
This commit is contained in:
parent
0b127af5a9
commit
107d1b2a46
File diff suppressed because one or more lines are too long
76
indexer.py
76
indexer.py
@ -14,12 +14,7 @@ import json
|
|||||||
import os
|
import os
|
||||||
import shelve
|
import shelve
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from time import perf_counter
|
from os.path import exists
|
||||||
import time
|
|
||||||
import threading
|
|
||||||
from threading import Lock
|
|
||||||
import math
|
|
||||||
|
|
||||||
|
|
||||||
#Data process
|
#Data process
|
||||||
from nltk.tokenize import word_tokenize
|
from nltk.tokenize import word_tokenize
|
||||||
@ -27,13 +22,23 @@ from nltk.stem import PorterStemmer
|
|||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
import math
|
||||||
|
|
||||||
#Logging postings
|
#Logging postings
|
||||||
from posting import Posting
|
from posting import Posting
|
||||||
from worker import Worker
|
from worker import Worker
|
||||||
|
|
||||||
|
#Multi-threading
|
||||||
|
import threading
|
||||||
|
from threading import Lock
|
||||||
|
|
||||||
|
#Performance
|
||||||
|
from time import perf_counter
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Node():
|
class Node():
|
||||||
index_value = ''
|
index_value = ''
|
||||||
postings = list()
|
postings = list()
|
||||||
@ -54,9 +59,29 @@ class Indexer():
|
|||||||
self.data_paths_lock = Lock()
|
self.data_paths_lock = Lock()
|
||||||
self.list_partials_lock = Lock()
|
self.list_partials_lock = Lock()
|
||||||
|
|
||||||
|
#Loading index_index into memory
|
||||||
|
if exists("merged_index_index"):
|
||||||
|
merged_index_index = open("merged_index.index",'r')
|
||||||
|
merged_index_index.seek(0,0)
|
||||||
|
json_value = merged_index_index.readline()
|
||||||
|
data = json.loads(json_value)
|
||||||
|
self.index_index = dict(data['index'])
|
||||||
|
else:
|
||||||
|
self.index_index = dict()
|
||||||
|
|
||||||
self.workers = list()
|
self.workers = list()
|
||||||
self.worker_factory = worker_factory
|
self.worker_factory = worker_factory
|
||||||
|
|
||||||
|
def load_index_index(self):
    """Load the index-of-the-index from ``merged_index.index`` into memory.

    The first line of the file is parsed as JSON and its ``'index'`` entry
    is stored, as a dict, on ``self.index_index``.  If the file is missing,
    a message is printed and ``self.index_index`` is left untouched.

    Returns:
        None in every case.

    Raises:
        json.JSONDecodeError: if the first line is not valid JSON.
        KeyError: if the parsed object has no ``'index'`` entry.
    """
    try:
        # The context manager guarantees the handle is closed; a freshly
        # opened file is already positioned at offset 0, so no seek is needed.
        with open("merged_index.index", 'r') as merged_index_index:
            json_value = merged_index_index.readline()
    except FileNotFoundError:
        print("Index files do not exists, please run the indexer first")
        return None
    data = json.loads(json_value)
    self.index_index = dict(data['index'])
|
||||||
|
|
||||||
def start_async(self):
|
def start_async(self):
|
||||||
self.workers = [
|
self.workers = [
|
||||||
@ -75,17 +100,16 @@ class Indexer():
|
|||||||
|
|
||||||
|
|
||||||
def get_postings(self,index):
    """Return the postings stored for ``index`` in ``merged_index.full``.

    ``self.index_index[index]`` is used as the position to seek to in
    ``merged_index.full``; the line read from there is parsed as JSON and
    its ``'postings'`` entry is returned.

    Args:
        index: key looked up in ``self.index_index``.

    Returns:
        The ``'postings'`` value of the JSON line, or None (after printing
        a message) when ``merged_index.full`` does not exist.

    Raises:
        KeyError: if ``index`` is not present in ``self.index_index``.
    """
    try:
        # Open first, then look up the offset, so a missing file is reported
        # before a missing key; ``with`` closes the handle on every path.
        with open("merged_index.full", 'r') as merged_index:
            to_seek = self.index_index[index]
            merged_index.seek(to_seek, 0)
            json_value = merged_index.readline()
    except FileNotFoundError:
        print("Index files do not exists, please run the indexer first")
        return None
    data = json.loads(json_value)
    return data['postings']
|
||||||
|
|
||||||
def set_weight(self):
|
def set_weight(self):
|
||||||
weight_file = open('docs.weight','w')
|
weight_file = open('docs.weight','w')
|
||||||
@ -94,12 +118,15 @@ class Indexer():
|
|||||||
weight_file.close()
|
weight_file.close()
|
||||||
|
|
||||||
def get_weight(self,doc_id):
    """Return the entry stored for ``doc_id`` in ``docs.weight``.

    The first line of ``docs.weight`` is parsed as JSON and indexed with
    ``doc_id``.

    Args:
        doc_id: key to look up in the parsed JSON object.

    Returns:
        The value stored under ``doc_id``, or None (after printing a
        message) when ``docs.weight`` does not exist.

    Raises:
        KeyError: if ``doc_id`` is not a key of the parsed object.
    """
    try:
        # ``with`` closes the handle; opening directly (instead of checking
        # for existence first) avoids a check-then-open race.
        with open('docs.weight','r') as weight:
            json_value = weight.readline()
    except FileNotFoundError:
        print("Index files do not exists, please run the indexer first")
        return None
    data = json.loads(json_value)
    return data[doc_id]
|
||||||
def get_data_path(self):
|
def get_data_path(self):
|
||||||
for directory in os.listdir(self.path):
|
for directory in os.listdir(self.path):
|
||||||
for file in os.listdir(self.path + "/" + directory + "/"):
|
for file in os.listdir(self.path + "/" + directory + "/"):
|
||||||
@ -202,28 +229,13 @@ class Indexer():
|
|||||||
|
|
||||||
merged_index_index.close()
|
merged_index_index.close()
|
||||||
merged_index.close()
|
merged_index.close()
|
||||||
|
load_index_index()
|
||||||
|
|
||||||
|
def create_index(self):
    """Run the indexing steps on this indexer, in order.

    Calls ``get_data_path``, ``start``, ``merge`` and ``set_weight`` on
    ``self``, printing progress messages between the steps.  The steps were
    previously driven through a module-level ``indexer`` variable; inside a
    method that name is undefined, so everything goes through ``self``.

    Returns:
        None.
    """
    self.get_data_path()
    print("We have " + str(len(self.data_paths)) + " documents to go through !" )
    self.start()
    self.merge()
    print("Finished merging into 1 big happy family")
    self.set_weight()
    print("I AM DONE INDEXING !")
|
||||||
tic = time.perf_counter()
|
|
||||||
indexer.get_postings('artifici')
|
|
||||||
toc = time.perf_counter()
|
|
||||||
print(f"Took {toc - tic:0.4f} seconds to get postings for artifici")
|
|
||||||
tic = time.perf_counter()
|
|
||||||
indexer.get_weight('00ba3af6a00b7cfb4928e5d234342c5dc46b4e31714d4a8f315a2dd4d8e49860')
|
|
||||||
print(f"Took {toc - tic:0.4f} seconds to get weight for some random page ")
|
|
||||||
toc = time.perf_counter()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
|
31
launcher.py
Normal file
31
launcher.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
from indexer import Indexer
|
||||||
|
import time
|
||||||
|
from flask import Flask
|
||||||
|
from flask import render_template
|
||||||
|
from flask import request
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
def main():
    """Create an Indexer, load its index-of-the-index, then create a Search.

    NOTE(review): ``Search`` is not among this file's imports, so the last
    line presumably raises NameError until it is imported -- confirm where
    ``Search`` is meant to come from.  Both objects are local to this
    function and are not returned.
    """
    idx = Indexer(False, [], {}, [])
    idx.load_index_index()
    engine = Search()
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/',methods=['POST','GET'])
def index():
    """Serve the search page and handle its two POST actions.

    GET renders ``index.html`` as-is.  On POST:

    * ``start-index == "start"`` prints a placeholder message and renders
      the page with an ``ips`` status message;
    * a non-empty ``search_query`` renders the page with ``results``
      (currently a hard-coded placeholder list);
    * anything else renders the plain page.
    """
    if request.method != 'POST':
        return render_template('index.html')

    if request.form.get('start-index') == "start":
        print("make the indexer")
        return render_template('index.html',ips="Thanks for waiting you are ready to search.")

    # .get() yields None when the field is absent; testing truthiness treats
    # both a missing and an empty field as "no query" instead of falling
    # through to a failing request.form['search_query'] lookup.
    search = request.form.get('search_query')
    if search:
        result = [['lorem','ipsi'],['lores','dolores']]
        return render_template('index.html',results=result)
    return render_template('index.html')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # NOTE(review): Flask's app.run() presumably blocks until the server
    # stops, which would mean main() below only runs after shutdown --
    # confirm the intended order before relying on main()'s setup.
    app.run(debug=True)
    main()
|
16
templates/base.html
Normal file
16
templates/base.html
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width,initial-scale=1.0">
|
||||||
|
<meta http-equiv="X-UA-Compatible" content = "ie=edge">
|
||||||
|
<title> Scruffy Search Engine </title>
|
||||||
|
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.0-beta1/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-0evHe/X+R7YkIZDRvuzKMRqM+OrBnVFBL6DOitfPri4tjfHxaWutUpFmBp4vmVor" crossorigin="anonymous">
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
{% block content %}
|
||||||
|
|
||||||
|
{% endblock %}
|
||||||
|
</body>
|
||||||
|
|
||||||
|
</html>
|
25
templates/index.html
Normal file
25
templates/index.html
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
{% extends 'base.html' %}
|
||||||
|
|
||||||
|
{% block content %}
|
||||||
|
<h1 align="center">Scruffy Search Engine</h1>
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.2.0-beta1/dist/js/bootstrap.bundle.min.js" integrity="sha384-pprn3073KE6tl6bjs2QrFaJGz5/SUsLqktiwsUTF55Jfv3qYSDhgCecCxMW52nD2" crossorigin="anonymous"></script>
|
||||||
|
<div class="d-flex justify-content-center">
|
||||||
|
<form method="post" action="/">
|
||||||
|
|
||||||
|
<button align="center" type="submit" value ="start" name="start-index" class="btn btn-outline-primary">Run Indexer</button>
</form>
|
||||||
|
</div>
|
||||||
|
<div class="d-flex justify-content-center">
|
||||||
|
<img src="https://theinfosphere.org/images/9/96/Scruffy_promo.jpg" class="img-fluid">
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<div class="d-flex justify-content-center input-group mb-3">
|
||||||
|
<form method='POST' action=" {{ url_for('index') }} " enctype="multipart/form-data">
|
||||||
|
<input type="text" class="form-control" placeholder="Search Query" aria-label="Search Query" aria-describedby="basic-addon1" name='search_query'>
|
||||||
|
<button type="submit" name="start-index" class="btn btn-primary">Search</button>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{% for result in results %}
|
||||||
|
<p>{{result[0]}} at <a href="{{result[1]}}">{{result[1]}}</a></p>
|
||||||
|
{% endfor %}
|
||||||
|
{% endblock %}
|
Loading…
Reference in New Issue
Block a user