Basic web-gui
This commit is contained in:
parent
0b127af5a9
commit
107d1b2a46
File diff suppressed because one or more lines are too long
110
indexer.py
110
indexer.py
@ -14,12 +14,7 @@ import json
|
||||
import os
|
||||
import shelve
|
||||
from bs4 import BeautifulSoup
|
||||
from time import perf_counter
|
||||
import time
|
||||
import threading
|
||||
from threading import Lock
|
||||
import math
|
||||
|
||||
from os.path import exists
|
||||
|
||||
#Data process
|
||||
from nltk.tokenize import word_tokenize
|
||||
@ -27,13 +22,23 @@ from nltk.stem import PorterStemmer
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
import re
|
||||
import math
|
||||
|
||||
#Logging postings
|
||||
from posting import Posting
|
||||
from worker import Worker
|
||||
|
||||
#Multi-threading
|
||||
import threading
|
||||
from threading import Lock
|
||||
|
||||
#Performance
|
||||
from time import perf_counter
|
||||
import time
|
||||
|
||||
|
||||
|
||||
class Node():
|
||||
index_value = ''
|
||||
postings = list()
|
||||
@ -53,10 +58,30 @@ class Indexer():
|
||||
self.stemmer = PorterStemmer()
|
||||
self.data_paths_lock = Lock()
|
||||
self.list_partials_lock = Lock()
|
||||
|
||||
|
||||
#Loading index_index into memory
|
||||
if exists("merged_index_index"):
|
||||
merged_index_index = open("merged_index.index",'r')
|
||||
merged_index_index.seek(0,0)
|
||||
json_value = merged_index_index.readline()
|
||||
data = json.loads(json_value)
|
||||
self.index_index = dict(data['index'])
|
||||
else:
|
||||
self.index_index = dict()
|
||||
|
||||
self.workers = list()
|
||||
self.worker_factory = worker_factory
|
||||
|
||||
def load_index_index(self):
    """Load the lookup table stored in ``merged_index.index`` into memory.

    The first line of the file is parsed as JSON and its ``'index'`` entry
    is stored, as a dict, on ``self.index_index``.

    Returns:
        None in every case. When ``merged_index.index`` does not exist a
        hint is printed and ``self.index_index`` is left untouched.
    """
    try:
        # The context manager releases the handle even if reading fails;
        # a freshly opened file already starts at offset 0.
        with open("merged_index.index", 'r') as merged_index_index:
            json_value = merged_index_index.readline()
    except FileNotFoundError:
        print("Index files do not exists, please run the indexer first")
        return None

    data = json.loads(json_value)
    self.index_index = dict(data['index'])
    return None
|
||||
|
||||
def start_async(self):
|
||||
self.workers = [
|
||||
@ -75,17 +100,16 @@ class Indexer():
|
||||
|
||||
|
||||
def get_postings(self,index):
    """Return the postings stored for one token of the merged index.

    ``self.index_index[index]`` gives the offset of the token's line inside
    ``merged_index.full``; that line is parsed as JSON and its
    ``'postings'`` entry is returned. The in-memory table is used instead
    of re-reading ``merged_index.index`` on every call.

    Args:
        index: token to look up; must be a key of ``self.index_index``.

    Returns:
        The ``'postings'`` value of the token's record, or None (after
        printing a hint) when ``merged_index.full`` does not exist.

    Raises:
        KeyError: if ``index`` is not a key of ``self.index_index``.
    """
    try:
        # Open first, then resolve the offset, so a missing file is
        # reported before an unknown token is.
        with open("merged_index.full", 'r') as merged_index:
            merged_index.seek(self.index_index[index], 0)
            json_value = merged_index.readline()
    except FileNotFoundError:
        print("Index files do not exists, please run the indexer first")
        return None

    return json.loads(json_value)['postings']
|
||||
|
||||
def set_weight(self):
|
||||
weight_file = open('docs.weight','w')
|
||||
@ -94,12 +118,15 @@ class Indexer():
|
||||
weight_file.close()
|
||||
|
||||
def get_weight(self,doc_id):
    """Return the weight recorded for ``doc_id`` in ``docs.weight``.

    The first line of ``docs.weight`` is parsed as JSON and indexed by
    ``doc_id``.

    Args:
        doc_id: key to look up in the parsed JSON object.

    Returns:
        The value stored under ``doc_id``, or None (after printing a hint)
        when ``docs.weight`` does not exist.

    Raises:
        KeyError: if ``doc_id`` is not present in the file.
    """
    try:
        # The context manager closes the handle on every path.
        with open('docs.weight','r') as weight:
            json_value = weight.readline()
    except FileNotFoundError:
        print("Index files do not exists, please run the indexer first")
        return None

    return json.loads(json_value)[doc_id]
|
||||
def get_data_path(self):
|
||||
for directory in os.listdir(self.path):
|
||||
for file in os.listdir(self.path + "/" + directory + "/"):
|
||||
@ -202,28 +229,13 @@ class Indexer():
|
||||
|
||||
merged_index_index.close()
|
||||
merged_index.close()
|
||||
load_index_index()
|
||||
|
||||
|
||||
def main():
    """Build the index end to end, then time two sample lookups.

    Runs path discovery, indexing, merging and weight computation on a
    fresh ``Indexer``, then prints how long one ``get_postings`` call and
    one ``get_weight`` call take.
    """
    indexer = Indexer(list(),dict(),list())
    indexer.get_data_path()
    print("We have " + str(len(indexer.data_paths)) + " documents to go through !" )
    indexer.start()
    indexer.merge()
    print("Finished merging into 1 big happy family")
    indexer.set_weight()

    tic = time.perf_counter()
    indexer.get_postings('artifici')
    toc = time.perf_counter()
    print(f"Took {toc - tic:0.4f} seconds to get postings for artifici")

    tic = time.perf_counter()
    indexer.get_weight('00ba3af6a00b7cfb4928e5d234342c5dc46b4e31714d4a8f315a2dd4d8e49860')
    # Stop the clock before reporting; printing first would subtract the new
    # start time from the previous measurement's end time.
    toc = time.perf_counter()
    print(f"Took {toc - tic:0.4f} seconds to get weight for some random page ")
|
||||
|
||||
|
||||
|
||||
|
||||
# Script entry point: run main() only when this module is executed directly.
if __name__ == "__main__":
    main()
|
||||
def create_index(self):
    """Run the whole indexing pipeline on this instance.

    Calls, in order, ``get_data_path``, ``start``, ``merge`` and
    ``set_weight`` on ``self``, printing progress messages along the way.
    Operates on ``self`` rather than on a module-level ``indexer`` name,
    which is not defined inside this function.
    """
    self.get_data_path()
    print("We have " + str(len(self.data_paths)) + " documents to go through !" )
    self.start()
    self.merge()
    print("Finished merging into 1 big happy family")
    self.set_weight()
    print("I AM DONE INDEXING !")
|
||||
|
31
launcher.py
Normal file
31
launcher.py
Normal file
@ -0,0 +1,31 @@
|
||||
from indexer import Indexer
|
||||
import time
|
||||
from flask import Flask
|
||||
from flask import render_template
|
||||
from flask import request
|
||||
|
||||
# Flask application object; the '/' route handler in this file registers on it.
app = Flask(__name__)
|
||||
def main():
    """Create an Indexer, load its lookup table, and instantiate Search.

    NOTE(review): ``Search`` is not among the imports visible for this
    file, and both objects are locals that are dropped when the function
    returns - confirm that this is intended.
    """
    idx = Indexer(False,list(),dict(),list())
    idx.load_index_index()
    engine = Search()
|
||||
|
||||
|
||||
@app.route('/',methods=['POST','GET'])
def index():
    """Serve the search page and handle its form submissions.

    GET renders ``index.html`` as-is. A POST whose ``start-index`` field is
    ``"start"`` logs a message and renders the page with a confirmation in
    ``ips``. A POST carrying a non-empty ``search_query`` renders the page
    with a ``results`` list. Any other POST renders the plain page.
    """
    if request.method != 'POST':
        return render_template('index.html')

    if request.form.get('start-index') == "start":
        print("make the indexer")
        return render_template('index.html',ips="Thanks for waiting you are ready to search.")

    # .get() yields None when the field is absent; a truthiness test rejects
    # both None and "" so a POST without the field no longer falls into a
    # failing request.form['search_query'] lookup.
    query = request.form.get('search_query')
    if query:
        # Hard-coded rows; the submitted query is not used to build them yet.
        result = [['lorem','ipsi'],['lores','dolores']]
        return render_template('index.html',results=result)

    return render_template('index.html')
|
||||
|
||||
|
||||
# Script entry point for the web GUI.
if __name__ == "__main__":
    # NOTE(review): app.run() presumably blocks while the server is up, so
    # main() would only be reached after it stops - confirm the intended order.
    app.run(debug=True)
    main()
|
16
templates/base.html
Normal file
16
templates/base.html
Normal file
@ -0,0 +1,16 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width,initial-scale=1.0">
|
||||
<meta http-equiv="X-UA-Compatible" content = "ie=edge">
|
||||
<title> Scruffy Search Engine </title>
|
||||
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.0-beta1/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-0evHe/X+R7YkIZDRvuzKMRqM+OrBnVFBL6DOitfPri4tjfHxaWutUpFmBp4vmVor" crossorigin="anonymous">
|
||||
</head>
|
||||
<body>
|
||||
{% block content %}
|
||||
|
||||
{% endblock %}
|
||||
</body>
|
||||
|
||||
</html>
|
25
templates/index.html
Normal file
25
templates/index.html
Normal file
@ -0,0 +1,25 @@
|
||||
{% extends 'base.html' %}
|
||||
|
||||
{% block content %}
|
||||
<h1 align="center">Scruffy Search Engine</h1>
|
||||
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.2.0-beta1/dist/js/bootstrap.bundle.min.js" integrity="sha384-pprn3073KE6tl6bjs2QrFaJGz5/SUsLqktiwsUTF55Jfv3qYSDhgCecCxMW52nD2" crossorigin="anonymous"></script>
|
||||
<div class="d-flex justify-content-center">
|
||||
<form method="post" action="/">
|
||||
|
||||
<button align="center" type="submit" value ="start" name="start-index" class="btn btn-outline-primary">Run Indexer</button>
|
||||
</form>
</div>
|
||||
<div class="d-flex justify-content-center">
|
||||
<img src="https://theinfosphere.org/images/9/96/Scruffy_promo.jpg" class="img-fluid">
|
||||
|
||||
</div>
|
||||
<div class="d-flex justify-content-center input-group mb-3">
|
||||
<form method='POST' action=" {{ url_for('index') }} " enctype="multipart/form-data">
|
||||
<input type="text" class="form-control" placeholder="Search Query" aria-label="Search Query" aria-describedby="basic-addon1" name='search_query'>
|
||||
<button type="submit" name="start-index" class="btn btn-primary">Search</button>
|
||||
</form>
|
||||
</div>
|
||||
|
||||
{% for result in results %}
|
||||
<p>{{result[0]}} at <a href="{{result[1]}}">{{result[1]}}</a></p>
|
||||
{% endfor %}
|
||||
{% endblock %}
|
Loading…
Reference in New Issue
Block a user