Added way to save ngrams to index
This commit is contained in:
63
search.py
Normal file
63
search.py
Normal file
@@ -0,0 +1,63 @@
|
||||
#Data input
|
||||
import json
|
||||
import os
|
||||
import shelve
|
||||
from bs4 import BeautifulSoup
|
||||
from time import perf_counter
|
||||
import time
|
||||
import threading
|
||||
|
||||
|
||||
#Data process
|
||||
from nltk.tokenize import word_tokenize
|
||||
from nltk.stem import PorterStemmer
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
import re
|
||||
|
||||
#Logging postings
|
||||
from posting import Posting
|
||||
from worker import Worker
|
||||
|
||||
class Search():
    """Query-side access to the sharded shelve index built by the crawler."""

    def __init__(self):
        # One shelve store per bucket; get_save_file() routes a token to
        # the store whose bucket covers the token's first character.
        for shard in range(1, 6):
            setattr(self, f"save_{shard}", shelve.open(f"save_{shard}.shelve"))
|
||||
|
||||
def get_save_file(self, word):
|
||||
word_lower = word.lower()
|
||||
|
||||
if re.match(r"^[a-d0-1].*", word_lower):
|
||||
return self.save_1
|
||||
elif re.match(r"^[e-k2-3].*", word_lower):
|
||||
return self.save_2
|
||||
elif re.match(r"^[l-q4-7].*", word_lower):
|
||||
return self.save_3
|
||||
elif re.match(r"^[r-z8-9].*", word_lower):
|
||||
return self.save_4
|
||||
else:
|
||||
return self.save_5
|
||||
|
||||
def get_userinput():
|
||||
return
|
||||
|
||||
    def get_tf_idf(self, words):
        # Presumably meant to vectorize `words` into 1..3-gram TF-IDF
        # features (the commit says "save ngrams to index") — TODO confirm.
        #
        # NOTE(review): this method is incomplete as committed:
        #   * the `try:` below is never closed with an `except`/`finally`,
        #     which is a SyntaxError in Python;
        #   * `tfidf` is created but never used;
        #   * nothing is returned.
        try:
            tfidf = TfidfVectorizer(ngram_range=(1,3))

            # NOTE(review): indentation in the original render is ambiguous;
            # `search` is shown here nested inside get_tf_idf, which is the
            # only placement where its use of `self` resolves (closure over
            # the enclosing method's `self`) — confirm against the repo.
            def search(query):
                x = [query]

                # NOTE(review): get_save_file requires a `word` argument;
                # this call passes none and would raise TypeError at runtime.
                # `x` is built but never used.
                file = self.get_save_file()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user