Search_Engine/indexer.py

#The indexer splits into four parts:
#1. Load the document files from disk (see get_data below).
#2. Tokenize the text into alphanumeric sequences.
#3. Stem each token (sketch after the imports).
#4. Treat text in bold, headings, and other titles as more important
#   (sketch before main).
#Index structure: a hashmap from token -> postings; each posting carries the
#tf-idf score and the name/id of the document the token was found in.
#We use shelves to hold the data on disk.
#Posting ---> source of file, tf-idf score. For now we only use these two
#fields; the posting will change as the indexer gets more complex
#(sketch below).
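#A minimal sketch of the planned shelve layout, illustrative only: the real
#Posting class lives in posting.py (imported below), so the constructor
#arguments shown here are assumptions based on the note above.
#
#   with shelve.open("index.shelve") as index:
#       #hashmap: token -> list of postings for the documents containing it
#       postings = index.get("computer", [])
#       postings.append(Posting("data/DEV/site/page.json", 0.42))
#       index["computer"] = postings  #reassign so shelve writes it back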
#Data input
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re
#Posting and worker classes
from posting import Posting
from worker import Worker
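
#Sketch of the tokenize-and-stem step from the notes above. Illustrative
#only and not wired into the Worker pipeline; assumes the nltk "punkt"
#tokenizer data has been downloaded.
def _stem_tokens_sketch(text):
    stemmer = PorterStemmer()
    #Lowercase, tokenize, keep alphanumeric sequences only, then stem.
    return [stemmer.stem(token)
            for token in word_tokenize(text.lower())
            if token.isalnum()]
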
class Indexer():
    def __init__(self, restart):
        #Config stuff
        self.path = "data/DEV/"
        self.restart = restart

    def get_data(self):
        num_threads = 1
        threads = list()
        index = 0  #next pool slot to check when all workers are busy
        for directory in os.listdir(self.path):
            for file in os.listdir(self.path + "/" + directory + "/"):
                while True:
                    file_path = self.path + "/" + directory + "/" + file
                    if len(threads) < num_threads:
                        #Pool not full yet: start a new worker for this file.
                        thread = Worker(self, file_path)
                        threads.append(thread)
                        thread.start()
                        break
                    else:
                        if not threads[index].is_alive():
                            #Reuse a finished slot for the next file.
                            threads[index] = Worker(self, file_path)
                            threads[index].start()
                            break
                        else:
                            index = index + 1
                            if index >= num_threads:
                                #Checked every slot; rest briefly, then rescan.
                                index = 0
                                time.sleep(.1)
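
#The hand-rolled pool in get_data works, but the standard library's
#concurrent.futures offers the same fixed-size dispatch with less index
#bookkeeping. A sketch of the equivalent, assuming Worker.run does the
#per-file work (not used by this file):
#
#   from concurrent.futures import ThreadPoolExecutor
#
#   def get_data_pooled(indexer, num_threads=1):
#       with ThreadPoolExecutor(max_workers=num_threads) as pool:
#           for directory in os.listdir(indexer.path):
#               for file in os.listdir(indexer.path + "/" + directory + "/"):
#                   file_path = indexer.path + "/" + directory + "/" + file
#                   pool.submit(Worker(indexer, file_path).run)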
#Found 55770 documents

#Getting important tokens: text in bold, headings, and titles gets extra
#weight, per the notes at the top of the file.
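#Sketch for that important-token note. Illustrative only; the tag list and
#this helper are assumptions, not part of the Worker pipeline shown here.
def _important_text_sketch(html):
    soup = BeautifulSoup(html, "html.parser")
    #Gather text from bold, heading, and title tags in document order.
    chunks = [tag.get_text(separator=" ", strip=True)
              for tag in soup.find_all(["b", "strong", "h1", "h2", "h3", "title"])]
    return " ".join(chunks)
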
def main():
    indexer = Indexer(True)
    indexer.get_data()

if __name__ == "__main__":
    main()