Search_Engine/indexer.py

#The indexer splits into four parts:
#1. Load the document files from disk (see get_data below).
#2. Tokenize the text into alphanumeric sequences.
#3. Stem each token (sketch after the imports).
#4. Treat text in bold, headings, and other titles as more important
#   (sketch before main).
#Index structure: a hashmap from token -> postings; each posting carries the
#tf-idf score and the name/id of the document the token was found in.
#We use shelves to hold the data on disk.
#Posting ---> source of file, tf-idf score. For now we only use these two
#fields; the posting will change as the indexer gets more complex
#(sketch below).
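#A minimal sketch of the planned shelve layout, illustrative only: the real
#Posting class lives in posting.py (imported below), so the constructor
#arguments shown here are assumptions based on the note above.
#
#   with shelve.open("index.shelve") as index:
#       #hashmap: token -> list of postings for the documents containing it
#       postings = index.get("computer", [])
#       postings.append(Posting("data/DEV/site/page.json", 0.42))
#       index["computer"] = postings  #reassign so shelve writes it back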
#Data input
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re
#Posting and worker classes
from posting import Posting
from worker import Worker
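
#Sketch of the tokenize-and-stem step from the notes above. Illustrative
#only and not wired into the Worker pipeline; assumes the nltk "punkt"
#tokenizer data has been downloaded.
def _stem_tokens_sketch(text):
    stemmer = PorterStemmer()
    #Lowercase, tokenize, keep alphanumeric sequences only, then stem.
    return [stemmer.stem(token)
            for token in word_tokenize(text.lower())
            if token.isalnum()]
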
class Indexer():
    def __init__(self, restart):
        #Config stuff
        self.path = "data/DEV/"
        self.restart = restart

    def get_data(self):
        num_threads = 1
        threads = list()
        index = 0  #next pool slot to check when all workers are busy
        for directory in os.listdir(self.path):
            for file in os.listdir(self.path + "/" + directory + "/"):
                while True:
                    file_path = self.path + "/" + directory + "/" + file
                    if len(threads) < num_threads:
                        #Pool not full yet: start a new worker for this file.
                        thread = Worker(self, file_path)
                        threads.append(thread)
                        thread.start()
                        break
                    else:
                        if not threads[index].is_alive():
                            #Reuse a finished slot for the next file.
                            threads[index] = Worker(self, file_path)
                            threads[index].start()
                            break
                        else:
                            index = index + 1
                            if index >= num_threads:
                                #Checked every slot; rest briefly, then rescan.
                                index = 0
                                time.sleep(.1)
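
#The hand-rolled pool in get_data works, but the standard library's
#concurrent.futures offers the same fixed-size dispatch with less index
#bookkeeping. A sketch of the equivalent, assuming Worker.run does the
#per-file work (not used by this file):
#
#   from concurrent.futures import ThreadPoolExecutor
#
#   def get_data_pooled(indexer, num_threads=1):
#       with ThreadPoolExecutor(max_workers=num_threads) as pool:
#           for directory in os.listdir(indexer.path):
#               for file in os.listdir(indexer.path + "/" + directory + "/"):
#                   file_path = indexer.path + "/" + directory + "/" + file
#                   pool.submit(Worker(indexer, file_path).run)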
#Found 55770 documents

#Getting important tokens: text in bold, headings, and titles gets extra
#weight, per the notes at the top of the file.
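#Sketch for that important-token note. Illustrative only; the tag list and
#this helper are assumptions, not part of the Worker pipeline shown here.
def _important_text_sketch(html):
    soup = BeautifulSoup(html, "html.parser")
    #Gather text from bold, heading, and title tags in document order.
    chunks = [tag.get_text(separator=" ", strip=True)
              for tag in soup.find_all(["b", "strong", "h1", "h2", "h3", "title"])]
    return " ".join(chunks)
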
def main():
    indexer = Indexer(True)
    indexer.get_data()

if __name__ == "__main__":
    main()