# Search_Engine/test.py

from threading import Thread
import json
import os
import shelve
import sys
from bs4 import BeautifulSoup
from time import perf_counter
from nltk.stem import PorterStemmer
import nltk
import time
from posting import Posting

import re

# In-memory inverted index: token -> list of Posting objects.
self_index = dict()
stemmer = PorterStemmer()
target = 'data/DEV/aiclub_ics_uci_edu/8ef6d99d9f9264fc84514cdd2e680d35843785310331e1db4bbd06dd2b8eda9b.json'
# Parse the crawled page inside a context manager so the file handle is
# released as soon as the JSON has been read.
with open(target) as file_load:
	data = json.load(file_load)
# Document id = file name without its directory prefix and the ".json" suffix.
doc_id = target[target.rfind('/')+1:-5]
url = data['url']
soup = BeautifulSoup(data["content"],features="lxml")
# Collect the page's visible strings (already stripped) into one space-separated string.
page_text = ' '.join(soup.stripped_strings)
# Delete every whitespace character that is immediately followed by a character
# that is neither a space nor a word character, along with that character.
page_text = re.sub(r'\s[^ \w]', '', page_text)
# Keep only the whitespace-separated pieces made up entirely of ASCII letters and digits.
alnum_words = [piece for piece in page_text.split() if re.fullmatch('[A-Za-z0-9]+', piece)]
# Stem each surviving word, then rebuild a single string for the tokenizer.
stemmed_words = [stemmer.stem(piece) for piece in alnum_words]
clean_text = " ".join(stemmed_words)

tokens = nltk.word_tokenize(clean_text)

# counter maps each token to a two-element list: [occurrence count, positions],
# where positions are the token's indexes within `tokens`.

counter = dict()
for position, token in enumerate(tokens):
	# First sighting creates a fresh [0, []] entry; later sightings reuse it.
	entry = counter.setdefault(token, [0, list()])
	entry[0] = entry[0] + 1
	entry[1].append(position)
print(counter)
doc_length = len(tokens)
# Add one Posting per distinct token of this document to the index.
for term, (occurrences, positions) in counter.items():
	# The third argument is the token's count divided by the document length.
	# NOTE(review): the fourth argument is a literal 0; its meaning depends on
	# Posting's signature, which is not visible in this file.
	self_index.setdefault(term, list()).append(
		Posting(doc_id, url, occurrences/doc_length, 0, positions)
	)

# Dump every term followed by the string form of its posting list.
for term, term_postings in self_index.items():
	print(term + str(term_postings) + '\n')

# sys.getsizeof is shallow: it measures the dict object itself, not the
# keys, lists or Posting objects it references.
index_size = sys.getsizeof(self_index)
print("The size of the dictionary is {} bytes".format(index_size))