Search_Engine/test.py


import json
import re
import sys

import nltk
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer

from posting import Posting
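
# Posting is defined in the project's posting.py (not shown here). Judging from
# how it is constructed below, the assumed interface is roughly:
#
#     class Posting:
#         def __init__(self, doc_id, url, tf, tf_idf, positions):
#             ...
#
# This is a sketch inferred from usage, not the actual definition.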
self_index = dict()
stemmer = PorterStemmer()
target = 'data/DEV/aiclub_ics_uci_edu/8ef6d99d9f9264fc84514cdd2e680d35843785310331e1db4bbd06dd2b8eda9b.json'
with open(target) as file_load:
    data = json.load(file_load)
# The file name without its .json extension serves as the document id
doc_id = target[target.rfind('/')+1:-5]
url = data['url']
soup = BeautifulSoup(data["content"],features="lxml")
# Gets cleaner text than soup.get_text() produces
clean_text = ' '.join(soup.stripped_strings)
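# For example, BeautifulSoup('<p> Hi </p><p>there</p>', 'lxml').stripped_strings
# yields 'Hi' and 'there', so clean_text becomes 'Hi there'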
# Strips stray punctuation left over from the join above: the regex matches a
# whitespace character followed by a character that is neither a space nor a
# word character (i.e. punctuation) and deletes the pair
clean_text = re.sub(r'\s[^ \w]', '', clean_text)
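# e.g. re.sub(r'\s[^ \w]', '', 'hello , world') == 'hello world'
# (the space and the comma that follows it are both dropped)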
# Keeps only purely alphanumeric tokens and joins them back into a single string.
# Keeping it as one string is essential for get_tf_idf to work as intended
clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
# Stems tokenized text
clean_text = " ".join([stemmer.stem(i) for i in clean_text.split()])
# nltk.word_tokenize requires NLTK's punkt tokenizer data (nltk.download('punkt'))
tokens = nltk.word_tokenize(clean_text)
# counter maps token -> [count, list of positions in the document]
counter = dict()
for i, word in enumerate(tokens):
    if word in counter:
        counter[word][0] += 1
        counter[word][1].append(i)
    else:
        counter[word] = [1, [i]]
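# For example, for tokens ['run', 'fast', 'run'] the counter would be:
#   {'run': [2, [0, 2]], 'fast': [1, [1]]}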
print(counter)
doc_length = len(tokens)
for index in counter:
    tf = counter[index][0] / doc_length
    if index in self_index:
        self_index[index].append(Posting(doc_id, url, tf, 0, counter[index][1]))
    else:
        self_index[index] = [Posting(doc_id, url, tf, 0, counter[index][1])]
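# tf here is the raw count normalized by document length: a token appearing
# 3 times in a 60-token document gets tf = 3/60 = 0.05. The fourth Posting
# argument (0) appears to be a tf-idf placeholder filled in later; that is an
# assumption, since posting.py is not shown.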
for index in self_index:
    print(index + str(self_index[index]) + '\n')
print("The size of the dictionary is {} bytes".format(sys.getsizeof(self_index)))