59 lines
2.0 KiB
Python
59 lines
2.0 KiB
Python
from threading import Thread
|
|
import json
|
|
import os
|
|
import shelve
|
|
import sys
|
|
from bs4 import BeautifulSoup
|
|
from time import perf_counter
|
|
from nltk.stem import PorterStemmer
|
|
import nltk
|
|
import time
|
|
from posting import Posting
|
|
|
|
import re
|
|
|
|
# Inverted index built by this script: stemmed token -> list of Posting objects.
self_index = dict()
stemmer = PorterStemmer()

# Single crawled page used as the input document for this run.
target = 'data/DEV/aiclub_ics_uci_edu/8ef6d99d9f9264fc84514cdd2e680d35843785310331e1db4bbd06dd2b8eda9b.json'

# Use a context manager so the handle is closed as soon as the JSON is parsed
# (the name ``file_load`` stays bound, now to a closed file). JSON is UTF-8 by
# specification, so do not rely on the platform's default encoding.
with open(target, encoding="utf-8") as file_load:
    data = json.load(file_load)

# Document id is the file name without its directory and without ".json".
doc_id = target[target.rfind('/')+1:-5]
url = data['url']
soup = BeautifulSoup(data["content"], features="lxml")

# Gets a cleaner version of the text compared to soup.get_text().
clean_text = ' '.join(soup.stripped_strings)

# Removes every two-character sequence made of a whitespace character followed
# by a character that is neither a plain space nor a word character (e.g. a
# space followed by punctuation, or a newline followed by a tab).
# NOTE(review): both characters are dropped, so "a ,b" becomes "ab" -- confirm
# that merging the neighbouring words is intended.
clean_text = re.sub(r'\s[^ \w]', '', clean_text)

# Keeps only whitespace-separated tokens that are purely ASCII alphanumeric and
# joins them back into one string. The original author notes that having a
# single string here is essential for get_tf_idf (not visible in this file).
alnum_pattern = re.compile('[A-Za-z0-9]+')
clean_text = " ".join(
    [piece for piece in clean_text.split() if alnum_pattern.fullmatch(piece)]
)

# Stems every remaining token.
clean_text = " ".join([stemmer.stem(piece) for piece in clean_text.split()])

tokens = nltk.word_tokenize(clean_text)
|
|
|
|
# counter maps each token to a two-element list [count, positions]:
#   count     -- how many times the token occurs in this document
#   positions -- the indices into ``tokens`` at which it occurs, in order
counter = dict()
for position, word in enumerate(tokens):
    # setdefault creates the [count, positions] entry on first sight of a
    # token, so first and repeated occurrences share one code path.
    entry = counter.setdefault(word, [0, []])
    entry[0] += 1
    entry[1].append(position)

print(counter)
|
|
# Total number of tokens in the document; used to normalise raw counts.
doc_length = len(tokens)

# Fold this document's statistics into the inverted index: one Posting per
# distinct token, appended to that token's posting list (created on demand).
for term, (occurrences, positions) in counter.items():
    # Third argument is the count divided by the document length.
    # NOTE(review): the meaning of the literal 0 is defined by Posting, which
    # is not visible here -- confirm against posting.py.
    entry = Posting(doc_id, url, occurrences / doc_length, 0, positions)
    self_index.setdefault(term, []).append(entry)

# Dump every token followed by the string form of its posting list.
for term, posting_list in self_index.items():
    print(term + str(posting_list) + '\n')

# sys.getsizeof is shallow: it reports the dict object itself, not the keys,
# lists, or Posting objects it references.
print("The size of the dictionary is {} bytes".format(sys.getsizeof(self_index)))