Same as previous push

This commit is contained in:
unknown 2022-05-27 13:12:15 -07:00
parent 60f6eb0df0
commit e325b9d810
27 changed files with 434 additions and 325 deletions

2
.gitignore vendored
View File

@@ -1,3 +1,5 @@
/data/
*.shelve
/__pycache__/
/test/
merged*

1
docs.weight Normal file

File diff suppressed because one or more lines are too long

View File

@@ -10,7 +10,6 @@
#Posting ---> Source of file, tf-idf score. #For now we will only use these two; as the posting gets more complex it will be changed accordingly
#Data input
import math
import json
import os
import shelve
@@ -18,7 +17,8 @@ from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle
from threading import Lock
import math
#Data process
@@ -34,235 +34,196 @@ import re
from posting import Posting
from worker import Worker
class Node():
index_value = ''
postings = list()
class Index():
length = 0
index = list()
class Indexer():
def __init__(self,restart,trimming):
def __init__(self,list_partials,weight,data_paths,worker_factory=Worker):
#Config stuffs
self.path = "D:/Visual Studio Workspace/CS121/assignment3/data/DEV/"
self.restart = restart
self.trimming = trimming
self.path = "data/DEV"
self.num_doc = 0
self.list_partials = list_partials
self.weight = weight
self.data_paths = data_paths
self.stemmer = PorterStemmer()
self.id = list()
# List that contains the normalization denominators before the square root is applied; the square root will be taken at query time
self.normalize = list()
self.data_paths_lock = Lock()
self.list_partials_lock = Lock()
#Shelves for index
#https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html
#https://www.irishtimes.com/news/science/how-many-numbers-begin-with-a-1-more-than-30-per-cent-1.4162466
#According to these, this is how we will split things
#Save #1 = ABCD + (1) ~ 18.3% of words
#Save #2 = EFGHIJK + (2-3)~ 27.1% of words
#Save #3 = LMNOPQ + (4-7) ~ 25.4% of words
#Save #4 = RSTUVWXYZ + (8-9)~ 29.2% of words
#Save #5 = Special characters
if os.path.exists("save_1.shelve") and restart:
os.remove("save_1.shelve")
if os.path.exists("save_2.shelve") and restart:
os.remove("save_2.shelve")
if os.path.exists("save_3.shelve") and restart:
os.remove("save_3.shelve")
if os.path.exists("save_4.shelve") and restart:
os.remove("save_4.shelve")
if os.path.exists("save_5.shelve") and restart:
os.remove("save_5.shelve")
self.workers = list()
self.worker_factory = worker_factory
self.save_1 = shelve.open("save_1.shelve")
self.save_1_lock = threading.Lock()
self.save_2 = shelve.open("save_2.shelve")
self.save_2_lock = threading.Lock()
self.save_3 = shelve.open("save_3.shelve")
self.save_3_lock = threading.Lock()
self.save_4 = shelve.open("save_4.shelve")
self.save_4_lock = threading.Lock()
self.save_5 = shelve.open("save_5.shelve")
self.save_5_lock = threading.Lock()
def start_async(self):
self.workers = [
self.worker_factory(worker_id,self)
for worker_id in range(8)]
for worker in self.workers:
worker.start()
print(len(list(self.save_1.keys())))
print(len(list(self.save_2.keys())))
print(len(list(self.save_3.keys())))
print(len(list(self.save_4.keys())))
print(len(list(self.save_5.keys())))
def start(self):
self.start_async()
self.join()
def get_url_id(self, url):
return self.id.index(url)
def save_index(self,word,posting):
cur_save = self.get_save_file(word)
lock = self.get_save_lock(word)
lock.acquire()
shelve_list = list()
try:
shelve_list = cur_save[word]
shelve_list.append(posting)
tic = perf_counter()
# Sort by url id to help with query search
shelve_list.sort(key=lambda x: x.url)
# shelve_list.sort(key=lambda x: x.tf_idf, reverse = True)
toc = perf_counter()
if toc - tic > 1:
print("Took " + str(toc - tic) + " seconds to sort shelve list!")
cur_save.sync()
lock.release()
except:
shelve_list.append(posting)
cur_save[word] = shelve_list
cur_save.sync()
lock.release()
def get_save_file(self,word):
#return the correct save depending on the starting letter of word
word_lower = word.lower()
if re.match(r"^[a-d0-1].*",word_lower):
return self.save_1
elif re.match(r"^[e-k2-3].*",word_lower):
return self.save_2
elif re.match(r"^[l-q4-7].*",word_lower):
return self.save_3
elif re.match(r"^[r-z8-9].*",word_lower):
return self.save_4
else:
print(word)
print("You have somehow went beyond the magic")
return self.save_5
def get_save_lock(self,word):
word_lower = word.lower()
if re.match(r"^[a-d0-1].*",word_lower):
return self.save_1_lock
elif re.match(r"^[e-k2-3].*",word_lower):
return self.save_2_lock
elif re.match(r"^[l-q4-7].*",word_lower):
return self.save_3_lock
elif re.match(r"^[r-z8-9].*",word_lower):
return self.save_4_lock
else:
print(word)
print("You have somehow gone beyond the magic")
return self.save_5_lock
# I have a test file (mytest.py) with pandas but couldn't figure out how to grab just a single cell,
# so I came up with this. If anyone knows how to get a single cell and can explain it to me,
# I would love to know; I think that method might be quicker (maybe, idk, it's like 4am).
# A possible approach is sketched after the mytest.py diff below.
# https://stackoverflow.com/questions/34449127/sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-documen
# removed parameter "word" since it wasn't used
# TODO: Add important words scaling
def get_tf_idf(self, words):
# words = [whole text] one element list
# return the score
try:
tfidf = TfidfVectorizer(ngram_range=(1,1)) # ngram_range is range of n-values for different n-grams to be extracted (1,3) gets unigrams, bigrams, trigrams
tfidf_matrix = tfidf.fit_transform(words) # fit trains the model, transform creates matrix
df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out()) # store value of matrix to associated word/n-gram
#return(df.iloc[0][''.join(word)]) #used for finding single word in dataset
data = df.to_dict() # transform dataframe to dict *could be expensive the larger the data gets, tested on ~1000 word doc and took 0.002 secs to run
return data # returns the dict of words/n-grams with tf-idf
#print(df) # debugging
except:
print("Error in tf_idf!")
return -1
def tf(self, text, url):
# tf
tokens = {}
split = text.split(" ")
# loop using index to keep track of position
for i in range(len(split)):
if split[i] not in tokens:
tokens[split[i]] = Posting(self.get_url_id(url), 1, i)
else:
tokens[split[i]].rtf += 1
tokens[split[i]].tf = (1 + math.log(tokens[split[i]].rtf))
tokens[split[i]].positions.append(i)
return tokens
# Does the idf part of the tfidf
def tfidf(self, current_save):
for token, postings in current_save.items():
for p in postings:
p.tfidf = p.tf * math.log(len(self.id)/len(postings))
self.normalize[p.url] += p.tfidf**2
def join(self):
for worker in self.workers:
worker.join()
def get_data(self):
def get_postings(self,index):
merged_index_index = open("merged_index.index" ,'r')
merged_index = open("merged_index.full",'r')
merged_index_index.seek(0,0)
json_value = merged_index_index.readline()
data = json.loads(json_value)
index_index = dict(data['index'])
to_seek = index_index[index]
merged_index.seek(to_seek,0)
json_value = merged_index.readline()
data = json.loads(json_value)
return data['postings']
num_threads = 8
threads = list()
def set_weight(self):
weight_file = open('docs.weight','w')
jsonStr =json.dumps(self.weight, default=lambda o: o.__dict__,sort_keys=False)
weight_file.write(jsonStr)
weight_file.close()
def get_weight(self,doc_id):
weight = open('docs.weight','r')
weight.seek(0,0)
json_value = weight.readline()
data = json.loads(json_value)
return data[doc_id]
def get_data_path(self):
for directory in os.listdir(self.path):
for file in os.listdir(self.path + "/" + directory + "/"):
#Actual files here
#JSON["url"] = url of crawled page, ignore fragments
#JSON["content"] = actual HTML
#JSON["encoding"] = ENCODING
index = 0
while True:
file_path = self.path + "" + directory + "/"+file
# Add url to id here so that there aren't any problems when the worker is multi-threaded
self.data_paths.append("data/DEV/" + directory + "/"+file)
self.num_doc = len(self.data_paths)
tic = perf_counter()
load = open(file_path)
data = json.load(load)
if data["url"] not in self.id:
self.id.append(data["url"])
toc = perf_counter()
print("Took " + str(toc - tic) + " seconds to save url to self.id")
def get_next_file(self):
self.data_paths_lock.acquire()
try:
holder = self.data_paths.pop()
self.data_paths_lock.release()
return holder
except IndexError:
self.data_paths_lock.release()
return None
if len(threads) < num_threads:
thread = Worker(self,file_path)
threads.append(thread)
thread.start()
break
else:
if not threads[index].is_alive():
threads[index] = Worker(self,file_path)
threads[index].start()
break
else:
index = index + 1
if(index >= num_threads):
index = 0
time.sleep(.1)
# Make a list the size of the corpus to keep track of document scores
self.normalize = [0] * len(self.id)
def add_partial_index(self,partial_index):
self.list_partials_lock.acquire()
self.list_partials.append(partial_index)
self.list_partials_lock.release()
# These last few function calls calculate idf and finalize the tf-idf weighting for each index
self.tfidf(self.save_1)
self.tfidf(self.save_2)
self.tfidf(self.save_3)
self.tfidf(self.save_4)
self.tfidf(self.save_5)
# Creates a pickle file that is a list of urls where the index of the url is the id that the posting refers to.
p = os.path.dirname(os.path.abspath(__file__))
my_filename = os.path.join(p, "urlID.pkl")
if os.path.exists(my_filename):
os.remove(my_filename)
# Creates file and closes it
f = open(my_filename, "wb")
pickle.dump(self.id, f)
f.close()
# Creates a pickle file that will contain the denominator (before the square root) for normalizing wt
p = os.path.dirname(os.path.abspath(__file__))
my_filename = os.path.join(p, "normalize.pkl")
if os.path.exists(my_filename):
os.remove(my_filename)
# Creates file and closes it
f = open(my_filename, "wb")
pickle.dump(self.normalize, f)
f.close()
#Found 55770 documents
#
#getting important tokens
def merge(self):
partial_files = list()
partial_index_files = list()
parital_index_indices = list()
num_indices = len(self.list_partials)
#Full Index.Index and Length
full_index = Index()
full_index.index = list()
full_index.length = 0
for partial_index in self.list_partials:
file = open("temp/" + partial_index+'.partial','r')
partial_files.append(file)
index = open("temp/" + partial_index+'.index','r')
partial_index_files.append(index)
for partial_index_file in partial_index_files:
partial_index_file.seek(0,0)
parital_index_indices.append(json.loads(partial_index_file.readline()))
#Start all indexes at 0
for partial_file in partial_files:
partial_file.seek(0,0)
pointers = [0]*num_indices
merged_index = open("merged_index.full",'w')
merged_index_index = open("merged_index.index" ,'w')
while(True):
#Get all values from all indices to find min
value = None
values = list()
for i in range(num_indices):
if pointers[i] < parital_index_indices[i]['length']:
values.append(parital_index_indices[i]['index'][pointers[i]][0])
if(len(values) == 0):
break
value = min(values)
#Get data from the min value of all indices; if it exists, save it to the merged index
if value == None:
print("I have somehow crashed by not getting a min value")
break
node = Node()
node.index_value = value
for i in range(num_indices):
if pointers[i] < parital_index_indices[i]['length'] and parital_index_indices[i]['index'][pointers[i]][0] == value:
to_seek = parital_index_indices[i]['index'][pointers[i]][1]
partial_files[i].seek(to_seek,0)
json_value = partial_files[i].readline()
temp_node = json.loads(json_value)
node.postings = node.postings + temp_node['postings']
pointers[i] = pointers[i] + 1
#Update postings here with tf*idf, where idf = log(n/df(t))
node.postings.sort(key=lambda y:y['doc_id'])
for posting in node.postings:
posting['tf_idf'] = posting['tf_raw']*math.log(self.num_doc/len(node.postings))
full_index.index.append((value,merged_index.tell()))
full_index.length = full_index.length + 1
jsonStr = json.dumps(node,default=lambda o: o.__dict__,sort_keys=False)
merged_index.write(jsonStr + '\n')
full_index.index.sort(key=lambda y:y[0])
jsonStr =json.dumps(full_index, default=lambda o: o.__dict__,sort_keys=False)
merged_index_index.write(jsonStr)
for partial_index in self.list_partials:
os.remove("temp/" + partial_index+'.partial')
os.remove("temp/" + partial_index+'.index')
merged_index_index.close()
merged_index.close()
def main():
indexer = Indexer(True,0)
indexer.get_data()
indexer = Indexer(list(),dict(),list())
indexer.get_data_path()
print("We have " + str(len(indexer.data_paths)) + " documents to go through !" )
indexer.start()
indexer.merge()
print("Finished merging into 1 big happy family")
indexer.set_weight()
tic = time.perf_counter()
indexer.get_postings('artifici')
toc = time.perf_counter()
print(f"Took {toc - tic:0.4f} seconds to get postings for artifici")
tic = time.perf_counter()
indexer.get_weight('00ba3af6a00b7cfb4928e5d234342c5dc46b4e31714d4a8f315a2dd4d8e49860')
toc = time.perf_counter()
print(f"Took {toc - tic:0.4f} seconds to get weight for some random page")
if __name__ == "__main__":
main()
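get_postings() above resolves a token by reading merged_index.index (a single JSON object mapping each token to a byte offset) and then seeking straight to that offset in the JSON-lines file merged_index.full. A minimal standalone sketch of the same lookup, assuming the two files produced by merge() already exist and leaving out error handling:

import json

def lookup_postings(token, index_path="merged_index.index", full_path="merged_index.full"):
    # The .index file holds one JSON object: {"length": N, "index": [[token, byte_offset], ...]}
    with open(index_path, 'r') as f:
        offsets = dict(json.load(f)['index'])
    if token not in offsets:
        return []
    # Seek directly to the token's line in the JSON-lines .full file and parse only that line
    with open(full_path, 'r') as f:
        f.seek(offsets[token], 0)
        node = json.loads(f.readline())
    return node['postings']

Loading the offset dict once and caching it, rather than re-reading it on every call as get_postings does, would be the natural next step for query-time use.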

View File

@@ -4,6 +4,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
#tf_idf
#words = whole text
#word = the word we are finding the score for
@@ -19,13 +20,12 @@ words = ['this is the first document '
doc1 = ["I can't fucking take it any more. Among Us has singlehandedly ruined my life. The other day my teacher was teaching us Greek Mythology and he mentioned a pegasus and I immediately thought 'Pegasus? more like Mega Sus!!!!' and I've never wanted to kms more. I can't look at a vent without breaking down and fucking crying. I can't eat pasta without thinking 'IMPASTA??? THATS PRETTY SUS!!!!' Skit 4 by Kanye West. The lyrics ruined me. A Mongoose, or the 25th island of greece. The scientific name for pig. I can't fucking take it anymore. Please fucking end my suffering."]
doc2 = ["Anyways, um... I bought a whole bunch of shungite rocks, do you know what shungite is? Anybody know what shungite is? No, not Suge Knight, I think he's locked up in prison. I'm talkin' shungite. Anyways, it's a two billion year-old like, rock stone that protects against frequencies and unwanted frequencies that may be traveling in the air. That's my story, I bought a whole bunch of stuff. Put 'em around the la casa. Little pyramids, stuff like that."]
word = 'life'
try:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(doc1)
tfidf = TfidfVectorizer(ngram_range=(3,3)) # ngram_range is range of n-values for different n-grams to be extracted (1,3) gets unigrams, bigrams, trigrams
tfidf_matrix = tfidf.fit_transform(words)
df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out())
print(df.iloc[0][''.join(word)])
#print(df)
#print(df.iloc[0][''.join(word)])
data = df.to_dict()
except KeyError: # word does not exist
print(-1)
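The indexer comment earlier asks how to pull a single value out of the tf-idf DataFrame that this script builds. One way is pandas' scalar accessor .at; a minimal sketch with an invented document and term (not the author's data):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the quick brown fox jumps over the lazy dog"]  # hypothetical one-document corpus
term = "fox"                                            # hypothetical term to look up

tfidf = TfidfVectorizer()
df = pd.DataFrame(tfidf.fit_transform(docs).toarray(), columns=tfidf.get_feature_names_out())

# .at[row, column] returns a single scalar without converting the whole frame to a dict
score = df.at[0, term] if term in df.columns else 0.0
print(score)

df.loc[0, term] works the same way; .at is simply the scalar-only fast path, and the membership check replaces the KeyError handling above.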

View File

@@ -1,9 +1,17 @@
#Posting class for indexer, will probably be more complex as we keep adding crap to it
class Posting():
def __init__(self, url, rtf, position):
def __init__(self,doc_id,url,tf_raw,tf_idf,positionals):
self.doc_id = doc_id
self.url = url
self.rtf = rtf
self.tf = 0
self.tfidf = 0
self.positions = [position]
self.tf_raw = tf_raw
self.tf_idf = tf_idf
self.positionals = positionals
def __repr__(self):
return "Doc_id:" + str(self.doc_id) + " URL:" + self.url + " tf_raw:" + str(self.tf_raw) + " tf_idf:" + str(self.tf_idf) + " positionals:" + str(self.positionals)
def __str__(self):
return "Doc_id:" + str(self.doc_id) + " URL:" + self.url + " tf_raw:" + str(self.tf_raw) + " tf_idf:" + str(self.tf_idf) + " positionals:" + str(self.positionals)
def comparator(self):
#Some custom comparator for sorting postings later
pass
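A small usage sketch of the new Posting signature, mirroring how worker.py and the merge step serialize postings with json.dumps(default=lambda o: o.__dict__); the doc_id, url, and positions below are invented for illustration:

import json
from posting import Posting

# Hypothetical values: a document id, its url, a raw term frequency, an unset tf-idf, and term positions
p = Posting("00ba3af6", "https://example.edu/page", 0.02, 0, [4, 17, 93])

# Same serialization trick used elsewhere in the indexer: dump the instance's __dict__ as JSON
print(json.dumps(p, default=lambda o: o.__dict__, sort_keys=False))
# {"doc_id": "00ba3af6", "url": "https://example.edu/page", "tf_raw": 0.02, "tf_idf": 0, "positionals": [4, 17, 93]}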

View File

@@ -1,18 +0,0 @@
#Multiple implementations of stemming here please
class Stemmer():
def __init__(self,mode, data):
#Different type of stemmer = different modes
self.mode = mode
self.data = data
def stem(self):
#Do stuff here
if(self.mode == 0):
#Do stemmer 1
return #stemmed data
#....
#def ... #name of stemmer 1
#def ... #name of stemmer 2

View File

@@ -1,2 +0,0 @@
for postings in l_posting:

26
test.py
View File

@@ -1,17 +1,13 @@
import re
from threading import Thread
import json
import os
import shelve
import sys
from bs4 import BeautifulSoup
from time import perf_counter
from nltk.stem import PorterStemmer
import nltk
import time
from posting import Posting
for i in range(99):
word_lower = chr(i % 26 + 97) + chr(i % 26 + 97 + 1)
print(word_lower)
if re.match(r"^[a-d1-1].*",word_lower):
print("SAVE 1")
elif re.match(r"^[e-k2-3].*",word_lower):
print("SAVE 2")
elif re.match(r"^[l-q4-7].*",word_lower):
print("SAVE 3")
elif re.match(r"^[r-z8-9].*",word_lower):
print("SAVE 4")
path = "data/DEV/"
print(os.listdir(path))
import re

View File

@@ -1,28 +0,0 @@
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import threading
import pickle
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from porter2stemmer import Porter2Stemmer
import re
save_1 = shelve.open("save_1.shelve")
save_2 = shelve.open("save_2.shelve")
save_3 = shelve.open("save_3.shelve")
save_4 = shelve.open("save_4.shelve")
save_5 = shelve.open("save_5.shelve")
key = list(save_1.keys())
print(key)

116
test_merge.py Normal file
View File

@@ -0,0 +1,116 @@
import json
from posting import Posting
import math
import sys
import random
from nltk.corpus import words
random_list = [1,2,3,4,5,6,7,8,9,10]
test_data = words.words()
random.shuffle(test_data)
def random_posting(id):
# Posting now takes (doc_id, url, tf_raw, tf_idf, positionals); pass a placeholder url so the call matches the new signature
return Posting(id, "url_" + str(id), random.choice(random_list), random.choice(random_list),
[random.choice(random_list) for _ in range(8)])
class Node():
index_value = 'Something'
postings = list()
class Index():
length = 0
index = list()
def random_partial_index(name):
part_index = Index()
part_index.index = list()
part_index.length = 0
with open(name +'.partial', 'w') as f:
for i in range(1000):
node1 = Node()
node1.index_value = random.choice(test_data).lower()
node1.postings = list()
for i in range(10):
node1.postings.append(random_posting(i))
jsonStr = json.dumps(node1, default=lambda o: o.__dict__,sort_keys=False)
part_index.index.append((node1.index_value,f.tell()))
f.write(jsonStr + '\n')
part_index.length = part_index.length + 1
part_index.index.sort(key=lambda y:y[0])
jsonStr =json.dumps(part_index, default=lambda o: o.__dict__,sort_keys=False)
with open(name + '.index','w') as f:
f.write(jsonStr)
def merge(partial_indices):
partial_files = list()
partial_index_files = list()
parital_index_indices = list()
merged_index = open("merged_index.full",'w')
num_indices = len(partial_indices)
#Full Index.Index and Length
full_index = Index()
full_index.index = list()
full_index.length = 0
for partial_index in partial_indices:
file = open(partial_index+'.partial','r')
partial_files.append(file)
index = open(partial_index+'.index','r')
partial_index_files.append(index)
for partial_index_file in partial_index_files:
partial_index_file.seek(0,0)
parital_index_indices.append(json.loads(partial_index_file.readline()))
#Start all indexes at 0
for partial_file in partial_files:
partial_file.seek(0,0)
pointers = [0]*num_indices
while(True):
#Get all values from all indices to find min
value = None
values = list()
for i in range(num_indices):
if pointers[i] < parital_index_indices[i]['length']:
values.append(parital_index_indices[i]['index'][pointers[i]][0])
if(len(values) == 0):
break
value = min(values)
#Get data from the min value of all indices; if it exists, save it to the merged index
if value == None:
print("I have somehow crashed by not getting a min value")
break
node = Node()
node.index_value = value
for i in range(num_indices):
if pointers[i] < parital_index_indices[i]['length'] and parital_index_indices[i]['index'][pointers[i]][0] == value:
to_seek = parital_index_indices[i]['index'][pointers[i]][1]
partial_files[i].seek(to_seek,0)
json_value = partial_files[i].readline()
temp_node = json.loads(json_value)
node.postings = node.postings + temp_node['postings']
pointers[i] = pointers[i] + 1
node.postings.sort(key=lambda y:y['doc_id'])
full_index.index.append((value,merged_index.tell()))
full_index.length = full_index.length + 1
jsonStr = json.dumps(node,default=lambda o: o.__dict__,sort_keys=False)
merged_index.write(jsonStr + '\n')
full_index.index.sort(key=lambda y:y[0])
jsonStr =json.dumps(full_index, default=lambda o: o.__dict__,sort_keys=False)
with open("merged_index.index" ,'w') as f:
f.write(jsonStr)
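test_merge.py defines the random partial-index generator and the merge but never invokes them; a possible driver sketch, assuming the nltk words corpus has been downloaded (nltk.download('words')) and that writing the partial files into the working directory is acceptable:

if __name__ == "__main__":
    # Build a few random partial indices on disk, then k-way merge them
    names = ["test_part_0", "test_part_1", "test_part_2"]  # hypothetical file-name stems
    for name in names:
        random_partial_index(name)
    merge(names)
    print("wrote merged_index.full and merged_index.index")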

BIN
urlID.pkl

Binary file not shown.

155
worker.py
View File

@@ -1,64 +1,137 @@
from threading import Thread
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import time
import pickle
from bs4 import BeautifulSoup
import re
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from collections import Counter
from posting import Posting
import math
import sys
class Node():
index_value = ''
postings = list()
class Index():
length = 0
index = list()
class Worker(Thread):
def __init__(self,indexer,target):
self.file = target
def __init__(self,worker_id,indexer):
self.indexer = indexer
self.stemmer = PorterStemmer()
self.worker_id = worker_id
self.num_partial = 0
self.index = dict()
super().__init__(daemon=True)
def dump(self):
part_index = Index()
part_index.length = 0
part_index.index = list()
cur_partial_index_str = "temp/" + str(self.worker_id) + "_" + str(self.num_partial) + '.partial'
cur_partial_index_index_str = "temp/" + str(self.worker_id) + "_" + str(self.num_partial) + '.index'
cur_partial_index = open(cur_partial_index_str,'w')
cur_partial_index_index = open(cur_partial_index_index_str,'w')
for key in self.index:
node = Node()
node.index_value = key
node.postings = self.index[key]
jsonStr = json.dumps(node, default=lambda o: o.__dict__,sort_keys=False)
part_index.index.append((node.index_value,cur_partial_index.tell()))
cur_partial_index.write(jsonStr + '\n')
part_index.length = part_index.length + 1
part_index.index.sort(key=lambda y:y[0])
jsonStr =json.dumps(part_index, default=lambda o: o.__dict__,sort_keys=False)
cur_partial_index_index.write(jsonStr)
self.indexer.add_partial_index(str(self.worker_id) + "_" + str(self.num_partial))
self.num_partial = self.num_partial + 1
self.index.clear()
def run(self):
print("Target: " + str(self.file))
ticker = perf_counter()
file_load = open(self.file)
data = json.load(file_load)
soup = BeautifulSoup(data["content"],features="lxml")
# Gets a cleaner version of the text compared to soup.get_text()
tic = perf_counter()
clean_text = ' '.join(soup.stripped_strings)
# Looks for large white space, tabbed space, and other forms of spacing and removes it
# The regex matches a whitespace character followed by a non-space, non-word character
clean_text = re.sub(r'\s[^ \w]', '', clean_text)
# Tokenizes text and joins it back into an entire string. Keeping it as one string is essential for get_tf_idf to work as intended
clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
# Stems tokenized text
clean_text = " ".join([self.indexer.stemmer.stem(i) for i in clean_text.split()])
# Put clean_text as an element in a list because get_tf_idf works properly with single-element lists
x = [clean_text]
toc = perf_counter()
print("Took " + str(toc - tic) + " seconds to create clean text")
# ngrams is a dict
# structure looks like {ngram : {0: tf-idf score}}
ngrams = self.indexer.get_tf_idf(x)
if ngrams != -1:
tic = perf_counter()
for ngram, posting in ngrams.items():
self.indexer.save_index(ngram, posting)
toc = perf_counter()
print("Took " + str(toc - tic) + " seconds to save ngram")
while True:
target = self.indexer.get_next_file()
if not target:
self.dump()
print("Worker " + str(self.worker_id) + " died")
break
file_load = open(target)
data = json.load(file_load)
soup = BeautifulSoup(data["content"],features="lxml")
doc_id = target[target.rfind('/')+1:-5]
url = data['url']
print("Worker " + str(self.worker_id) + " working on " + url)
important = {'b' : [], 'h1' : [], 'h2' : [], 'h3' : [], 'title' : []}
for key_words in important.keys():
for i in soup.findAll(key_words):
for word in word_tokenize(i.text):
important[key_words].append(self.stemmer.stem(word))
# Gets a cleaner version of the text compared to soup.get_text()
clean_text = ' '.join(soup.stripped_strings)
# Looks for large white space, tabbed space, and other forms of spacing and removes it
# The regex matches a whitespace character followed by a non-space, non-word character
clean_text = re.sub(r'\s[^ \w]', '', clean_text)
# Tokenizes text and joins it back into an entire string before stemming
clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
# Stems tokenized text
clean_text = " ".join([self.stemmer.stem(i) for i in clean_text.split()])
# Tokenize the cleaned, stemmed text
tokens = word_tokenize(clean_text)
# counter maps word -> [count, list of positions]
counter = dict()
# We calculate tf_raw and the positionals here
for i in range(len(tokens)):
word = tokens[i]
if word in counter:
counter[word][0] = counter[word][0] + 1
counter[word][1].append(i)
else:
counter[word] = [1,list()]
counter[word][1].append(i)
doc_length = len(tokens)
total = 0
for index in counter:
tf = counter[index][0]/doc_length
log_tf = 1 + math.log(tf)
total = total + log_tf * log_tf
if index in self.index:
postings = self.index[index]
postings.append(Posting(doc_id,url,tf,0,counter[index][1]))
else:
self.index[index] = list()
self.index[index].append(Posting(doc_id,url,tf,0,counter[index][1]))
self.index[index].sort(key=lambda y:y.doc_id)
self.indexer.weight[doc_id] = math.sqrt(total)
# Roughly a 10 Megabyte index in RAM: dump once sys.getsizeof(self.index) passes 1 MB (getsizeof does not count the postings themselves)
if sys.getsizeof(self.index) > 1000000:
self.dump()
tocker = perf_counter()
print("Took " + str(tocker - ticker) + " seconds to work")