diff --git a/indexer.py b/indexer.py
index acaecb0..9ce39d1 100644
--- a/indexer.py
+++ b/indexer.py
@@ -17,6 +17,7 @@ from bs4 import BeautifulSoup
 from time import perf_counter
 import time
 import threading
+import pickle
 
 #Data process
@@ -36,11 +37,26 @@ from worker import Worker
 class Indexer():
     def __init__(self,restart,trimming):
         #Config stuffs
-        self.path = "data/DEV/"
+        self.path = "D:/Visual Studio Workspace/CS121/assignment3/data/DEV/"
         self.restart = restart
         self.trimming = trimming
         self.stemmer = PorterStemmer()
+        self.id = list()
+        # Creates a pickle file holding a list of urls; the index of a url in that list is the id that each posting refers to.
+        p = os.path.dirname(os.path.abspath(__file__))
+        my_filename = os.path.join(p, "urlID.pkl")
+        if os.path.exists(my_filename):
+            os.remove(my_filename)
+
+        # Creates the file with an empty list and closes it
+        self.f = open(my_filename, "wb")
+        pickle.dump(self.id, self.f)
+        self.f.close()
+
+        # Reopens it for the entire duration of the indexer so the workers can use it
+        self.f = open(my_filename, "rb+")
+
         #Shelves for index
         #https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html
         #https://www.irishtimes.com/news/science/how-many-numbers-begin-with-a-1-more-than-30-per-cent-1.4162466
@@ -79,6 +95,9 @@ class Indexer():
         print(len(list(self.save_4.keys())))
         print(len(list(self.save_5.keys())))
 
+    def get_url_id(self, url):
+        return self.id.index(url)
+
     def save_index(self,word,posting):
         cur_save = self.get_save_file(word)
         lock = self.get_save_lock(word)
@@ -88,7 +107,9 @@ class Indexer():
             shelve_list = cur_save[word]
             shelve_list.append(posting)
             tic = perf_counter()
-            shelve_list.sort(key=lambda x: x.tf_idf, reverse = True)
+            # Sort by url id to help with query search
+            shelve_list.sort(key=lambda x: x.url)
+            # shelve_list.sort(key=lambda x: x.tf_idf, reverse = True)
             toc = perf_counter()
             if toc - tic > 1 :
                 print("Took " + str(toc - tic) + "seconds to sort shelve list !")
@@ -137,33 +158,22 @@ class Indexer():
     # 4am
     # https://stackoverflow.com/questions/34449127/sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-documen
 
-    # Andy: added paramenter imporant_words in order to do multiplication of score
-    def get_tf_idf(self,words,word, important_words):
-        #tf_idf
-        #words = whole text
-        #word the word we finding the score for
-        #return the score
+    # Removed parameter "word" since it wasn't used
+    # TODO: Add important-words scaling
+    def get_tf_idf(self, words):
+        # words = [whole text], a one-element list
+        # returns a dict of n-gram -> tf-idf score
         try:
-            tfidf = TfidfVectorizer()
-            tfidf_matrix = tfidf.fit_transform(words)
-            df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out())
-            score = df.iloc[0][''.join(word)]
-            for k,v in important_words.items():
-                if k == 'b' and word in v:
-                    score = score * 1.2
-                elif k == 'h1' and word in v:
-                    score = score * 1.75
-                elif k == 'h2' and word in v:
-                    score = score * 1.5
-                elif k == 'h3' and word in v:
-                    score = score * 1.2
-                elif k == 'title' and word in v:
-                    score = score * 2
-            return(score)
-            #print(df)
-        except KeyError:
-            return -1
-
+            tfidf = TfidfVectorizer(ngram_range=(1,3)) # ngram_range is the range of n-values for the n-grams to extract; (1,3) gets unigrams, bigrams, and trigrams
+            tfidf_matrix = tfidf.fit_transform(words) # fit learns the vocabulary, transform builds the matrix
+            df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out()) # one column per word/n-gram, holding its tf-idf value
+            #return(df.iloc[0][''.join(word)]) # used for finding a single word in the dataset
+            data = df.to_dict() # transform the dataframe to a dict; could be expensive the larger the data gets, tested on a ~1000 word doc and it took 0.002 secs to run
+            return data # returns the dict of words/n-grams with their tf-idf scores
+            #print(df) # debugging
+        except Exception:
+            print("Error in tf_idf!")
+            return
 
     def get_data(self):
@@ -179,6 +189,11 @@ class Indexer():
                 index = 0
                 while True:
                     file_path = self.path + "" + directory + "/"+file
+                    # Add the url to the id list here so that there aren't any problems when the workers are multi-threaded
+                    load = open(file_path)
+                    data = json.load(load)
+                    if data["url"] not in self.id:
+                        self.id.append(data["url"])
                     if len(threads) < num_threads:
                         thread = Worker(self,file_path)
                         threads.append(thread)
@@ -194,7 +209,8 @@ class Indexer():
                     if(index >= num_threads):
                         index = 0
                     time.sleep(.1)
-
+        pickle.dump(self.id, self.f)
+        # TODO: should self.f be closed here?
         #Found 55770 documents
         #
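For reference, a minimal standalone sketch of the shape the reworked get_tf_idf hands back to the worker (the {ngram: {0: score}} dict mentioned in the worker.py comments). The sample document string is made up purely for illustration; the real input is the cleaned, stemmed page text wrapped in a one-element list.

    from sklearn.feature_extraction.text import TfidfVectorizer
    import pandas as pd

    docs = ["master of softwar engin master of softwar"]   # hypothetical one-element list, like x = [clean_text]
    tfidf = TfidfVectorizer(ngram_range=(1, 3))             # unigrams, bigrams and trigrams
    matrix = tfidf.fit_transform(docs)
    df = pd.DataFrame(matrix.toarray(), columns=tfidf.get_feature_names_out())
    data = df.to_dict()
    # data looks like {'master': {0: <score>}, 'master of': {0: <score>}, 'of softwar engin': {0: <score>}, ...}
    # so data[ngram][0] is the tf-idf score of that n-gram for this single document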
diff --git a/save_1.shelve.bak b/save_1.shelve.bak
new file mode 100644
index 0000000..e69de29
diff --git a/save_1.shelve.dat b/save_1.shelve.dat
new file mode 100644
index 0000000..e69de29
diff --git a/save_1.shelve.dir b/save_1.shelve.dir
new file mode 100644
index 0000000..e69de29
diff --git a/save_2.shelve.bak b/save_2.shelve.bak
new file mode 100644
index 0000000..e69de29
diff --git a/save_2.shelve.dat b/save_2.shelve.dat
new file mode 100644
index 0000000..e69de29
diff --git a/save_2.shelve.dir b/save_2.shelve.dir
new file mode 100644
index 0000000..e69de29
diff --git a/save_3.shelve.bak b/save_3.shelve.bak
new file mode 100644
index 0000000..e69de29
diff --git a/save_3.shelve.dat b/save_3.shelve.dat
new file mode 100644
index 0000000..e69de29
diff --git a/save_3.shelve.dir b/save_3.shelve.dir
new file mode 100644
index 0000000..e69de29
diff --git a/save_4.shelve.bak b/save_4.shelve.bak
new file mode 100644
index 0000000..e69de29
diff --git a/save_4.shelve.dat b/save_4.shelve.dat
new file mode 100644
index 0000000..e69de29
diff --git a/save_4.shelve.dir b/save_4.shelve.dir
new file mode 100644
index 0000000..e69de29
diff --git a/save_5.shelve.bak b/save_5.shelve.bak
new file mode 100644
index 0000000..e69de29
diff --git a/save_5.shelve.dat b/save_5.shelve.dat
new file mode 100644
index 0000000..e69de29
diff --git a/save_5.shelve.dir b/save_5.shelve.dir
new file mode 100644
index 0000000..e69de29
diff --git a/search.py b/search.py
new file mode 100644
index 0000000..d3e9d28
--- /dev/null
+++ b/search.py
@@ -0,0 +1,63 @@
+#Data input
+import json
+import os
+import shelve
+from bs4 import BeautifulSoup
+from time import perf_counter
+import time
+import threading
+
+
+#Data process
+from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+from sklearn.feature_extraction.text import TfidfVectorizer
+import pandas as pd
+import numpy as np
+
+import re
+
+#Logging postings
+from posting import Posting
+from worker import Worker
+
+class Search():
+
+    def __init__(self):
+        self.save_1 = shelve.open("save_1.shelve")
+        self.save_2 = shelve.open("save_2.shelve")
+        self.save_3 = shelve.open("save_3.shelve")
+        self.save_4 = shelve.open("save_4.shelve")
+        self.save_5 = shelve.open("save_5.shelve")
+
+    def get_save_file(self, word):
+        word_lower = word.lower()
+
+        if re.match(r"^[a-d0-1].*", word_lower):
+            return self.save_1
+        elif re.match(r"^[e-k2-3].*", word_lower):
+            return self.save_2
+        elif re.match(r"^[l-q4-7].*", word_lower):
+            return self.save_3
+        elif re.match(r"^[r-z8-9].*", word_lower):
+            return self.save_4
+        else:
+            return self.save_5
+
+    def get_userinput(self):
+        return
+
+    def get_tf_idf(self, words):
+        # TODO: unfinished stub, meant to mirror Indexer.get_tf_idf
+        try:
+            tfidf = TfidfVectorizer(ngram_range=(1,3))
+        except Exception:
+            return
+
+    def search(self, query):
+        # TODO: split the query into terms and fetch each term's save file
+        x = [query]
+
+        file = self.get_save_file(query)
+
+
+
+
+
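search() above is still a stub. One way it could use the posting lists, given that save_index now keeps each list sorted by url id, is a linear merge intersection across the query terms. This is only a sketch under the assumption that each Posting carries an integer url field (the id) and a tf_idf field; merge_intersect is a hypothetical helper, not part of this diff.

    def merge_intersect(postings_a, postings_b):
        # Both lists are assumed sorted ascending by the integer url id,
        # so one linear pass finds the documents containing both terms.
        i, j, common = 0, 0, []
        while i < len(postings_a) and j < len(postings_b):
            if postings_a[i].url == postings_b[j].url:
                common.append(postings_a[i])
                i += 1
                j += 1
            elif postings_a[i].url < postings_b[j].url:
                i += 1
            else:
                j += 1
        return common

A multi-word query would then stem each term, pull its list from the right shelve via get_save_file(term), and fold merge_intersect over the lists before ranking the surviving documents by tf-idf.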
diff --git a/test1.py b/test1.py
new file mode 100644
index 0000000..85c4eb5
--- /dev/null
+++ b/test1.py
@@ -0,0 +1,28 @@
+import json
+import os
+import shelve
+from bs4 import BeautifulSoup
+from time import perf_counter
+import time
+import threading
+import pickle
+
+
+#Data process
+from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+from sklearn.feature_extraction.text import TfidfVectorizer
+import pandas as pd
+import numpy as np
+from porter2stemmer import Porter2Stemmer
+
+import re
+
+save_1 = shelve.open("save_1.shelve")
+save_2 = shelve.open("save_2.shelve")
+save_3 = shelve.open("save_3.shelve")
+save_4 = shelve.open("save_4.shelve")
+save_5 = shelve.open("save_5.shelve")
+
+key = list(save_1.keys())
+print(key)
\ No newline at end of file
diff --git a/urlID.pkl b/urlID.pkl
new file mode 100644
index 0000000..eff0dff
Binary files /dev/null and b/urlID.pkl differ
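Because postings now store an integer id instead of the url string, anything that reports results has to translate ids back through urlID.pkl. A minimal sketch of reading it back, assuming the indexer has already finished and rewritten the file with the full url list:

    import os
    import pickle

    here = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(here, "urlID.pkl"), "rb") as f:
        url_list = pickle.load(f)   # list of urls; a url's index in the list is its id

    def id_to_url(url_id):
        # posting.url holds the position of the document's url in url_list
        return url_list[url_id]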
diff --git a/worker.py b/worker.py
index 9ad5140..e861935 100644
--- a/worker.py
+++ b/worker.py
@@ -5,6 +5,7 @@ import shelve
 from bs4 import BeautifulSoup
 from time import perf_counter
 import time
+import pickle
 
 import re
@@ -30,80 +31,26 @@ class Worker(Thread):
     def run(self):
         print("Target: " + str(self.file))
-        ticker = perf_counter()
-        tic = perf_counter()
+
         file_load = open(self.file)
         data = json.load(file_load)
         soup = BeautifulSoup(data["content"],features="lxml")
-        words = word_tokenize(soup.get_text())
-        toc = perf_counter()
-        if toc - tic > 1 :
-            print("Took " + str(toc - tic) + "seconds to tokenize text !")
+        # Gets a cleaner version of the text than soup.get_text()
+        clean_text = ' '.join(soup.stripped_strings)
+        # Strips out large whitespace, tabbed space, and other spacing; the regex matches a whitespace character followed by a non-space, non-word character
+        clean_text = re.sub(r'\s[^ \w]', '', clean_text)
+        # Tokenizes the text and joins it back into one whole string; keeping it as a single string is essential for get_tf_idf to work as intended
+        clean_text = " ".join([i for i in clean_text.split() if i != "" and re.fullmatch('[A-Za-z0-9]+', i)])
+        # Stems the tokenized text
+        clean_text = " ".join([self.indexer.stemmer.stem(i) for i in clean_text.split()])
+        # Wrap clean_text in a list because get_tf_idf works properly with single-element lists
+        x = [clean_text]
+        # ngrams is a dict shaped like {ngram : {0: tf-idf score}}
+        ngrams = self.indexer.get_tf_idf(x)
-        tokenized_words = list()
-        stemmed_words = list()
+        for ngram, tfidf in ngrams.items():
+            posting = Posting(self.indexer.get_url_id(data["url"]), tfidf[0])
+            self.indexer.save_index(ngram,posting)
-        important = {'b' : [], 'h1' : [], 'h2' : [], 'h3' : [], 'title' : []}
-        for key_words in important.keys():
-            for i in soup.findAll(key_words):
-                for word in word_tokenize(i.text):
-                    important[key_words].append(self.indexer.stemmer.stem(word))
-
-        tic = perf_counter()
-        for word in words:
-            if word != "" and re.fullmatch('[A-Za-z0-9]+',word):
-                #So all the tokenized words are here,
-                tokenized_words.append(word)
-        toc = perf_counter()
-        if toc - tic > 1 :
-            print("Took " + str(toc - tic) + "seconds to isalnum text !")
-        #YOUR CODE HERE
-
-        tic = perf_counter()
-        for word in tokenized_words:
-            stemmed_words.append(self.indexer.stemmer.stem(word))
-            #stemming,
-            #tf_idf
-            #get_tf_idf(stemmed_words,word)
-            #post = Posting()
-        toc = perf_counter()
-        if toc - tic > 1 :
-            print("Took " + str(toc - tic) + "seconds to stemmed text !")
-
-        counts = Counter(stemmed_words)
-        size = len(stemmed_words)
-        for word in counts:
-            #posting = Posting(data["url"],self.get_tf_idf(list(' '.join(stemmed_words)),word))
-            tic = perf_counter()
-            weight = 1.0
-            index = 0
-            """
-            for group in important:
-                for word_important in group:
-                    if word_important.lower() == word.lower():
-                        if index == 0:
-                            weight = 1.2
-                        elif index == 1:
-                            weight = 1.8
-                        elif index == 2:
-                            weight = 1.5
-                        elif index == 3:
-                            weight = 1.3
-                        elif index == 4:
-                            weight = 2.0
-                index = index + 1
-            """
-
-            posting = Posting(data["url"],counts[word]/size*weight)
-            toc = perf_counter()
-            if toc - tic > 1 :
-                print("Took " + str(toc - tic) + "seconds to tf_idf text !")
-
-            tic = perf_counter()
-            self.indexer.save_index(word,posting)
-            toc = perf_counter()
-            if toc - tic > 1 :
-                print("Took " + str(toc - tic) + "seconds to save text !")
-
-        tocker = perf_counter()
-        print("Finished " + data['url'] + "\n" + str(tocker-ticker))
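The indexer.py hunk leaves a TODO for the important-words scaling that the removed get_tf_idf applied. A rough sketch of how those multipliers could be reattached to the new dict-based scores; the weights come from the removed code, while apply_weights and the important dict are hypothetical names, not part of this diff.

    WEIGHTS = {'b': 1.2, 'h1': 1.75, 'h2': 1.5, 'h3': 1.2, 'title': 2.0}

    def apply_weights(scores, important):
        # scores: {ngram: {0: tf_idf}} as returned by Indexer.get_tf_idf
        # important: {'b': [stemmed words], 'h1': [...], ...} as the old Worker.run built it
        for tag, words in important.items():
            for ngram in scores:
                if ngram in words:
                    scores[ngram][0] *= WEIGHTS.get(tag, 1.0)
        return scores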