diff --git a/.gitignore b/.gitignore
index ea43522..a5f5e47 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
-/data/DEV
+/data/
 *.shelve
+/__pycache__/
diff --git a/indexer.py b/indexer.py
index 9412716..510984d 100644
--- a/indexer.py
+++ b/indexer.py
@@ -14,6 +14,7 @@ import json
 import os
 import shelve
 from bs4 import BeautifulSoup
+from time import perf_counter
@@ -26,6 +27,9 @@ import numpy as np
 import re
 
+#Logging postings
+from posting import Posting
+
 class Indexer():
     def __init__(self,restart,trimming):
@@ -34,7 +38,6 @@ class Indexer():
         self.restart = restart
         self.trimming = trimming
         self.stemmer = PorterStemmer()
-        self.vectorizer = TfidfVectorizer(lowercase=True,ngram_range = (1,3))
 
         #Shelves for index
         #https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html
@@ -44,7 +47,7 @@
         #Save #2 = EFGHIJK + (2-3)~ 27.1% of words
         #Save #3 = LMNOPQ + (4-7) ~ 25.4% of words
         #Save #4 = RSTUVWXYZ + (8-9)~ 29.2% of words
-        #Save #5 = Numbers ???
+        #Save #5 = Special characters
         if os.path.exists("save_1.shelve") and restart:
             os.remove("save_1.shelve")
         if os.path.exists("save_2.shelve") and restart:
@@ -53,34 +56,40 @@
             os.remove("save_3.shelve")
         if os.path.exists("save_4.shelve") and restart:
             os.remove("save_4.shelve")
+        if os.path.exists("save_5.shelve") and restart:
+            os.remove("save_5.shelve")
 
         self.save_1 = shelve.open("save_1.shelve")
         self.save_2 = shelve.open("save_2.shelve")
         self.save_3 = shelve.open("save_3.shelve")
         self.save_4 = shelve.open("save_4.shelve")
+        self.save_5 = shelve.open("save_5.shelve")
 
     def save_index(self,word,posting):
-        wordhash = hash(word) ##Honestly do not know why hashing is even needed, might cause more problems
-        cur_save = get_save(word)
+        cur_save = self.get_save_file(word)
         shelve_list = list()
-        if wordhash not in cur_save:
-            shelve_list.append(posting)
-            cur_save[wordhash] = shelve_list
-            cur_save.sync()
-        else:
-            shelve_list = cur_save[wordhash]
+        try:
+            shelve_list = cur_save[word]
             shelve_list.append(posting)
+            tic = perf_counter()
             shelve_list.sort(key=lambda x: x.tf_idf, reverse = True)
+            toc = perf_counter()
+            if toc - tic > 1 :
+                print("Took " + str(toc - tic) + "seconds to sort shelve list !")
+            cur_save.sync()
+        except:
+            shelve_list.append(posting)
+            cur_save[word] = shelve_list
             cur_save.sync()
 
     def get_save_file(self,word):
         #return the correct save depending on the starting letter of word
         word_lower = word.lower()
-        if re.match(r"^[a-d1-1].*",word_lower):
+        if re.match(r"^[a-d0-1].*",word_lower):
             return self.save_1
         elif re.match(r"^[e-k2-3].*",word_lower):
             return self.save_2
@@ -89,8 +98,10 @@ class Indexer():
         elif re.match(r"^[r-z8-9].*",word_lower):
             return self.save_4
         else:
+            print(word)
             print("You have somehow went beyond the magic")
-            return None
+            return self.save_5
+
     # I have a test file (mytest.py) with pandas but couldn't figure out how to grab just a single cell.
     # so I came up with this, if anyone knows how to get a single cell and can explain it to
     # me I would love to know, as I think that method might be quicker, maybe, idk it like
@@ -118,31 +129,66 @@ class Indexer():
         #JSON["url"] = url of crawled page, ignore fragments
         #JSON["content"] = actual HTML
         #JSON["encoding"] = ENCODING
+        ticker = perf_counter()
+        tic = perf_counter()
         file_load = open(self.path + "/" + directory + "/"+file)
         data = json.load(file_load)
         soup = BeautifulSoup(data["content"],from_encoding=data["encoding"])
         words = word_tokenize(soup.get_text())
+        toc = perf_counter()
+        if toc - tic > 1 :
+            print("Took " + str(toc - tic) + "seconds to tokenize text !")
+
         tokenized_words = list()
         stemmed_words = list()
+
+        tic = perf_counter()
         for word in words:
-            if word != "" and word.isalnum():
+            if word != "" and re.fullmatch('[A-Za-z0-9]+',word):
                 #So all the tokenized words are here,
                 tokenized_words.append(word)
+        toc = perf_counter()
+        if toc - tic > 1 :
+            print("Took " + str(toc - tic) + "seconds to isalnum text !")
 
         #YOUR CODE HERE
-        print(tokenized_words)
+        tic = perf_counter()
         for word in tokenized_words:
             stemmed_words.append(self.stemmer.stem(word))
-
-        print(X)
             #stemming,
             #tf_idf
             #get_tf_idf(stemmed_words,word)
             #post = Posting()
+        toc = perf_counter()
+        if toc - tic > 1 :
+            print("Took " + str(toc - tic) + "seconds to stemmed text !")
+
+        for word in stemmed_words:
+            #posting = Posting(data["url"],self.get_tf_idf(list(' '.join(stemmed_words)),word))
+            tic = perf_counter()
+            posting = Posting(data["url"],self.tf_idf_raw(stemmed_words,word))
+            toc = perf_counter()
+            if toc - tic > 1 :
+                print("Took " + str(toc - tic) + "seconds to tf_idf text !")
+
+            tic = perf_counter()
+            self.save_index(word,posting)
+            toc = perf_counter()
+            if toc - tic > 1 :
+                print("Took " + str(toc - tic) + "seconds to save text !")
+
+        tocker = perf_counter()
+        print("Finished " + data['url'] + " in \t " + str(tocker-ticker))
+
+    def tf_idf_raw(self,words,word):
+        tf_times = words.count(word)
+
+        tf = tf_times/len(words)
+
+        return tf
+
+
-        print(stemmed_words)
-        #
-        exit(1)
diff --git a/posting.py b/posting.py
index 53e666a..e6d1141 100644
--- a/posting.py
+++ b/posting.py
@@ -1,9 +1,10 @@
 #Posting class for indexer, will probably be more complex as we keep adding crap to it
 class Posting():
-    def __init(self,url,tf_idf):
+    def __init__(self,url,tf_idf):
         self.url = url
         self.tf_idf = tf_idf
 
     def comparator(self):
-        #Some custom comparator for sorting postings later
\ No newline at end of file
+        #Some custom comparator for sorting postings later
+        pass
\ No newline at end of file
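A note on the new save_index: the bare except catches more than the missing-word case (a corrupted shelve read would land there too), and the whole posting list is re-sorted on every insert even though it is already sorted. One possible tightening, not part of this patch, assuming Python 3.10+ for the key argument to bisect.insort (the negated key keeps the list descending by tf_idf):

import bisect

def save_index(self, word, posting):
    cur_save = self.get_save_file(word)
    try:
        shelve_list = cur_save[word]
    except KeyError:  # only the "word not indexed yet" case, not every error
        shelve_list = []
    # O(n) insertion into the already-descending list instead of a full sort.
    bisect.insort(shelve_list, posting, key=lambda p: -p.tf_idf)
    cur_save[word] = shelve_list  # reassign so shelve persists the change
    cur_save.sync()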
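On the pandas question in the indexer comment: pandas does expose single-cell accessors, DataFrame.at for label-based lookup and DataFrame.iat for purely positional lookup, both quicker than pulling a whole row or column first. A minimal sketch, with a made-up frame standing in for whatever mytest.py actually loads:

import pandas as pd

df = pd.DataFrame({"word": ["hello", "world"], "tf_idf": [0.4, 0.1]})

print(df.at[0, "tf_idf"])  # by row label and column name -> 0.4
print(df.iat[0, 1])        # by integer row/column position -> 0.4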
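On tf_idf_raw: despite the name, it returns only the term-frequency half of the weight (count of the term over document length). The idf half, log(N/df), needs document frequencies across the whole corpus, so it can only be applied once every page has been indexed. A sketch of the full weight under that standard formulation, where doc_freq and total_docs are hypothetical corpus-wide inputs gathered in a later pass; Counter also avoids calling words.count(word) once per word, which is quadratic per document:

import math
from collections import Counter

def tf_idf_all(stemmed_words, doc_freq, total_docs):
    # One pass to count every term, instead of list.count per term.
    counts = Counter(stemmed_words)
    n = len(stemmed_words)
    return {term: (c / n) * math.log(total_docs / doc_freq[term])
            for term, c in counts.items()}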