diff --git a/indexer.py b/indexer.py
index 943c453..0213188 100644
--- a/indexer.py
+++ b/indexer.py
@@ -197,74 +197,10 @@ class Indexer():
                 #Found 55770 documents
                 #
-                ticker = perf_counter()
-                tic = perf_counter()
-                file_load = open(self.path + "/" + directory + "/"+file)
-                data = json.load(file_load)
-                soup = BeautifulSoup(data["content"],from_encoding=data["encoding"])
-                words = word_tokenize(soup.get_text())
-                #getting important tokens
-                important = {'b' : [], 'h1' : [], 'h2' : [], 'h3' : [], 'title' : []}
-                for type in important.keys():
-                    for i in soup.findAll(type):
-                        for word in word_tokenize(i.text):
-                            important[type].append(self.stemmer.stem(word))
+
-                toc = perf_counter()
-                if toc - tic > 1 :
-                    print("Took " + str(toc - tic) + "seconds to tokenize text !")
-
-                tokenized_words = list()
-                stemmed_words = list()
-
-                tic = perf_counter()
-                for word in words:
-                    if word != "" and re.fullmatch('[A-Za-z0-9]+',word):
-                        #So all the tokenized words are here,
-                        tokenized_words.append(word)
-                toc = perf_counter()
-                if toc - tic > 1 :
-                    print("Took " + str(toc - tic) + "seconds to isalnum text !")
-                #YOUR CODE HERE
-
-                tic = perf_counter()
-                for word in tokenized_words:
-                    stemmed_words.append(self.stemmer.stem(word))
-                    #stemming,
-                    #tf_idf
-                    #get_tf_idf(stemmed_words,word)
-                    #post = Posting()
-                toc = perf_counter()
-                if toc - tic > 1 :
-                    print("Took " + str(toc - tic) + "seconds to stemmed text !")
-
-                for word in stemmed_words:
-                    #posting = Posting(data["url"],self.get_tf_idf(list(' '.join(stemmed_words)),word))
-                    tic = perf_counter()
-                    #added argument important
-                    posting = Posting(data["url"],self.tf_idf_raw(stemmed_words,word, important))
-                    toc = perf_counter()
-                    if toc - tic > 1 :
-                        print("Took " + str(toc - tic) + "seconds to tf_idf text !")
-
-                    tic = perf_counter()
-                    self.save_index(word,posting)
-                    toc = perf_counter()
-                    if toc - tic > 1 :
-                        print("Took " + str(toc - tic) + "seconds to save text !")
-
-                tocker = perf_counter()
-                print("Finished " + data['url'] + " in \t " + str(tocker-ticker))
-
-    def tf_idf_raw(self,words,word):
-        tf_times = words.count(word)
-
-        tf = tf_times/len(words)
-
-        return tf
-
@@ -275,9 +211,8 @@ class Indexer():
 
 
 def main():
-    indexer = Indexer(False,0)
-
-    #indexer.get_data()
+    indexer = Indexer(True,0)
+    indexer.get_data()
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/worker.py b/worker.py
index c3639dd..9ad5140 100644
--- a/worker.py
+++ b/worker.py
@@ -43,6 +43,12 @@ class Worker(Thread):
         tokenized_words = list()
         stemmed_words = list()
 
+        important = {'b' : [], 'h1' : [], 'h2' : [], 'h3' : [], 'title' : []}
+        for key_words in important.keys():
+            for i in soup.findAll(key_words):
+                for word in word_tokenize(i.text):
+                    important[key_words].append(self.indexer.stemmer.stem(word))
+
         tic = perf_counter()
         for word in words:
             if word != "" and re.fullmatch('[A-Za-z0-9]+',word):
@@ -69,7 +75,26 @@ class Worker(Thread):
         for word in counts:
             #posting = Posting(data["url"],self.get_tf_idf(list(' '.join(stemmed_words)),word))
             tic = perf_counter()
-            posting = Posting(data["url"],counts[word]/size)
+            weight = 1.0
+            index = 0
+            """
+            for group in important:
+                for word_important in group:
+                    if word_important.lower() == word.lower():
+                        if index == 0:
+                            weight = 1.2
+                        elif index == 1:
+                            weight = 1.8
+                        elif index == 2:
+                            weight = 1.5
+                        elif index == 3:
+                            weight = 1.3
+                        elif index == 4:
+                            weight = 2.0
+                index = index + 1
+            """
+
+            posting = Posting(data["url"],counts[word]/size*weight)
             toc = perf_counter()
             if toc - tic > 1 :
                 print("Took " + str(toc - tic) + "seconds to tf_idf text !")