diff --git a/indexer.py b/indexer.py index b5f2f96..0d7b41c 100644 --- a/indexer.py +++ b/indexer.py @@ -18,6 +18,9 @@ from bs4 import BeautifulSoup #Data process from nltk.tokenize import word_tokenize +from nltk.stem import PorterStemmer +from sklearn.feature_extraction.text import TfidfVectorizer + import re @@ -27,7 +30,8 @@ class Indexer(): self.path = "data/DEV/" self.restart = restart self.trimming = trimming - + self.stemmer = PorterStemmer() + self.vectorizer = TfidfVectorizer(lowercase=True,ngram_range = (1,3)) #Shelves for index #https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html @@ -85,6 +89,13 @@ class Indexer(): print("You have somehow went beyond the magic") return None + def get_tf_idf(self,words,word): + #tf_idf + #words = whole text + #word the word we finding the score for + #return the score + pass + def get_data(self): for directory in os.listdir(self.path): #JSON["url"] = url of crawled page, ignore fragments #JSON["content"] = actual HTML #JSON["encoding"] = ENCODING - print(file) file_load = open(self.path + "/" + directory + "/"+file) data = json.load(file_load) soup = BeautifulSoup(data["content"],from_encoding=data["encoding"]) words = word_tokenize(soup.get_text()) + tokenized_words = list() + stemmed_words = list() for word in words: - if word is not "" and word.isalnum(): - print(word) + if word != "" and word.isalnum(): + #So all the tokenized words are here, + tokenized_words.append(word) + #YOUR CODE HERE + print(tokenized_words) + + for word in tokenized_words: + stemmed_words.append(self.stemmer.stem(word)) + + #print(X) -- X is never defined here; printing it would raise NameError + #stemming, + #tf_idf + #get_tf_idf(stemmed_words,word) + #post = Posting() + + print(stemmed_words) + # exit(1) + diff --git a/posting.py b/posting.py index 9625454..53e666a 100644 --- a/posting.py +++ b/posting.py @@ -1,12 +1,9 @@ #Posting class for indexer, will probably be more complex as we keep adding crap to it class Posting(): - def 
__init(self,source): - self.source = source - self.tf_idf = get_tf_idf() - - def get_tf_idf(self): - #Do tf_idf here - + def __init__(self,url,tf_idf): + self.url = url + self.tf_idf = tf_idf + def comparator(self): #Some custom comparator for sorting postings later \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 8721c37..fcf393d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ nltk re shelve json -beautifulsoup4 \ No newline at end of file +beautifulsoup4 +scikit-learn \ No newline at end of file