Stemmed done

This commit is contained in:
inocturnis 2022-05-04 15:30:01 -07:00
parent 0cb72cbed4
commit 81da17de93
3 changed files with 38 additions and 12 deletions

View File

@@ -18,6 +18,9 @@ from bs4 import BeautifulSoup
 #Data process
 from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+from sklearn.feature_extraction.text import TfidfVectorizer
 import re
@@ -27,7 +30,8 @@ class Indexer():
         self.path = "data/DEV/"
         self.restart = restart
         self.trimming = trimming
+        self.stemmer = PorterStemmer()
+        self.vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1, 3))

         #Shelves for index
         #https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html
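
For orientation, a minimal sketch of what a vectorizer configured this way produces; the two-document corpus and variable names are illustrative, not from the repo. Fitting returns a sparse document-term matrix whose columns cover unigrams through trigrams:

from sklearn.feature_extraction.text import TfidfVectorizer

# Illustrative corpus; the real Indexer would feed it crawled page text.
docs = [
    "information retrieval is fun",
    "retrieval of information",
]

vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1, 3))
matrix = vectorizer.fit_transform(docs)    # sparse (n_docs, n_features) tf-idf matrix
print(matrix.shape)
print(vectorizer.get_feature_names_out())  # unigrams, bigrams, and trigrams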
@@ -85,6 +89,13 @@ class Indexer():
         print("You have somehow went beyond the magic")
         return None
+
+    def get_tf_idf(self, words, word):
+        #tf-idf score for one term
+        #words = the whole tokenized text of the document
+        #word = the word we are finding the score for
+        #returns the score
+        pass

     def get_data(self):
         for directory in os.listdir(self.path):
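
A minimal sketch of what this stub could compute, assuming the indexer also tracks corpus-level statistics; the doc_freqs and total_docs parameters are hypothetical additions, since the stub's signature only carries one document's tokens and the target word:

import math
from collections import Counter

def get_tf_idf(words, word, doc_freqs, total_docs):
    # words: all tokens of one document; word: the term being scored.
    # doc_freqs (term -> number of documents containing it) and total_docs
    # are assumed corpus statistics not present in the stub's signature.
    if not words:
        return 0.0
    tf = Counter(words)[word] / len(words)                     # term frequency
    idf = math.log(total_docs / (1 + doc_freqs.get(word, 0)))  # smoothed idf
    return tf * idf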
@@ -93,16 +104,33 @@ class Indexer():
             #JSON["url"] = url of crawled page, ignore fragments
             #JSON["content"] = actual HTML
             #JSON["encoding"] = ENCODING
+            print(file)
             file_load = open(self.path + "/" + directory + "/" + file)
             data = json.load(file_load)
             soup = BeautifulSoup(data["content"], from_encoding=data["encoding"])
             words = word_tokenize(soup.get_text())
+            tokenized_words = list()
+            stemmed_words = list()
             for word in words:
-                if word is not "" and word.isalnum():
-                    print(word)
+                if word != "" and word.isalnum():
+                    #So all the tokenized words are collected here
+                    tokenized_words.append(word)
+            #YOUR CODE HERE
+            print(tokenized_words)
+            for word in tokenized_words:
+                stemmed_words.append(self.stemmer.stem(word))
+            #print(X)
+            #stemming
+            #tf-idf
+            #get_tf_idf(stemmed_words, word)
+            #post = Posting()
+            print(stemmed_words)
             exit(1)
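
Taken together, the hunk builds a per-page pipeline: parse the HTML, tokenize, keep alphanumeric tokens, then stem. A self-contained sketch of that flow with an illustrative page (requires NLTK's punkt tokenizer data):

from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

html = "<html><body><p>Running runners ran quickly.</p></body></html>"  # sample input
soup = BeautifulSoup(html, "html.parser")

stemmer = PorterStemmer()
tokenized_words = [w for w in word_tokenize(soup.get_text()) if w.isalnum()]
stemmed_words = [stemmer.stem(w) for w in tokenized_words]
print(stemmed_words)   # ['run', 'runner', 'ran', 'quickli']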

View File

@@ -1,12 +1,9 @@
 #Posting class for indexer, will probably be more complex as we keep adding crap to it
 class Posting():
-    def __init__(self, source):
-        self.source = source
-        self.tf_idf = get_tf_idf()
-
-    def get_tf_idf(self):
-        #Do tf_idf here
+    def __init__(self, url, tf_idf):
+        self.url = url
+        self.tf_idf = tf_idf

     def comparator(self):
         #Some custom comparator for sorting postings later
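
A sketch of how the reshaped Posting might be used, reading comparator as a sort key; ordering postings by descending tf_idf is an assumption, since the stub leaves the comparison open, and the sample URLs are hypothetical:

class Posting():
    def __init__(self, url, tf_idf):
        self.url = url
        self.tf_idf = tf_idf

    def comparator(self):
        #Assumed ordering: highest tf-idf first.
        return -self.tf_idf

postings = [Posting("a.html", 0.12), Posting("b.html", 0.87)]  # hypothetical data
postings.sort(key=Posting.comparator)
print([p.url for p in postings])   # ['b.html', 'a.html']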

View File

@@ -2,4 +2,5 @@ nltk
 re
 shelve
 json
 beautifulsoup4
+sklearn