filled out get_tf_idf, added test file for it

This commit is contained in:
Lacerum 2022-05-06 04:04:04 -07:00
parent 81da17de93
commit b833afbfa3
2 changed files with 66 additions and 2 deletions

View File

@ -16,10 +16,13 @@ import shelve
from bs4 import BeautifulSoup
#Data process
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re
@ -88,13 +91,26 @@ class Indexer():
else:
print("You have somehow went beyond the magic")
return None
# I have a test file (mytest.py) with pandas but couldn't figure out how to grab just a single cell.
# so I came up with this, if anyone knows how to get a single cell and can explain it to
# me I would love to know, as I think that method might be quicker, maybe, idk it like
# 4am
# https://stackoverflow.com/questions/34449127/sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-documen
def get_tf_idf(self, words, word):
    """Return the tf-idf score of *word* in the first document of *words*.

    Parameters:
        words: list of document strings — the corpus fed to TfidfVectorizer.
        word:  the term to score. It must match a vectorizer feature name;
               note sklearn lowercases features by default, so pass a
               lowercase term.

    Returns:
        The tf-idf weight of *word* in document 0, or -1 when the term
        does not occur anywhere in the corpus vocabulary.
    """
    vect = TfidfVectorizer()
    tfidf_matrix = vect.fit_transform(words)
    # vocabulary_ maps each feature name to its column index, so one dict
    # lookup replaces the scan over nonzero features.  The original
    # `for ...: if w == word: return s / else: return -1` bailed out with
    # -1 after inspecting only the FIRST feature, so nearly every lookup
    # incorrectly returned -1.
    column = vect.vocabulary_.get(word)
    if column is None:
        return -1  # term absent from the corpus entirely
    # A 0.0 here means the term is in the vocabulary but not in document 0.
    return tfidf_matrix[0, column]
def get_data(self):

48
mytest.py Normal file
View File

@ -0,0 +1,48 @@
from typing import Mapping
from urllib import response
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
# Scratch test for tf-idf lookup:
#   words -> the corpus (whole text)
#   word  -> the term we want the score for
# Prints the term's tf-idf weight in document 0 when it is a feature.
words = ['this is the first document this is another one this is the final Kaeya of all the docs wow this will just keep going who knew that ther could be this much Madeon - Love You Back (Visualizer)']
doc1 = "I can't fucking take it any more. Among Us has singlehandedly ruined my life. The other day my teacher was teaching us Greek Mythology and he mentioned a pegasus and I immediately thought 'Pegasus? more like Mega Sus!!!!' and I've never wanted to kms more. I can't look at a vent without breaking down and fucking crying. I can't eat pasta without thinking 'IMPASTA??? THATS PRETTY SUS!!!!' Skit 4 by Kanye West. The lyrics ruined me. A Mongoose, or the 25th island of greece. The scientific name for pig. I can't fucking take it anymore. Please fucking end my suffering."
doc2 = "Anyways, um... I bought a whole bunch of shungite rocks, do you know what shungite is? Anybody know what shungite is? No, not Suge Knight, I think he's locked up in prison. I'm talkin' shungite. Anyways, it's a two billion year-old like, rock stone that protects against frequencies and unwanted frequencies that may be traveling in the air. That's my story, I bought a whole bunch of stuff. Put 'em around the la casa. Little pyramids, stuff like that."
word = 'doc'

vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(words)
names = vectorizer.get_feature_names_out()
# Map every feature that is nonzero in document 0 to its tf-idf weight.
scores = {names[col]: matrix[0, col] for col in matrix[0, :].nonzero()[1]}
if word in scores:
    print(scores[word])
#--------------------------------- Prints the list of all -----------------------------------#
# for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
# print (w, s)
#--------------------------------- Both of these implementations are from this link -----------------------------------------#
# https://stackoverflow.com/questions/34449127/sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-documen
# tfidf = TfidfVectorizer()
# response = tfidf.fit_transform([doc1, doc2])
# print(len(tfidf.vocabulary_))
# print(tfidf.vocabulary_)
# feature_names = tfidf.get_feature_names_out()
# for col in response.nonzero()[1]:
# print(feature_names[col], ' - ', response[0,col])
# vect = TfidfVectorizer()
# tfidf_matrix = vect.fit_transform(words)
# df = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names_out())
# print(df)