From b82516ec856de976d3c28191b002b0658b2c3051 Mon Sep 17 00:00:00 2001 From: Lacerum Date: Fri, 6 May 2022 14:03:49 -0700 Subject: [PATCH] attempted fix for tf-idf --- indexer.py | 18 ++++++++---------- mytest.py | 49 +++++++++++++++++++++++++++++++------------------ 2 files changed, 39 insertions(+), 28 deletions(-) diff --git a/indexer.py b/indexer.py index 02decf7..9412716 100644 --- a/indexer.py +++ b/indexer.py @@ -101,16 +101,14 @@ class Indexer(): #words = whole text #word the word we finding the score for #return the score - vect = TfidfVectorizer() - tfidf_matrix = vect.fit_transform(words) - feature_index = tfidf_matrix[0,:].nonzero()[1] - feature_names = vect.get_feature_names_out() - tfidf_scores = zip(feature_index, [tfidf_matrix[0, x] for x in feature_index]) - for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]: - if w == word: - return s - else: - return -1 # don't really know what to do if the word doesn't exist, we can catch with negative or print an error? + try: + tfidf = TfidfVectorizer() + tfidf_matrix = tfidf.fit_transform(words) + df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out()) + return(df.iloc[0][''.join(word)]) + #print(df) + except KeyError: + return -1 def get_data(self): diff --git a/mytest.py b/mytest.py index d240fee..3ec2c2e 100644 --- a/mytest.py +++ b/mytest.py @@ -8,21 +8,35 @@ import numpy as np #words = whole text #word the word we finding the score for #return the score -words = ['this is the first document this is another one this is the final Kaeya of all the docs wow this will just keep going who knew that ther could be this much Madeon - Love You Back (Visualizer)'] -doc1 = "I can't fucking take it any more. Among Us has singlehandedly ruined my life. The other day my teacher was teaching us Greek Mythology and he mentioned a pegasus and I immediately thought 'Pegasus? more like Mega Sus!!!!' and I've never wanted to kms more. 
I can't look at a vent without breaking down and fucking crying. I can't eat pasta without thinking 'IMPASTA??? THATS PRETTY SUS!!!!' Skit 4 by Kanye West. The lyrics ruined me. A Mongoose, or the 25th island of greece. The scientific name for pig. I can't fucking take it anymore. Please fucking end my suffering." -doc2 = "Anyways, um... I bought a whole bunch of shungite rocks, do you know what shungite is? Anybody know what shungite is? No, not Suge Knight, I think he's locked up in prison. I'm talkin' shungite. Anyways, it's a two billion year-old like, rock stone that protects against frequencies and unwanted frequencies that may be traveling in the air. That's my story, I bought a whole bunch of stuff. Put 'em around the la casa. Little pyramids, stuff like that." -word = 'doc' -vect = TfidfVectorizer() -tfidf_matrix = vect.fit_transform(words) -feature_index = tfidf_matrix[0,:].nonzero()[1] -feature_names = vect.get_feature_names_out() -tfidf_scores = zip(feature_index, [tfidf_matrix[0, x] for x in feature_index]) -for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]: - if w == word: - print (s) - else: - exit + +words = ['this is the first document ' + 'this is another one this is the final ' + 'Kaeya of all the docs wow this will just ' + 'keep going who knew that ther could be this ' + 'much Madeon - Love You Back (Visualizer)' + 'how many how many how how how how'] +doc1 = ["I can't fucking take it any more. Among Us has singlehandedly ruined my life. The other day my teacher was teaching us Greek Mythology and he mentioned a pegasus and I immediately thought 'Pegasus? more like Mega Sus!!!!' and I've never wanted to kms more. I can't look at a vent without breaking down and fucking crying. I can't eat pasta without thinking 'IMPASTA??? THATS PRETTY SUS!!!!' Skit 4 by Kanye West. The lyrics ruined me. A Mongoose, or the 25th island of greece. The scientific name for pig. I can't fucking take it anymore. 
Please fucking end my suffering."] +doc2 = ["Anyways, um... I bought a whole bunch of shungite rocks, do you know what shungite is? Anybody know what shungite is? No, not Suge Knight, I think he's locked up in prison. I'm talkin' shungite. Anyways, it's a two billion year-old like, rock stone that protects against frequencies and unwanted frequencies that may be traveling in the air. That's my story, I bought a whole bunch of stuff. Put 'em around the la casa. Little pyramids, stuff like that."] +word = 'life' + +try: + tfidf = TfidfVectorizer() + tfidf_matrix = tfidf.fit_transform(doc1) + df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out()) + print(df.iloc[0][''.join(word)]) + #print(df) +except KeyError: # word does not exist + print(-1) + +# vect = TfidfVectorizer() +# tfidf_matrix = vect.fit_transform(words) +# feature_index = tfidf_matrix[0,:].nonzero()[1] +# feature_names = vect.get_feature_names_out() +# tfidf_scores = zip(feature_index, [tfidf_matrix[0, x] for x in feature_index]) +# for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]: +# if w == word: +# print(s) #--------------------------------- Prints the list of all -----------------------------------# # for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]: # print (w, s) @@ -38,10 +52,9 @@ for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]: # print(feature_names[col], ' - ', response[0,col]) -# vect = TfidfVectorizer() -# tfidf_matrix = vect.fit_transform(words) -# df = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names_out()) -# print(df) + + +