Stemmed done

This commit is contained in:
inocturnis 2022-05-04 15:30:01 -07:00
parent 0cb72cbed4
commit 81da17de93
3 changed files with 38 additions and 12 deletions

View File

@@ -18,6 +18,9 @@ from bs4 import BeautifulSoup
 #Data process
 from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+from sklearn.feature_extraction.text import TfidfVectorizer
 import re
@@ -27,7 +30,8 @@ class Indexer():
         self.path = "data/DEV/"
         self.restart = restart
         self.trimming = trimming
+        self.stemmer = PorterStemmer()
+        self.vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1, 3))

         #Shelves for index
         #https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html
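
For orientation, a minimal sketch of what a vectorizer configured this way produces; the two-document corpus and variable names are illustrative, not from the repo. Fitting returns a sparse document-term matrix whose columns cover unigrams through trigrams:

from sklearn.feature_extraction.text import TfidfVectorizer

# Illustrative corpus; the real Indexer would feed it crawled page text.
docs = [
    "information retrieval is fun",
    "retrieval of information",
]

vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1, 3))
matrix = vectorizer.fit_transform(docs)    # sparse (n_docs, n_features) tf-idf matrix
print(matrix.shape)
print(vectorizer.get_feature_names_out())  # unigrams, bigrams, and trigrams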
@@ -85,6 +89,13 @@ class Indexer():
         print("You have somehow went beyond the magic")
         return None
+
+    def get_tf_idf(self, words, word):
+        #tf-idf score for one term
+        #words = the whole tokenized text of the document
+        #word = the word we are finding the score for
+        #returns the score
+        pass

     def get_data(self):
         for directory in os.listdir(self.path):
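
A minimal sketch of what this stub could compute, assuming the indexer also tracks corpus-level statistics; the doc_freqs and total_docs parameters are hypothetical additions, since the stub's signature only carries one document's tokens and the target word:

import math
from collections import Counter

def get_tf_idf(words, word, doc_freqs, total_docs):
    # words: all tokens of one document; word: the term being scored.
    # doc_freqs (term -> number of documents containing it) and total_docs
    # are assumed corpus statistics not present in the stub's signature.
    if not words:
        return 0.0
    tf = Counter(words)[word] / len(words)                     # term frequency
    idf = math.log(total_docs / (1 + doc_freqs.get(word, 0)))  # smoothed idf
    return tf * idf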
@@ -93,16 +104,33 @@ class Indexer():
             #JSON["url"] = url of crawled page, ignore fragments
             #JSON["content"] = actual HTML
             #JSON["encoding"] = ENCODING
+            print(file)
             file_load = open(self.path + "/" + directory + "/" + file)
             data = json.load(file_load)
             soup = BeautifulSoup(data["content"], from_encoding=data["encoding"])
             words = word_tokenize(soup.get_text())
+            tokenized_words = list()
+            stemmed_words = list()
             for word in words:
-                if word is not "" and word.isalnum():
-                    print(word)
+                if word != "" and word.isalnum():
+                    #So all the tokenized words are collected here
+                    tokenized_words.append(word)
+            #YOUR CODE HERE
+            print(tokenized_words)
+            for word in tokenized_words:
+                stemmed_words.append(self.stemmer.stem(word))
+            #print(X)
+            #stemming
+            #tf-idf
+            #get_tf_idf(stemmed_words, word)
+            #post = Posting()
+            print(stemmed_words)
             exit(1)
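
Taken together, the hunk builds a per-page pipeline: parse the HTML, tokenize, keep alphanumeric tokens, then stem. A self-contained sketch of that flow with an illustrative page (requires NLTK's punkt tokenizer data):

from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

html = "<html><body><p>Running runners ran quickly.</p></body></html>"  # sample input
soup = BeautifulSoup(html, "html.parser")

stemmer = PorterStemmer()
tokenized_words = [w for w in word_tokenize(soup.get_text()) if w.isalnum()]
stemmed_words = [stemmer.stem(w) for w in tokenized_words]
print(stemmed_words)   # ['run', 'runner', 'ran', 'quickli']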

View File

@@ -1,12 +1,9 @@
 #Posting class for indexer, will probably be more complex as we keep adding crap to it
 class Posting():
-    def __init__(self, source):
-        self.source = source
-        self.tf_idf = get_tf_idf()
-
-    def get_tf_idf(self):
-        #Do tf_idf here
+    def __init__(self, url, tf_idf):
+        self.url = url
+        self.tf_idf = tf_idf

     def comparator(self):
         #Some custom comparator for sorting postings later
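
A sketch of how the reshaped Posting might be used, reading comparator as a sort key; ordering postings by descending tf_idf is an assumption, since the stub leaves the comparison open, and the sample URLs are hypothetical:

class Posting():
    def __init__(self, url, tf_idf):
        self.url = url
        self.tf_idf = tf_idf

    def comparator(self):
        #Assumed ordering: highest tf-idf first.
        return -self.tf_idf

postings = [Posting("a.html", 0.12), Posting("b.html", 0.87)]  # hypothetical data
postings.sort(key=Posting.comparator)
print([p.url for p in postings])   # ['b.html', 'a.html']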

View File

@@ -2,4 +2,5 @@ nltk
 re
 shelve
 json
 beautifulsoup4
+sklearn