Stemmed done
This commit is contained in:
parent
0cb72cbed4
commit
81da17de93
36
indexer.py
36
indexer.py
@ -18,6 +18,9 @@ from bs4 import BeautifulSoup
|
|||||||
|
|
||||||
#Data process
|
#Data process
|
||||||
from nltk.tokenize import word_tokenize
|
from nltk.tokenize import word_tokenize
|
||||||
|
from nltk.stem import PorterStemmer
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
@ -27,7 +30,8 @@ class Indexer():
|
|||||||
self.path = "data/DEV/"
|
self.path = "data/DEV/"
|
||||||
self.restart = restart
|
self.restart = restart
|
||||||
self.trimming = trimming
|
self.trimming = trimming
|
||||||
|
self.stemmer = PorterStemmer()
|
||||||
|
self.vectorizer = TfidfVectorizer(lowercase=True,ngram_range = (1,3))
|
||||||
|
|
||||||
#Shelves for index
|
#Shelves for index
|
||||||
#https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html
|
#https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html
|
||||||
@ -85,6 +89,13 @@ class Indexer():
|
|||||||
print("You have somehow went beyond the magic")
|
print("You have somehow went beyond the magic")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def get_tf_idf(self,words,word):
|
||||||
|
#tf_idf
|
||||||
|
#words = whole text
|
||||||
|
#word = the word we are finding the score for
|
||||||
|
#return the score
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def get_data(self):
|
def get_data(self):
|
||||||
for directory in os.listdir(self.path):
|
for directory in os.listdir(self.path):
|
||||||
@ -93,16 +104,33 @@ class Indexer():
|
|||||||
#JSON["url"] = url of crawled page, ignore fragments
|
#JSON["url"] = url of crawled page, ignore fragments
|
||||||
#JSON["content"] = actual HTML
|
#JSON["content"] = actual HTML
|
||||||
#JSON["encoding"] = ENCODING
|
#JSON["encoding"] = ENCODING
|
||||||
print(file)
|
|
||||||
file_load = open(self.path + "/" + directory + "/"+file)
|
file_load = open(self.path + "/" + directory + "/"+file)
|
||||||
data = json.load(file_load)
|
data = json.load(file_load)
|
||||||
soup = BeautifulSoup(data["content"],from_encoding=data["encoding"])
|
soup = BeautifulSoup(data["content"],from_encoding=data["encoding"])
|
||||||
words = word_tokenize(soup.get_text())
|
words = word_tokenize(soup.get_text())
|
||||||
|
tokenized_words = list()
|
||||||
|
stemmed_words = list()
|
||||||
for word in words:
|
for word in words:
|
||||||
if word is not "" and word.isalnum():
|
if word != "" and word.isalnum():
|
||||||
print(word)
|
#So all the tokenized words are here,
|
||||||
|
tokenized_words.append(word)
|
||||||
|
#YOUR CODE HERE
|
||||||
|
print(tokenized_words)
|
||||||
|
|
||||||
|
for word in tokenized_words:
|
||||||
|
stemmed_words.append(self.stemmer.stem(word))
|
||||||
|
|
||||||
|
print(X)
|
||||||
|
#stemming,
|
||||||
|
#tf_idf
|
||||||
|
#get_tf_idf(stemmed_words,word)
|
||||||
|
#post = Posting()
|
||||||
|
|
||||||
|
print(stemmed_words)
|
||||||
|
#
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
11
posting.py
11
posting.py
@ -1,12 +1,9 @@
|
|||||||
#Posting class for indexer, will probably be more complex as we keep adding crap to it
|
#Posting class for indexer, will probably be more complex as we keep adding crap to it
|
||||||
|
|
||||||
class Posting():
|
class Posting():
|
||||||
def __init(self,source):
|
def __init(self,url,tf_idf):
|
||||||
self.source = source
|
self.url = url
|
||||||
self.tf_idf = get_tf_idf()
|
self.tf_idf = tf_idf
|
||||||
|
|
||||||
def get_tf_idf(self):
|
|
||||||
#Do tf_idf here
|
|
||||||
|
|
||||||
def comparator(self):
|
def comparator(self):
|
||||||
#Some custom comparator for sorting postings later
|
#Some custom comparator for sorting postings later
|
@ -2,4 +2,5 @@ nltk
|
|||||||
re
|
re
|
||||||
shelve
|
shelve
|
||||||
json
|
json
|
||||||
beautifulsoup4
|
beautifulsoup4
|
||||||
|
sklearn
|
Loading…
Reference in New Issue
Block a user