Implemented a starting point for the project. Run indexer.py; it will stop after a single file. A very rudimentary tokenizer is implemented.
This commit is contained in:
parent
1fb8fef7a3
commit
fbb1a1ab2c
25
indexer.py
25
indexer.py
@ -13,6 +13,7 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import shelve
|
import shelve
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
#Data process
|
#Data process
|
||||||
@ -86,10 +87,30 @@ class Indexer():
|
|||||||
|
|
||||||
|
|
||||||
def get_data(self):
|
def get_data(self):
|
||||||
for directory in os.listdir(path):
|
for directory in os.listdir(self.path):
|
||||||
for files in os.listdir(directory):
|
for file in os.listdir(self.path + "/" + directory + "/"):
|
||||||
#Actual files here
|
#Actual files here
|
||||||
|
#JSON["url"] = url of crawled page, ignore fragments
|
||||||
|
#JSON["content"] = actual HTML
|
||||||
|
#JSON["encoding"] = ENCODING
|
||||||
|
print(file)
|
||||||
|
file_load = open(self.path + "/" + directory + "/"+file)
|
||||||
|
data = json.load(file_load)
|
||||||
|
soup = BeautifulSoup(data["content"],from_encoding=data["encoding"])
|
||||||
|
words = word_tokenize(soup.get_text())
|
||||||
|
for word in words:
|
||||||
|
if word is not "" and word.isalnum():
|
||||||
|
print(word)
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
indexer = Indexer(True,0)
|
||||||
|
indexer.get_data()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
@ -0,0 +1,5 @@
|
|||||||
|
nltk
|
||||||
|
re
|
||||||
|
shelve
|
||||||
|
json
|
||||||
|
beautifulsoup4
|
BIN
save_1.shelve
Normal file
BIN
save_1.shelve
Normal file
Binary file not shown.
BIN
save_2.shelve
Normal file
BIN
save_2.shelve
Normal file
Binary file not shown.
BIN
save_3.shelve
Normal file
BIN
save_3.shelve
Normal file
Binary file not shown.
BIN
save_4.shelve
Normal file
BIN
save_4.shelve
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user