Implemented a starting point for the project. Run indexer.py; it will stop after a single file. A very rudimentary tokenizer is implemented.

This commit is contained in:
inocturnis 2022-05-04 13:26:18 -07:00
parent 1fb8fef7a3
commit fbb1a1ab2c
6 changed files with 28 additions and 2 deletions

View File

@ -13,6 +13,7 @@
import json
import os
import shelve
from bs4 import BeautifulSoup
#Data process
@ -86,10 +87,30 @@ class Indexer():
def get_data(self):
    """Walk every directory under self.path, parse each crawled-page JSON
    file, and print its alphanumeric tokens.

    Each JSON file is expected to contain:
      - "url":      URL of the crawled page (fragments ignored)
      - "content":  the raw HTML of the page
      - "encoding": the character encoding of the content
    NOTE(review): deliberately stops after the first file via exit(1) —
    the commit message says this rudimentary version processes one file only.
    """
    for directory in os.listdir(self.path):
        dir_path = os.path.join(self.path, directory)
        for file in os.listdir(dir_path):
            # Actual files here
            print(file)
            # Context manager ensures the handle is closed; the original
            # opened the file and never closed it.
            with open(os.path.join(dir_path, file)) as file_load:
                data = json.load(file_load)
            # Explicit parser avoids bs4's "no parser was explicitly
            # specified" warning and parser-dependent output across machines.
            soup = BeautifulSoup(data["content"], "html.parser",
                                 from_encoding=data["encoding"])
            words = word_tokenize(soup.get_text())
            for word in words:
                # `!=` replaces the original `is not ""`: identity
                # comparison against a literal is incorrect semantics
                # (and a SyntaxWarning on modern CPython).
                if word != "" and word.isalnum():
                    print(word)
            # Rudimentary version: stop after processing one file.
            exit(1)
def main():
    """Entry point: construct an Indexer and kick off data extraction."""
    idx = Indexer(True, 0)
    idx.get_data()


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,5 @@
nltk
re
shelve
json
beautifulsoup4

BIN
save_1.shelve Normal file

Binary file not shown.

BIN
save_2.shelve Normal file

Binary file not shown.

BIN
save_3.shelve Normal file

Binary file not shown.

BIN
save_4.shelve Normal file

Binary file not shown.