Implemented a starting point for the project. Run indexer.py; it will stop after a single file. A very rudimentary tokenizer is implemented.

This commit is contained in:
inocturnis 2022-05-04 13:26:18 -07:00
parent 1fb8fef7a3
commit fbb1a1ab2c
6 changed files with 28 additions and 2 deletions

View File

@ -13,6 +13,7 @@
import json
import os
import shelve
from bs4 import BeautifulSoup
#Data process
@ -86,10 +87,30 @@ class Indexer():
def get_data(self):
    """Walk every directory under self.path, parse each crawled-page JSON
    file, and print its alphanumeric tokens.

    Each JSON file is expected to contain:
      - "url":      URL of the crawled page (fragments ignored)
      - "content":  the raw HTML of the page
      - "encoding": the character encoding of the content
    NOTE(review): deliberately stops after the first file via exit(1) —
    the commit message says this rudimentary version processes one file only.
    """
    for directory in os.listdir(self.path):
        dir_path = os.path.join(self.path, directory)
        for file in os.listdir(dir_path):
            # Actual files here
            print(file)
            # Context manager ensures the handle is closed; the original
            # opened the file and never closed it.
            with open(os.path.join(dir_path, file)) as file_load:
                data = json.load(file_load)
            # Explicit parser avoids bs4's "no parser was explicitly
            # specified" warning and parser-dependent output across machines.
            soup = BeautifulSoup(data["content"], "html.parser",
                                 from_encoding=data["encoding"])
            words = word_tokenize(soup.get_text())
            for word in words:
                # `!=` replaces the original `is not ""`: identity
                # comparison against a literal is incorrect semantics
                # (and a SyntaxWarning on modern CPython).
                if word != "" and word.isalnum():
                    print(word)
            # Rudimentary version: stop after processing one file.
            exit(1)
def main():
    """Entry point: construct an Indexer and kick off data extraction."""
    idx = Indexer(True, 0)
    idx.get_data()


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,5 @@
nltk
re
shelve
json
beautifulsoup4

BIN
save_1.shelve Normal file

Binary file not shown.

BIN
save_2.shelve Normal file

Binary file not shown.

BIN
save_3.shelve Normal file

Binary file not shown.

BIN
save_4.shelve Normal file

Binary file not shown.