Implemented a starting point for the project. Run indexer.py; it will stop after a single file. A very rudimentary tokenizer is implemented.
This commit is contained in:
parent
1fb8fef7a3
commit
fbb1a1ab2c
25
indexer.py
25
indexer.py
@ -13,6 +13,7 @@
|
||||
import json
|
||||
import os
|
||||
import shelve
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
#Data process
|
||||
@ -86,10 +87,30 @@ class Indexer():
|
||||
|
||||
|
||||
def get_data(self):
    """Walk the crawl output under ``self.path`` and tokenize each page.

    Directory layout: ``self.path/<directory>/<file>`` where every file is a
    JSON document produced by the crawler:
      JSON["url"]      = url of crawled page, ignore fragments
      JSON["content"]  = actual HTML
      JSON["encoding"] = ENCODING

    Deliberately stops (``exit(1)``) after processing the first file — this
    is the documented rudimentary starting point, not a bug.
    """
    for directory in os.listdir(self.path):
        for file_name in os.listdir(self.path + "/" + directory + "/"):
            # Actual files here
            print(file_name)
            # 'with' guarantees the handle is closed (the original leaked it).
            with open(self.path + "/" + directory + "/" + file_name) as file_load:
                data = json.load(file_load)
            soup = BeautifulSoup(data["content"], from_encoding=data["encoding"])
            words = word_tokenize(soup.get_text())
            for word in words:
                # BUG FIX: original tested `word is not ""` — an identity
                # comparison against a literal, which is both a
                # SyntaxWarning and semantically wrong. Use != instead.
                # (Belt-and-braces: "".isalnum() is False anyway.)
                if word != "" and word.isalnum():
                    print(word)
            # Intentional early stop after one file (see commit message).
            exit(1)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def main():
    """Entry point: construct an Indexer and kick off data extraction."""
    idx = Indexer(True, 0)
    idx.get_data()


if __name__ == "__main__":
    main()
|
@ -0,0 +1,5 @@
|
||||
nltk
beautifulsoup4
# NOTE: re, shelve, and json were removed from this list — they are Python
# standard-library modules and cannot be installed by pip; listing them
# breaks `pip install -r requirements.txt`.
|
BIN
save_1.shelve
Normal file
BIN
save_1.shelve
Normal file
Binary file not shown.
BIN
save_2.shelve
Normal file
BIN
save_2.shelve
Normal file
Binary file not shown.
BIN
save_3.shelve
Normal file
BIN
save_3.shelve
Normal file
Binary file not shown.
BIN
save_4.shelve
Normal file
BIN
save_4.shelve
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user