From fbb1a1ab2c51ca5d2b68658f9d7c2b2466704632 Mon Sep 17 00:00:00 2001 From: inocturnis Date: Wed, 4 May 2022 13:26:18 -0700 Subject: [PATCH] Implemented a starting point for the project: run indexer.py, it will stop after 1 single file. A very rudimentary tokenizer is implemented. --- indexer.py | 25 +++++++++++++++++++++++-- requirements.txt | 5 +++++ save_1.shelve | Bin 0 -> 16384 bytes save_2.shelve | Bin 0 -> 16384 bytes save_3.shelve | Bin 0 -> 16384 bytes save_4.shelve | Bin 0 -> 16384 bytes 6 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 save_1.shelve create mode 100644 save_2.shelve create mode 100644 save_3.shelve create mode 100644 save_4.shelve diff --git a/indexer.py b/indexer.py index 5240574..b5f2f96 100644 --- a/indexer.py +++ b/indexer.py @@ -13,6 +13,7 @@ import json import os import shelve +from bs4 import BeautifulSoup #Data process @@ -86,10 +87,30 @@ class Indexer(): def get_data(self): - for directory in os.listdir(path): - for files in os.listdir(directory): + for directory in os.listdir(self.path): + for file in os.listdir(self.path + "/" + directory + "/"): #Actual files here + #JSON["url"] = url of crawled page, ignore fragments + #JSON["content"] = actual HTML + #JSON["encoding"] = ENCODING + print(file) + file_load = open(self.path + "/" + directory + "/"+file) + data = json.load(file_load) + soup = BeautifulSoup(data["content"],from_encoding=data["encoding"]) + words = word_tokenize(soup.get_text()) + for word in words: + if word is not "" and word.isalnum(): + print(word) + exit(1) + + +def main(): + indexer = Indexer(True,0) + indexer.get_data() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index e69de29..8721c37 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1,5 @@ +nltk +re +shelve +json +beautifulsoup4 \ No newline at end of file diff --git a/save_1.shelve b/save_1.shelve new file mode 100644 index 
0000000000000000000000000000000000000000..f03fe98e2b3029b3d1103880728d018993291465 GIT binary patch literal 16384 zcmeI%!3l#v5Czc5(VLJ4EWr}&BE4Ba+OZ))-K^q3yyg&j!tTHxL>_$blTlfqS3_^nf1F z1A0IY=m9-&vmSWVTYdij?~jAu%w>D7-t4@paNv;dhdjuGJY)fR$Qt@V9^@el$V1l9 h5Aq-nSwJ4XHJlJ2K!5-N0t5&UAV7cs0RndxSOLec*~S0> literal 0 HcmV?d00001 diff --git a/save_2.shelve b/save_2.shelve new file mode 100644 index 0000000000000000000000000000000000000000..f03fe98e2b3029b3d1103880728d018993291465 GIT binary patch literal 16384 zcmeI%!3l#v5Czc5(VLJ4EWr}&BE4Ba+OZ))-K^q3yyg&j!tTHxL>_$blTlfqS3_^nf1F z1A0IY=m9-&vmSWVTYdij?~jAu%w>D7-t4@paNv;dhdjuGJY)fR$Qt@V9^@el$V1l9 h5Aq-nSwJ4XHJlJ2K!5-N0t5&UAV7cs0RndxSOLec*~S0> literal 0 HcmV?d00001 diff --git a/save_3.shelve b/save_3.shelve new file mode 100644 index 0000000000000000000000000000000000000000..f03fe98e2b3029b3d1103880728d018993291465 GIT binary patch literal 16384 zcmeI%!3l#v5Czc5(VLJ4EWr}&BE4Ba+OZ))-K^q3yyg&j!tTHxL>_$blTlfqS3_^nf1F z1A0IY=m9-&vmSWVTYdij?~jAu%w>D7-t4@paNv;dhdjuGJY)fR$Qt@V9^@el$V1l9 h5Aq-nSwJ4XHJlJ2K!5-N0t5&UAV7cs0RndxSOLec*~S0> literal 0 HcmV?d00001 diff --git a/save_4.shelve b/save_4.shelve new file mode 100644 index 0000000000000000000000000000000000000000..f03fe98e2b3029b3d1103880728d018993291465 GIT binary patch literal 16384 zcmeI%!3l#v5Czc5(VLJ4EWr}&BE4Ba+OZ))-K^q3yyg&j!tTHxL>_$blTlfqS3_^nf1F z1A0IY=m9-&vmSWVTYdij?~jAu%w>D7-t4@paNv;dhdjuGJY)fR$Qt@V9^@el$V1l9 h5Aq-nSwJ4XHJlJ2K!5-N0t5&UAV7cs0RndxSOLec*~S0> literal 0 HcmV?d00001