# Patch summary (review preamble; the patch text below is unchanged).
#
# NOTE: this patch has been flattened onto one line, so the original line
# breaks, blank context lines and Python indentation are not recoverable here.
# Statement nesting mentioned below is therefore inferred from textual order
# only -- confirm against the real commit.
#
# indexer.py
#   * Adds `from bs4 import BeautifulSoup` after the existing imports.
#   * Indexer.get_data: the directory walk now lists `self.path` and
#     `self.path + "/" + directory + "/"` instead of the bare names `path` and
#     `directory`. For each file it prints the name, opens it, parses it with
#     json.load, builds a BeautifulSoup from data["content"], tokenizes
#     soup.get_text() with word_tokenize, and prints every token that passes
#     `word is not "" and word.isalnum()`. An `exit(1)` call follows.
#   * Adds main(), which constructs Indexer(True,0) and calls get_data(), plus
#     an `if __name__ == "__main__":` guard.
#
# requirements.txt
#   * Adds five entries: nltk, re, shelve, json, beautifulsoup4.
#
# save_1.shelve .. save_4.shelve
#   * Four new binary files, all with the same blob id f03fe98.
#
# NOTE(review): `word is not ""` is an identity test against a literal; value
#   comparison (`word != ""`, or plain truthiness) is presumably what is meant.
#   `"".isalnum()` is already False, so the first test looks redundant.
# NOTE(review): the handle from `open(...)` has no visible close()/`with` in
#   this hunk -- confirm it is not leaked once per crawled file.
# NOTE(review): BeautifulSoup is called without an explicit parser argument,
#   and `from_encoding` is passed together with data["content"], which is
#   presumably already a decoded str after json.load -- confirm the argument
#   has any effect.
# NOTE(review): `exit(1)` appears to be debugging scaffolding that stops the
#   process with a failure status during get_data -- confirm before merging.
# NOTE(review): `word_tokenize` is not imported in the visible hunks;
#   presumably `from nltk.tokenize import word_tokenize` exists elsewhere in
#   indexer.py -- verify.
# NOTE(review): re, shelve and json are Python standard-library modules and
#   normally do not belong in requirements.txt; only nltk and beautifulsoup4
#   look like real third-party dependencies.
# NOTE(review): the committed *.shelve files look like generated runtime
#   artifacts -- confirm they are meant to be tracked.
#
diff --git a/indexer.py b/indexer.py index 5240574..b5f2f96 100644 --- a/indexer.py +++ b/indexer.py @@ -13,6 +13,7 @@ import json import os import shelve +from bs4 import BeautifulSoup #Data process @@ -86,10 +87,30 @@ class Indexer(): def get_data(self): - for directory in os.listdir(path): - for files in os.listdir(directory): + for directory in os.listdir(self.path): + for file in os.listdir(self.path + "/" + directory + "/"): #Actual files here + #JSON["url"] = url of crawled page, ignore fragments + #JSON["content"] = actual HTML + #JSON["encoding"] = ENCODING + print(file) + file_load = open(self.path + "/" + directory + "/"+file) + data = json.load(file_load) + soup = BeautifulSoup(data["content"],from_encoding=data["encoding"]) + words = word_tokenize(soup.get_text()) + for word in words: + if word is not "" and word.isalnum(): + print(word) + exit(1) + + +def main(): + indexer = Indexer(True,0) + indexer.get_data() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index e69de29..8721c37 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1,5 @@ +nltk +re +shelve +json +beautifulsoup4 \ No newline at end of file diff --git a/save_1.shelve b/save_1.shelve new file mode 100644 index 0000000..f03fe98 Binary files /dev/null and b/save_1.shelve differ diff --git a/save_2.shelve b/save_2.shelve new file mode 100644 index 0000000..f03fe98 Binary files /dev/null and b/save_2.shelve differ diff --git a/save_3.shelve b/save_3.shelve new file mode 100644 index 0000000..f03fe98 Binary files /dev/null and b/save_3.shelve differ diff --git a/save_4.shelve b/save_4.shelve new file mode 100644 index 0000000..f03fe98 Binary files /dev/null and b/save_4.shelve differ