class Indexer():
    """Builds an inverted index over the crawled corpus under ``data/DEV/``.

    The index is partitioned across four shelve files keyed by the first
    character of each token, so that no single shelf grows too large.
    Partition boundaries were chosen from published letter / leading-digit
    frequency tables:
        save_1: a-d and 1    (~18.3% of words)
        save_2: e-k and 2-3  (~27.1% of words)
        save_3: l-q and 4-7  (~25.4% of words)
        save_4: r-z and 8-9  (~29.2% of words)
    NOTE(review): words starting with '0' or punctuation fall into no shelf
    (get_save_file returns None) — confirm whether they should be indexed.
    """

    def __init__(self, restart, trimming):
        """Open (optionally recreating) the four index shelves.

        restart  -- if truthy, delete any existing shelf files first.
        trimming -- stored for later use; not read anywhere in this class yet.
        """
        self.path = "data/DEV/"
        self.restart = restart
        self.trimming = trimming

        shelf_names = ("save_1.shelve", "save_2.shelve",
                       "save_3.shelve", "save_4.shelve")
        if restart:
            for name in shelf_names:
                if os.path.exists(name):
                    os.remove(name)

        self.save_1 = shelve.open("save_1.shelve")
        self.save_2 = shelve.open("save_2.shelve")
        self.save_3 = shelve.open("save_3.shelve")
        self.save_4 = shelve.open("save_4.shelve")

    def save_index(self, word, posting):
        """Append *posting* to the posting list for *word* in the right shelf.

        Posting lists are kept sorted by descending tf_idf.  The word itself
        is the shelf key: shelve (dbm) keys must be strings, so the original
        ``hash(word)`` int key raised at runtime — and str hashes are salted
        per process, so they would not have been stable across runs anyway.
        """
        cur_save = self.get_save_file(word)
        if cur_save is None:
            # Word starts with an unmapped character (e.g. '0'); skip it.
            return

        if word not in cur_save:
            cur_save[word] = [posting]
        else:
            postings = cur_save[word]
            postings.append(posting)
            postings.sort(key=lambda p: p.tf_idf, reverse=True)
            # Re-assign: without writeback=True, mutating the fetched list
            # does NOT persist — the shelf only sees explicit stores.
            cur_save[word] = postings
        cur_save.sync()

    def get_save_file(self, word):
        """Return the shelf for *word*'s first character, or None if unmapped."""
        word_lower = word.lower()

        if re.match(r"^[a-d1]", word_lower):
            return self.save_1
        elif re.match(r"^[e-k2-3]", word_lower):
            return self.save_2
        elif re.match(r"^[l-q4-7]", word_lower):
            return self.save_3
        elif re.match(r"^[r-z8-9]", word_lower):
            return self.save_4

        print("You have somehow went beyond the magic")
        return None

    def get_data(self):
        """Yield the full path of every corpus file under ``self.path``.

        The corpus layout is one sub-directory per site, each containing the
        raw document files.  The original body referenced a bare ``path``
        (NameError) and never joined directory names onto it.
        """
        for directory in os.listdir(self.path):
            dir_path = os.path.join(self.path, directory)
            for file_name in os.listdir(dir_path):
                yield os.path.join(dir_path, file_name)
class Posting():
    """One entry in a posting list: the source document and its tf-idf score.

    Will grow richer fields (positions, field weights) as the indexer evolves.
    """

    def __init__(self, source):
        # Original declared ``def __init`` (missing trailing underscores), so
        # Posting("x") raised TypeError; it also called get_tf_idf() without
        # ``self.`` (NameError).  Both fixed; the public shape is unchanged.
        self.source = source
        self.tf_idf = self.get_tf_idf()

    def get_tf_idf(self):
        """Compute this posting's tf-idf score.

        TODO(review): real tf-idf needs term/document frequencies that are
        not wired in yet; returns a neutral 0.0 placeholder until then.
        """
        return 0.0

    def comparator(self):
        """Sort key for ordering postings (use with ``key=``, reverse=True
        for highest-score-first, matching Indexer.save_index)."""
        return self.tf_idf


class Stemmer():
    """Container for multiple stemming strategies, selected by ``mode``."""

    def __init__(self, mode, data):
        # mode -- integer selecting the stemming algorithm (0 = naive suffix
        #         stripping); other algorithms to be added later.
        # data -- whitespace-separated text to stem.
        self.mode = mode
        self.data = data

    def stem(self):
        """Stem ``self.data`` with the algorithm chosen by ``self.mode``.

        Raises ValueError for an unknown mode instead of silently returning
        None as the original stub did.
        """
        if self.mode == 0:
            return self._suffix_stem()
        raise ValueError(f"unknown stemmer mode: {self.mode}")

    def _suffix_stem(self):
        """Naive stemmer: strip one common English suffix per word.

        Placeholder until a real (e.g. Porter) stemmer is plugged in; only
        strips when enough of the word remains to stay meaningful.
        """
        suffixes = ("ing", "ed", "ly", "es", "s")
        stemmed = []
        for word in self.data.split():
            for suffix in suffixes:
                if word.endswith(suffix) and len(word) > len(suffix) + 2:
                    word = word[: -len(suffix)]
                    break
            stemmed.append(word)
        return " ".join(stemmed)
def classify(word):
    """Return which save file *word* maps to ("SAVE 1".."SAVE 4"), or None.

    Mirrors Indexer.get_save_file's partitioning (a-d/1, e-k/2-3, l-q/4-7,
    r-z/8-9) so the split can be smoke-tested without opening any shelves.
    """
    word_lower = word.lower()
    if re.match(r"^[a-d1]", word_lower):
        return "SAVE 1"
    elif re.match(r"^[e-k2-3]", word_lower):
        return "SAVE 2"
    elif re.match(r"^[l-q4-7]", word_lower):
        return "SAVE 3"
    elif re.match(r"^[r-z8-9]", word_lower):
        return "SAVE 4"
    return None


if __name__ == "__main__":
    # Smoke-test the partitioning over generated two-letter samples.
    for i in range(99):
        word_lower = chr(i % 26 + 97) + chr(i % 26 + 97 + 1)
        print(word_lower)
        label = classify(word_lower)
        if label is not None:
            print(label)

    # Guard the listing: the original crashed with FileNotFoundError when
    # run outside the repo checkout (data/DEV is git-ignored).
    path = "data/DEV/"
    if os.path.isdir(path):
        print(os.listdir(path))
    else:
        print(f"missing data directory: {path}")