First pushed, setup all the stuff we need, no launcher yet. So test your code in another place for now, because they are all codepended on each others ...

This commit is contained in:
Hieuhuy Pham 2022-05-04 12:22:20 -07:00
parent 5875ac0e79
commit 1fb8fef7a3
7 changed files with 143 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/data/DEV

0
__init__.py Normal file
View File

95
indexer.py Normal file
View File

@ -0,0 +1,95 @@
#We have to import the files
#Split the indexer into 4 parts
#Alphanumeric sequences into the dataset
#Stemming
#Text in bold, headings and other titles should be treated as more important
#Posting structure > tf-idf score. Name/id the token was found in . So hashmap.
#We need shelves to hold the data.
#Posting ---> Source of file, tf-idf score. #for now we will only use these two, as we get more complex posting will be change accordingly
#Data input
import json
import os
import shelve
#Data process
from nltk.tokenize import word_tokenize
import re
class Indexer():
def __init__(self,restart,trimming):
#Config stuffs
self.path = "data/DEV/"
self.restart = restart
self.trimming = trimming
#Shelves for index
#https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html
#https://www.irishtimes.com/news/science/how-many-numbers-begin-with-a-1-more-than-30-per-cent-1.4162466
#According to this will be how we split things
#Save #1 = ABCD + (1) ~ 18.3% of words
#Save #2 = EFGHIJK + (2-3)~ 27.1% of words
#Save #3 = LMNOPQ + (4-7) ~ 25.4% of words
#Save #4 = RSTUVWXYZ + (8-9)~ 29.2% of words
#Save #5 = Numbers ???
if os.path.exists("save_1.shelve") and restart:
os.remove("save_1.shelve")
if os.path.exists("save_2.shelve") and restart:
os.remove("save_2.shelve")
if os.path.exists("save_3.shelve") and restart:
os.remove("save_3.shelve")
if os.path.exists("save_4.shelve") and restart:
os.remove("save_4.shelve")
self.save_1 = shelve.open("save_1.shelve")
self.save_2 = shelve.open("save_2.shelve")
self.save_3 = shelve.open("save_3.shelve")
self.save_4 = shelve.open("save_4.shelve")
def save_index(self,word,posting):
wordhash = hash(word) ##Honestly do not know why hashing is even needed, might cause more problems
cur_save = get_save(word)
shelve_list = list()
if wordhash not in cur_save:
shelve_list.append(posting)
cur_save[wordhash] = shelve_list
cur_save.sync()
else:
shelve_list = cur_save[wordhash]
shelve_list.append(posting)
shelve_list.sort(key=lambda x: x.tf_idf, reverse = True)
cur_save.sync()
def get_save_file(self,word):
#return the correct save depending on the starting letter of word
word_lower = word.lower()
if re.match(r"^[a-d1-1].*",word_lower):
return self.save_1
elif re.match(r"^[e-k2-3].*",word_lower):
return self.save_2
elif re.match(r"^[l-q4-7].*",word_lower):
return self.save_3
elif re.match(r"^[r-z8-9].*",word_lower):
return self.save_4
else:
print("You have somehow went beyond the magic")
return None
def get_data(self):
for directory in os.listdir(path):
for files in os.listdir(directory):
#Actual files here

12
posting.py Normal file
View File

@ -0,0 +1,12 @@
#Posting class for indexer, will probably be more complex as we keep adding crap to it
class Posting():
def __init(self,source):
self.source = source
self.tf_idf = get_tf_idf()
def get_tf_idf(self):
#Do tf_idf here
def comparator(self):
#Some custom comparator for sorting postings later

0
requirements.txt Normal file
View File

18
stemmer.py Normal file
View File

@ -0,0 +1,18 @@
#Multiple implementation of stemming here please
class Stemmer():
def __init__(self,mode, data):
#Different type of stemmer = different modes
self.mode = mode
self.data = data
def stem(self):
#Do stuff here
if(self.mode == 0):
#Do stemmer 1
return #stemmed data
#....
def #name of stemmer 1
def #name of stemmer 2

17
test.py Normal file
View File

@ -0,0 +1,17 @@
import re
import os
for i in range(99):
word_lower = chr(i % 26 + 97) + chr(i % 26 + 97 + 1)
print(word_lower)
if re.match(r"^[a-d1-1].*",word_lower):
print("SAVE 1")
elif re.match(r"^[e-k2-3].*",word_lower):
print("SAVE 2")
elif re.match(r"^[l-q4-7].*",word_lower):
print("SAVE 3")
elif re.match(r"^[r-z8-9].*",word_lower):
print("SAVE 4")
path = "data/DEV/"
print(os.listdir(path))