First push: set up all the scaffolding we need, no launcher yet. Test your code somewhere else for now, because these modules all depend on each other ...
This commit is contained in:
parent
5875ac0e79
commit
1fb8fef7a3
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
/data/DEV
0
__init__.py
Normal file
95
indexer.py
Normal file
@@ -0,0 +1,95 @@
#We have to import the files

#Split the indexer into 4 parts:
#Alphanumeric sequences in the dataset
#Stemming
#Text in bold, headings and other titles should be treated as more important

#Posting structure -> tf-idf score plus the name/id of the file the token was found in, so a hashmap.
#We need shelves to hold the data.

#Posting ---> source file, tf-idf score. For now we will only use these two; as we get more complex, Posting will change accordingly.

#Data input
import json
import os
import shelve

#Data process
from nltk.tokenize import word_tokenize
import re


class Indexer():
    def __init__(self, restart, trimming):
        #Config stuff
        self.path = "data/DEV/"
        self.restart = restart
        self.trimming = trimming

        #Shelves for the index
        #https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html
        #https://www.irishtimes.com/news/science/how-many-numbers-begin-with-a-1-more-than-30-per-cent-1.4162466
        #According to these, this is how we split things:
        #Save #1 = ABCD + (1) ~ 18.3% of words
        #Save #2 = EFGHIJK + (2-3) ~ 27.1% of words
        #Save #3 = LMNOPQ + (4-7) ~ 25.4% of words
        #Save #4 = RSTUVWXYZ + (8-9) ~ 29.2% of words
        #Save #5 = Numbers ???
        if os.path.exists("save_1.shelve") and restart:
            os.remove("save_1.shelve")
        if os.path.exists("save_2.shelve") and restart:
            os.remove("save_2.shelve")
        if os.path.exists("save_3.shelve") and restart:
            os.remove("save_3.shelve")
        if os.path.exists("save_4.shelve") and restart:
            os.remove("save_4.shelve")

        self.save_1 = shelve.open("save_1.shelve")
        self.save_2 = shelve.open("save_2.shelve")
        self.save_3 = shelve.open("save_3.shelve")
        self.save_4 = shelve.open("save_4.shelve")

    def save_index(self, word, posting):
        #shelve keys must be strings, hence str(); honestly not sure the hashing is even needed, it might cause more problems
        wordhash = str(hash(word))
        cur_save = self.get_save_file(word)
        shelve_list = list()

        if wordhash not in cur_save:
            shelve_list.append(posting)
            cur_save[wordhash] = shelve_list
            cur_save.sync()
        else:
            shelve_list = cur_save[wordhash]
            shelve_list.append(posting)
            shelve_list.sort(key=lambda x: x.tf_idf, reverse=True)
            cur_save[wordhash] = shelve_list  #write back, or the shelve never sees the update
            cur_save.sync()

    def get_save_file(self, word):
        #return the correct save depending on the starting character of word
        word_lower = word.lower()

        if re.match(r"^[a-d1].*", word_lower):
            return self.save_1
        elif re.match(r"^[e-k2-3].*", word_lower):
            return self.save_2
        elif re.match(r"^[l-q4-7].*", word_lower):
            return self.save_3
        elif re.match(r"^[r-z8-9].*", word_lower):
            return self.save_4
        else:
            #note: tokens starting with "0" or punctuation land here for now
            print("You have somehow gone beyond the magic")
            return None

    def get_data(self):
        for directory in os.listdir(self.path):
            for file in os.listdir(os.path.join(self.path, directory)):
                #Actual files here
                pass
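get_data is still a stub. A minimal sketch of what the walk could grow into, assuming one JSON object per file with "url" and "content" fields — the corpus schema is an assumption, it isn't defined anywhere in this commit:

#Hypothetical sketch only: the "url"/"content" JSON fields are assumptions.
import json
import os
from nltk.tokenize import word_tokenize

def iter_documents(path="data/DEV/"):
    #Walk data/DEV/<directory>/<file> and yield (source, tokens) pairs
    for directory in os.listdir(path):
        dir_path = os.path.join(path, directory)
        for filename in os.listdir(dir_path):
            with open(os.path.join(dir_path, filename)) as f:
                doc = json.load(f)  #assumed: one JSON document per file
            yield doc["url"], word_tokenize(doc["content"])  #assumed fields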
12
posting.py
Normal file
@@ -0,0 +1,12 @@
#Posting class for the indexer; will probably get more complex as we keep adding things to it

class Posting():

    def __init__(self, source):
        self.source = source
        self.tf_idf = self.get_tf_idf()

    def get_tf_idf(self):
        #Do tf-idf here
        pass

    def comparator(self):
        #Some custom comparator for sorting postings later
        pass
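get_tf_idf is still a stub; for reference, the standard tf-idf it could compute, as a free function. The parameter names (term_count, doc_length, total_docs, docs_with_term) are illustrative, not part of this commit:

import math

def tf_idf(term_count, doc_length, total_docs, docs_with_term):
    #tf: how often the term occurs, relative to the document's length
    tf = term_count / doc_length
    #idf: down-weight terms that occur in many documents
    idf = math.log(total_docs / docs_with_term)
    return tf * idf

#e.g. a term appearing 3 times in a 100-token doc, in 10 of 1000 docs:
#tf_idf(3, 100, 1000, 10) == 0.03 * log(100) ~= 0.138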
0
requirements.txt
Normal file
18
stemmer.py
Normal file
@@ -0,0 +1,18 @@
#Multiple implementations of stemming here, please

class Stemmer():

    def __init__(self, mode, data):
        #Different type of stemmer = different mode
        self.mode = mode
        self.data = data

    def stem(self):
        #Do stuff here
        if self.mode == 0:
            #Do stemmer 1
            return None  #stemmed data
        #....

    #placeholder: def <name of stemmer 1>(self): ...

    #placeholder: def <name of stemmer 2>(self): ...
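The stemmer modes are unnamed so far. One plausible way to fill them in is NLTK's built-in stemmers, since the project already uses nltk for tokenizing; a sketch assuming mode 0 = Porter and mode 1 = Snowball (that mapping is an assumption, not a decision recorded in this commit):

from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

class Stemmer():

    def __init__(self, mode, data):
        self.mode = mode
        self.data = data  #assumed: a list of already-tokenized words

    def stem(self):
        #mode 0 = Porter, mode 1 = Snowball; the mapping is an assumption
        if self.mode == 0:
            stemmer = PorterStemmer()
        else:
            stemmer = SnowballStemmer("english")
        return [stemmer.stem(token) for token in self.data]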
17
test.py
Normal file
@@ -0,0 +1,17 @@
import re
import os

for i in range(99):
    word_lower = chr(i % 26 + 97) + chr(i % 26 + 97 + 1)
    print(word_lower)
    if re.match(r"^[a-d1].*", word_lower):
        print("SAVE 1")
    elif re.match(r"^[e-k2-3].*", word_lower):
        print("SAVE 2")
    elif re.match(r"^[l-q4-7].*", word_lower):
        print("SAVE 3")
    elif re.match(r"^[r-z8-9].*", word_lower):
        print("SAVE 4")

path = "data/DEV/"
print(os.listdir(path))
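One thing the loop above never exercises is digits. A quick check of which first characters fall through all four patterns (restricted to a-z and 0-9, per the save scheme):

import re

patterns = [r"^[a-d1]", r"^[e-k2-3]", r"^[l-q4-7]", r"^[r-z8-9]"]
unmatched = [c for c in "abcdefghijklmnopqrstuvwxyz0123456789"
             if not any(re.match(p, c) for p in patterns)]
print(unmatched)  #['0'] -- tokens starting with "0" have no save yet

This matches the open question in indexer.py about a fifth save for numbers.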