class Indexer():
    """Builds an inverted index over the crawled corpus under ``data/DEV/``.

    The index is partitioned across four shelve files keyed by the first
    character of each token, so that no single shelf grows too large.
    Partition boundaries were chosen from published letter / leading-digit
    frequency tables:
        save_1: a-d and 1    (~18.3% of words)
        save_2: e-k and 2-3  (~27.1% of words)
        save_3: l-q and 4-7  (~25.4% of words)
        save_4: r-z and 8-9  (~29.2% of words)
    NOTE(review): words starting with '0' or punctuation fall into no shelf
    (get_save_file returns None) — confirm whether they should be indexed.
    """

    def __init__(self, restart, trimming):
        """Open (optionally recreating) the four index shelves.

        restart  -- if truthy, delete any existing shelf files first.
        trimming -- stored for later use; not read anywhere in this class yet.
        """
        self.path = "data/DEV/"
        self.restart = restart
        self.trimming = trimming

        shelf_names = ("save_1.shelve", "save_2.shelve",
                       "save_3.shelve", "save_4.shelve")
        if restart:
            for name in shelf_names:
                if os.path.exists(name):
                    os.remove(name)

        self.save_1 = shelve.open("save_1.shelve")
        self.save_2 = shelve.open("save_2.shelve")
        self.save_3 = shelve.open("save_3.shelve")
        self.save_4 = shelve.open("save_4.shelve")

    def save_index(self, word, posting):
        """Append *posting* to the posting list for *word* in the right shelf.

        Posting lists are kept sorted by descending tf_idf.  The word itself
        is the shelf key: shelve (dbm) keys must be strings, so the original
        ``hash(word)`` int key raised at runtime — and str hashes are salted
        per process, so they would not have been stable across runs anyway.
        """
        cur_save = self.get_save_file(word)
        if cur_save is None:
            # Word starts with an unmapped character (e.g. '0'); skip it.
            return

        if word not in cur_save:
            cur_save[word] = [posting]
        else:
            postings = cur_save[word]
            postings.append(posting)
            postings.sort(key=lambda p: p.tf_idf, reverse=True)
            # Re-assign: without writeback=True, mutating the fetched list
            # does NOT persist — the shelf only sees explicit stores.
            cur_save[word] = postings
        cur_save.sync()

    def get_save_file(self, word):
        """Return the shelf for *word*'s first character, or None if unmapped."""
        word_lower = word.lower()

        if re.match(r"^[a-d1]", word_lower):
            return self.save_1
        elif re.match(r"^[e-k2-3]", word_lower):
            return self.save_2
        elif re.match(r"^[l-q4-7]", word_lower):
            return self.save_3
        elif re.match(r"^[r-z8-9]", word_lower):
            return self.save_4

        print("You have somehow went beyond the magic")
        return None

    def get_data(self):
        """Yield the full path of every corpus file under ``self.path``.

        The corpus layout is one sub-directory per site, each containing the
        raw document files.  The original body referenced a bare ``path``
        (NameError) and never joined directory names onto it.
        """
        for directory in os.listdir(self.path):
            dir_path = os.path.join(self.path, directory)
            for file_name in os.listdir(dir_path):
                yield os.path.join(dir_path, file_name)
class Posting():
    """One entry in a posting list: the source document and its tf-idf score.

    Will grow richer fields (positions, field weights) as the indexer evolves.
    """

    def __init__(self, source):
        # Original declared ``def __init`` (missing trailing underscores), so
        # Posting("x") raised TypeError; it also called get_tf_idf() without
        # ``self.`` (NameError).  Both fixed; the public shape is unchanged.
        self.source = source
        self.tf_idf = self.get_tf_idf()

    def get_tf_idf(self):
        """Compute this posting's tf-idf score.

        TODO(review): real tf-idf needs term/document frequencies that are
        not wired in yet; returns a neutral 0.0 placeholder until then.
        """
        return 0.0

    def comparator(self):
        """Sort key for ordering postings (use with ``key=``, reverse=True
        for highest-score-first, matching Indexer.save_index)."""
        return self.tf_idf


class Stemmer():
    """Container for multiple stemming strategies, selected by ``mode``."""

    def __init__(self, mode, data):
        # mode -- integer selecting the stemming algorithm (0 = naive suffix
        #         stripping); other algorithms to be added later.
        # data -- whitespace-separated text to stem.
        self.mode = mode
        self.data = data

    def stem(self):
        """Stem ``self.data`` with the algorithm chosen by ``self.mode``.

        Raises ValueError for an unknown mode instead of silently returning
        None as the original stub did.
        """
        if self.mode == 0:
            return self._suffix_stem()
        raise ValueError(f"unknown stemmer mode: {self.mode}")

    def _suffix_stem(self):
        """Naive stemmer: strip one common English suffix per word.

        Placeholder until a real (e.g. Porter) stemmer is plugged in; only
        strips when enough of the word remains to stay meaningful.
        """
        suffixes = ("ing", "ed", "ly", "es", "s")
        stemmed = []
        for word in self.data.split():
            for suffix in suffixes:
                if word.endswith(suffix) and len(word) > len(suffix) + 2:
                    word = word[: -len(suffix)]
                    break
            stemmed.append(word)
        return " ".join(stemmed)
def classify(word):
    """Return which save file *word* maps to ("SAVE 1".."SAVE 4"), or None.

    Mirrors Indexer.get_save_file's partitioning (a-d/1, e-k/2-3, l-q/4-7,
    r-z/8-9) so the split can be smoke-tested without opening any shelves.
    """
    word_lower = word.lower()
    if re.match(r"^[a-d1]", word_lower):
        return "SAVE 1"
    elif re.match(r"^[e-k2-3]", word_lower):
        return "SAVE 2"
    elif re.match(r"^[l-q4-7]", word_lower):
        return "SAVE 3"
    elif re.match(r"^[r-z8-9]", word_lower):
        return "SAVE 4"
    return None


if __name__ == "__main__":
    # Smoke-test the partitioning over generated two-letter samples.
    for i in range(99):
        word_lower = chr(i % 26 + 97) + chr(i % 26 + 97 + 1)
        print(word_lower)
        label = classify(word_lower)
        if label is not None:
            print(label)

    # Guard the listing: the original crashed with FileNotFoundError when
    # run outside the repo checkout (data/DEV is git-ignored).
    path = "data/DEV/"
    if os.path.isdir(path):
        print(os.listdir(path))
    else:
        print(f"missing data directory: {path}")