First push: set up all the stuff we need; no launcher yet. For now, test your code somewhere else, because the modules are all codependent on each other ...
This commit is contained in:
		
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | |||||||
|  | /data/DEV | ||||||
							
								
								
									
										0
									
								
								__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										95
									
								
								indexer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										95
									
								
								indexer.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,95 @@ | |||||||
|  | #We have to import the files | ||||||
|  | #Split the indexer into 4 parts | ||||||
|  | #Alphanumeric sequences into the dataset | ||||||
|  | #Stemming | ||||||
|  | #Text in bold, headings and other titles should be treated as more important | ||||||
|  |  | ||||||
|  | #Posting structure > tf-idf score. Name/id the token was found in . So hashmap. | ||||||
|  | #We need shelves to hold the data. | ||||||
|  |  | ||||||
|  | #Posting ---> Source of file, tf-idf score. For now we will only use these two; as things get more complex, the posting will be changed accordingly. | ||||||
|  |  | ||||||
|  | #Data input | ||||||
|  | import json | ||||||
|  | import os | ||||||
|  | import shelve | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #Data process | ||||||
|  | from nltk.tokenize import word_tokenize | ||||||
|  | import re | ||||||
|  |  | ||||||
|  |  | ||||||
class Indexer():
    """Store postings for tokens in four on-disk shelves, split by first character.

    The split follows the authors' letter/digit frequency estimates:
    https://www3.nd.edu/~busiforc/handouts/cryptography/letterfrequencies.html
    https://www.irishtimes.com/news/science/how-many-numbers-begin-with-a-1-more-than-30-per-cent-1.4162466

        save_1 = a-d + 1      (~18.3% of words)
        save_2 = e-k + 2-3    (~27.1% of words)
        save_3 = l-q + 4-7    (~25.4% of words)
        save_4 = r-z + 8-9    (~29.2% of words)

    NOTE(review): tokens starting with "0" or with any non-alphanumeric
    character match none of the four shelves and are not indexed.
    """

    # (attribute name, pattern) pairs tried in order by get_save_file().
    _SAVE_PATTERNS = (
        ("save_1", re.compile(r"^[a-d1-1].*")),
        ("save_2", re.compile(r"^[e-k2-3].*")),
        ("save_3", re.compile(r"^[l-q4-7].*")),
        ("save_4", re.compile(r"^[r-z8-9].*")),
    )

    def __init__(self, restart, trimming):
        """Open the four shelves in the current working directory.

        Args:
            restart: when truthy, every shelf is recreated empty; otherwise
                existing shelves are reused (and created if missing).
            trimming: stored on the instance; not used by this class yet.
        """
        # Config stuffs
        self.path = "data/DEV/"
        self.restart = restart
        self.trimming = trimming

        # flag="n" always creates a new, empty database. Unlike checking
        # os.path.exists("save_N.shelve"), this also works with dbm backends
        # that append their own suffixes to the file name.
        flag = "n" if restart else "c"
        self.save_1 = shelve.open("save_1.shelve", flag=flag)
        self.save_2 = shelve.open("save_2.shelve", flag=flag)
        self.save_3 = shelve.open("save_3.shelve", flag=flag)
        self.save_4 = shelve.open("save_4.shelve", flag=flag)

    def close(self):
        """Flush and close all four shelves."""
        for attr_name, _pattern in self._SAVE_PATTERNS:
            getattr(self, attr_name).close()

    def save_index(self, word, posting):
        """Add ``posting`` to the posting list stored for ``word``.

        The list is kept sorted by ``posting.tf_idf``, highest first. Words
        that cannot be routed to a shelf (see get_save_file) are skipped.

        Args:
            word: the token, used directly as the shelf key.
            posting: a picklable object exposing a ``tf_idf`` attribute.
        """
        cur_save = self.get_save_file(word)
        if cur_save is None:
            return

        # The word itself is the key: shelve keys must be strings, and
        # hash(str) is salted per process, so it cannot be persisted.
        postings = cur_save.get(word, [])
        postings.append(posting)
        if len(postings) > 1:
            postings.sort(key=lambda x: x.tf_idf, reverse=True)

        # Without writeback=True a shelf hands out copies, so the updated
        # list has to be assigned back or the new posting is lost.
        cur_save[word] = postings
        cur_save.sync()

    def get_save_file(self, word):
        """Return the shelf responsible for ``word`` based on its first character.

        Matching is case-insensitive. Returns None (after printing a notice)
        when the word starts with a character that no shelf covers.
        """
        word_lower = word.lower()

        for attr_name, pattern in self._SAVE_PATTERNS:
            if pattern.match(word_lower):
                return getattr(self, attr_name)

        print("You have somehow went beyond the magic")
        return None

    def get_data(self):
        """Return the paths of all files found one directory level below ``self.path``.

        Entries directly under ``self.path`` that are not directories are
        ignored. Paths are returned in sorted order.

        TODO: parsing of the actual files is not implemented yet.
        """
        file_paths = []
        for directory in sorted(os.listdir(self.path)):
            # os.listdir returns bare names; join them onto the root.
            dir_path = os.path.join(self.path, directory)
            if not os.path.isdir(dir_path):
                continue
            for file_name in sorted(os.listdir(dir_path)):
                file_paths.append(os.path.join(dir_path, file_name))
        return file_paths
|  | 				 | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
							
								
								
									
										12
									
								
								posting.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										12
									
								
								posting.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,12 @@ | |||||||
|  | #Posting class for indexer, will probably be more complex as we keep adding crap to it | ||||||
|  |  | ||||||
class Posting():
    """One entry of a posting list: the source document and its tf-idf score."""

    def __init__(self, source):
        """Create a posting for ``source`` and compute its score.

        Args:
            source: identifier of the file the token was found in.
        """
        self.source = source
        self.tf_idf = self.get_tf_idf()

    def get_tf_idf(self):
        """Return the tf-idf score for this posting.

        TODO: the real computation is not implemented yet; 0.0 is a
        placeholder so that postings can be constructed and sorted.
        """
        return 0.0

    def comparator(self):
        """Return the value postings are ordered by (currently ``tf_idf``).

        Intended for use as a sort key; a custom ordering can replace it later.
        """
        return self.tf_idf
							
								
								
									
										0
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										18
									
								
								stemmer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								stemmer.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,18 @@ | |||||||
|  | #Multiple implementations of stemming go here, please | ||||||
class Stemmer():
    """Apply one of several stemming implementations, selected by ``mode``."""

    def __init__(self, mode, data):
        """Remember which stemmer to use and the data to run it on.

        Args:
            mode: integer selecting the stemmer (0 = stemmer 1, 1 = stemmer 2).
            data: the data to be stemmed.
        """
        # Different type of stemmer = different modes
        self.mode = mode
        self.data = data

    def stem(self):
        """Run the stemmer selected by ``self.mode`` and return the stemmed data.

        Raises:
            ValueError: if ``self.mode`` does not name a known stemmer.
        """
        if self.mode == 0:
            return self._stemmer_1()
        if self.mode == 1:
            return self._stemmer_2()
        raise ValueError(f"Unknown stemmer mode: {self.mode!r}")

    def _stemmer_1(self):
        """Stemmer 1. TODO: not implemented yet; returns the data unchanged."""
        return self.data

    def _stemmer_2(self):
        """Stemmer 2. TODO: not implemented yet; returns the data unchanged."""
        return self.data
							
								
								
									
										17
									
								
								test.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								test.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,17 @@ | |||||||
import re
import os

# Scratch check of the first-character routing used by the indexer:
# build 99 two-letter words and print which save each one would land in.
SAVE_RULES = (
    (r"^[a-d1-1].*", "SAVE 1"),
    (r"^[e-k2-3].*", "SAVE 2"),
    (r"^[l-q4-7].*", "SAVE 3"),
    (r"^[r-z8-9].*", "SAVE 4"),
)

for i in range(99):
    first_letter = chr(i % 26 + 97)
    # Second character is the one right after the first in code-point order.
    word_lower = first_letter + chr(ord(first_letter) + 1)
    print(word_lower)
    # Only the first matching rule counts, mirroring an if/elif chain.
    label = next(
        (name for pattern, name in SAVE_RULES if re.match(pattern, word_lower)),
        None,
    )
    if label is not None:
        print(label)

# Show what is inside the data directory.
path = "data/DEV/"
print(os.listdir(path))
		Reference in New Issue
	
	Block a user
	 Hieuhuy Pham
					Hieuhuy Pham