Massive changes to the indexer; created merge()
test.py
@@ -1,17 +1,115 @@
import re
import os
import json
import math
import sys
import random

from nltk.corpus import words
from posting import Posting

# Pool of values used when fabricating random posting fields below.
random_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Smoke test: fabricate two-letter strings and check that the four
# bucket regexes cover every first character (a-z and 0-9).
for i in range(99):
    word_lower = chr(i % 26 + 97) + chr((i + 1) % 26 + 97)
    print(word_lower)
    if re.match(r"^[a-d0-1].*", word_lower):
        print("SAVE 1")
    elif re.match(r"^[e-k2-3].*", word_lower):
        print("SAVE 2")
    elif re.match(r"^[l-q4-7].*", word_lower):
        print("SAVE 3")
    elif re.match(r"^[r-z8-9].*", word_lower):
        print("SAVE 4")

path = "data/DEV/"
print(os.listdir(path))

test_data = words.words()
random.shuffle(test_data)


class Node:
    # One line of a partial index: a term plus its list of postings.
    def __init__(self):
        self.index_value = ''
        self.postings = list()


class Index:
    # Offset table for one index file: (term, byte offset) pairs.
    def __init__(self):
        self.length = 0
        self.index = list()
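
# On-disk layout used below: <name>.partial holds one JSON-encoded Node
# per line, and <name>.index holds a JSON-encoded Index whose `index`
# field maps each term to the byte offset of its line in the .partial
# file, sorted by term.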


def random_posting(doc_id):
    # Fabricate a Posting for doc_id with fields drawn from random_list
    # (positional signature assumed from posting.py).
    return Posting(doc_id, random.choice(random_list), random.choice(random_list),
                   [random.choice(random_list) for _ in range(8)])


def random_partial_index(name):
    # Write 1000 random terms and their postings to <name>.partial, one
    # JSON object per line, recording each line's offset in <name>.index.
    part_index = Index()
    with open(name + '.partial', 'w') as f:
        for _ in range(1000):
            node1 = Node()
            node1.index_value = random.choice(test_data).lower()
            for doc_id in range(10):
                node1.postings.append(random_posting(doc_id))

            jsonStr = json.dumps(node1, default=lambda o: o.__dict__, sort_keys=False)

            part_index.index.append((node1.index_value, f.tell()))
            f.write(jsonStr + '\n')
            part_index.length = part_index.length + 1

    part_index.index.sort(key=lambda y: y[0])
    jsonStr = json.dumps(part_index, default=lambda o: o.__dict__, sort_keys=False)
    with open(name + '.index', 'w') as f:
        f.write(jsonStr)
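
# merge() below performs a k-way merge over the sorted offset tables:
# at each step it takes the smallest outstanding term across all partial
# indices, concatenates that term's postings from every file containing
# it, and appends the combined node to merged_index.full while recording
# its offset for merged_index.index.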


def merge(partial_indices):
    partial_files = list()
    partial_index_files = list()
    partial_index_indices = list()
    merged_index = open("merged_index.full", 'w')
    num_indices = len(partial_indices)

    # Full index offset table and length
    full_index = Index()

    # Open each <name>.partial (data) and <name>.index (offset table).
    for partial_index in partial_indices:
        partial_files.append(open(partial_index + '.partial', 'r'))
        partial_index_files.append(open(partial_index + '.index', 'r'))

    # Each .index file is a single JSON object on its first line.
    for partial_index_file in partial_index_files:
        partial_index_file.seek(0, 0)
        partial_index_indices.append(json.loads(partial_index_file.readline()))

    # Start all data files at offset 0.
    for partial_file in partial_files:
        partial_file.seek(0, 0)

    # pointers[i] is the next unconsumed entry in offset table i.
    pointers = [0] * num_indices

    while True:
        # Gather the current term from every table that still has
        # entries; the smallest one is the next term to merge.
        values = list()
        for i in range(num_indices):
            if pointers[i] < partial_index_indices[i]['length']:
                values.append(partial_index_indices[i]['index'][pointers[i]][0])

        if len(values) == 0:
            break
        value = min(values)

        # Concatenate this term's postings from every index that has it,
        # then append the combined node to the merged index.
        node = Node()
        node.index_value = value
        for i in range(num_indices):
            if pointers[i] < partial_index_indices[i]['length'] and \
                    partial_index_indices[i]['index'][pointers[i]][0] == value:
                to_seek = partial_index_indices[i]['index'][pointers[i]][1]
                partial_files[i].seek(to_seek, 0)
                temp_node = json.loads(partial_files[i].readline())
                node.postings = node.postings + temp_node['postings']
                pointers[i] = pointers[i] + 1

        node.postings.sort(key=lambda y: y['doc_id'])
        full_index.index.append((value, merged_index.tell()))
        full_index.length = full_index.length + 1
        jsonStr = json.dumps(node, default=lambda o: o.__dict__, sort_keys=False)
        merged_index.write(jsonStr + '\n')

    merged_index.close()
    for f in partial_files + partial_index_files:
        f.close()

    full_index.index.sort(key=lambda y: y[0])
    jsonStr = json.dumps(full_index, default=lambda o: o.__dict__, sort_keys=False)
    with open("merged_index.index", 'w') as f:
        f.write(jsonStr)
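
A minimal driver for the functions above might look like this (hypothetical sketch: the index names are arbitrary, and it assumes posting.py and the NLTK words corpus are available):

if __name__ == '__main__':
    # Build three random partial indices on disk, then merge them into
    # merged_index.full / merged_index.index.
    names = ['part1', 'part2', 'part3']
    for name in names:
        random_partial_index(name)
    merge(names)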