patch 4.0

This commit is contained in:
wangzelin.wzl
2022-10-24 10:34:53 +08:00
parent 4ad6e00ec3
commit 93a1074b0c
10533 changed files with 2588271 additions and 2299373 deletions

View File

@ -0,0 +1,6 @@
1、cost_model_util:根据输入参数采集实验数据。需要将被测目标在这里建立一个参数可控的最小运行环境,提供基本的数据生成、schema控制等功能。
2、benchmaster_xxx:生成参数组合调用cost_model_util,需要根据被测目标的特点控制参数的种类、数量。例如对于join,需要控制左右表行数等。
3、preprocess:对原始数据进行预处理。例如benchmaster对每组参数运行多次,在这进行去极值、取平均等工作,将同参数的几组数据合为一组。
4、fit_xx:进行拟合
5、plot:绘制图像

View File

@ -0,0 +1,88 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import getopt
import sys
import math
def array_model_form(args,
                     params):
    """Evaluate the array cost model for a given element count.

    args   -- the element count ``Nelem`` (a single number, not a tuple).
    params -- pair ``(Telem_ence, Telem_copy)``: the factor applied to the
              element count and the factor applied to the copy count.

    Returns ``Telem_ence * Nelem + Telem_copy * copy_cnt`` where
    ``copy_cnt = ELEM_PER_PAGE * (2 ** extend_cnt - 1)`` and ``extend_cnt``
    is ``ceil(log2(Nelem / ELEM_PER_PAGE))`` clamped at zero.
    """
    Nelem = args
    (
        Telem_ence,
        Telem_copy
    ) = params
    ELEM_PER_PAGE = 1024
    # Up to one page the extension count is zero. Testing this first also
    # keeps math.log() away from zero/negative ratios, which raise ValueError
    # (e.g. an element count of 0).
    if Nelem > ELEM_PER_PAGE:
        extend_cnt = math.ceil(math.log(float(Nelem) / ELEM_PER_PAGE, 2))
    else:
        extend_cnt = 0
    copy_cnt = ELEM_PER_PAGE * (math.pow(2, extend_cnt) - 1)
    total_cost = Telem_ence * Nelem
    # A per-extension allocation term (Tmem_alloc * extend_cnt) is disabled.
    total_cost += Telem_copy * copy_cnt
    return total_cost
def extract_info_from_line(line):
    """Parse one comma-separated line into a list of floats."""
    return [float(field) for field in line.split(",")]
# Apply the array cost model to every sample line of the input file and
# append the predicted cost to each line of the output file.
#
# Options:
#   -i <file>  input samples (default "get_total.data.prep")
#   -o <file>  output file (required)
#   -m <file>  model file; its first line holds the comma-separated
#              model parameters (required)
file_name = "get_total.data.prep"
out_file_name = None
model_file_name = None
output_fit_res = False
wrong_arg = False
try:
    opts, args = getopt.getopt(sys.argv[1:], "i:o:m:")
except getopt.GetoptError:
    # getopt raises on unknown options instead of returning them, so the
    # "wrong arg" path has to be taken from here.
    opts, args = [], []
    wrong_arg = True
for op, value in opts:
    if "-i" == op:
        file_name = value
    elif "-o" == op:
        output_fit_res = True
        out_file_name = value
    elif "-m" == op:
        model_file_name = value
    else:
        wrong_arg = True
# Both files are opened unconditionally below, so both options are mandatory;
# without this check a missing option surfaced as a NameError.
if out_file_name is None or model_file_name is None:
    wrong_arg = True
if wrong_arg:
    print("wrong arg")
    sys.exit(1)
with open(model_file_name, "r") as model_file:
    model_params = [float(p) for p in model_file.readline().split(",")]
with open(file_name, "r") as input_file:
    with open(out_file_name, "w") as out_file:
        for line in input_file:
            # Comment lines are copied through unchanged.
            if line.startswith('#'):
                out_file.write(line)
                continue
            case_param = extract_info_from_line(line)
            # The array model takes the bare element count (column 0).
            cost_val = array_model_form(case_param[0], model_params)
            # Only the predicted cost is appended; the relative error is not
            # part of this script's output, so it is no longer computed
            # (it could divide by a zero measured time).
            out_file.write(",".join([line.strip(), str(cost_val)]) + "\n")

View File

@ -0,0 +1,100 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import getopt
import sys
import math
def mg_model_form(args,
                  params
                  ):
    """Evaluate the merge-group-by cost model (variant with a startup term).

    args   -- (Nrow_input, Nrow_res, Ncol_input, Ncol_aggr, Ncol_group)
    params -- (Tstartup, Trow_once, Tres_once, Taggr_prepare_result,
               Taggr_process, Tgroup_cmp_col, Tcopy_col)

    Returns the sum of the startup, per-result-row, per-input-row,
    group-comparison, column-copy and aggregation terms.
    """
    n_in, n_res, n_col_in, n_col_aggr, n_col_group = args
    (t_startup, t_row, t_res, t_aggr_prepare,
     t_aggr_process, t_group_cmp, t_copy) = params

    per_result = n_res * t_res
    per_input = n_in * t_row
    # cost for judging the group of every input row
    group_judge = n_in * n_col_group * t_group_cmp
    # costs paid once per result row
    copy_cols = n_res * (n_col_in * t_copy)
    prepare_aggr = n_res * (n_col_aggr * t_aggr_prepare)
    # cost for processing each input row
    process_aggr = n_in * (n_col_aggr * t_aggr_process)

    # Terms are added in the same left-to-right order as the accumulation
    # they replace, so floating-point results are unchanged.
    return (t_startup + per_result + per_input + group_judge
            + copy_cols + prepare_aggr + process_aggr)
def extract_info_from_line(line):
    """Split *line* on commas and convert every field to float."""
    return list(map(float, line.split(",")))
# Apply the merge-group-by cost model to every sample line of the input file
# and append predicted cost, measured time and relative error (in percent).
#
# Options:
#   -i <file>  input samples (default "get_total.data.prep")
#   -o <file>  output file (required)
#   -m <file>  model file; its first line holds the comma-separated
#              model parameters (required)
file_name = "get_total.data.prep"
out_file_name = None
model_file_name = None
output_fit_res = False
wrong_arg = False
try:
    opts, args = getopt.getopt(sys.argv[1:], "i:o:m:")
except getopt.GetoptError:
    # getopt raises on unknown options instead of returning them.
    opts, args = [], []
    wrong_arg = True
for op, value in opts:
    if "-i" == op:
        file_name = value
    elif "-o" == op:
        output_fit_res = True
        out_file_name = value
    elif "-m" == op:
        model_file_name = value
    else:
        wrong_arg = True
# Both files are opened unconditionally below, so both options are mandatory;
# without this check a missing option surfaced as a NameError.
if out_file_name is None or model_file_name is None:
    wrong_arg = True
if wrong_arg:
    print("wrong arg")
    sys.exit(1)
with open(model_file_name, "r") as model_file:
    model_params = [float(p) for p in model_file.readline().split(",")]
with open(file_name, "r") as input_file:
    with open(out_file_name, "w") as out_file:
        for line in input_file:
            # Comment lines are copied through instead of being fed to float().
            if line.startswith('#'):
                out_file.write(line)
                continue
            case_param = extract_info_from_line(line)
            # Reorder the sample columns into the model's argument order:
            # (Nrow_input, Nrow_res, Ncol_input, Ncol_aggr, Ncol_group).
            args = (case_param[0],
                    case_param[5],
                    case_param[4],
                    case_param[2],
                    case_param[3])
            time = case_param[6]
            cost_val = mg_model_form(args, model_params)
            percent = (cost_val - time) / time
            new_line = ",".join([line.strip(), "\t", str(cost_val), "\t",
                                 str(time), "\t\t", str(percent * 100)])
            out_file.write(new_line + "\n")

View File

@ -0,0 +1,82 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import getopt
import sys
import math
def material_model_form(args,
                        params):
    """Evaluate the material cost model.

    args   -- (Nrow, Ncol)
    params -- (Trow_once, Trow_col)

    Returns ``Nrow * (Trow_once + Ncol * Trow_col)``.
    """
    row_count, col_count = args
    per_row, per_cell = params
    # The startup term is disabled in this model, hence the zero base.
    base_cost = 0
    return base_cost + row_count * (per_row + col_count * per_cell)
def extract_info_from_line(line):
    """Return the comma-separated fields of *line* as floats."""
    fields = line.split(",")
    return [float(text) for text in fields]
# Apply the material cost model to every sample line of the input file and
# append predicted cost, measured time and relative error (in percent).
#
# Options:
#   -i <file>  input samples (default "get_total.data.prep")
#   -o <file>  output file (required)
#   -m <file>  model file; its first line holds the comma-separated
#              model parameters (required)
file_name = "get_total.data.prep"
out_file_name = None
model_file_name = None
output_fit_res = False
wrong_arg = False
try:
    opts, args = getopt.getopt(sys.argv[1:], "i:o:m:")
except getopt.GetoptError:
    # getopt raises on unknown options instead of returning them.
    opts, args = [], []
    wrong_arg = True
for op, value in opts:
    if "-i" == op:
        file_name = value
    elif "-o" == op:
        output_fit_res = True
        out_file_name = value
    elif "-m" == op:
        model_file_name = value
    else:
        wrong_arg = True
# Both files are opened unconditionally below, so both options are mandatory;
# without this check a missing option surfaced as a NameError.
if out_file_name is None or model_file_name is None:
    wrong_arg = True
if wrong_arg:
    print("wrong arg")
    sys.exit(1)
with open(model_file_name, "r") as model_file:
    model_params = [float(p) for p in model_file.readline().split(",")]
with open(file_name, "r") as input_file:
    with open(out_file_name, "w") as out_file:
        for line in input_file:
            # Comment lines are copied through unchanged.
            if line.startswith('#'):
                out_file.write(line)
                continue
            case_param = extract_info_from_line(line)
            # Model arguments are (Nrow, Ncol); column 3 is the measured time.
            args = (case_param[0],
                    case_param[1])
            time = case_param[3]
            cost_val = material_model_form(args, model_params)
            percent = (cost_val - time) / time
            new_line = ",".join([line.strip(), "\t", str(cost_val), "\t",
                                 str(time), "\t\t", str(percent * 100)])
            out_file.write(new_line + "\n")

View File

@ -0,0 +1,146 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import getopt
import sys
import math
def merge_model_form(args,
                     params
                     ):
    """Evaluate the merge-join cost model.

    args   -- (Nrow_res, Nrow_left, Nrow_right, Nright_cache_in,
               Nright_cache_out, Nright_cache_clear, Nequal_cond)
    params -- (Tstartup, Tres_right_op, Tres_right_cache, Tmatch_group,
               Tequal_fail, Trow_left, Trow_right)

    Returns the sum of the startup term and the weighted counts below.
    """
    (n_res, n_left, n_right,
     n_cache_in, n_cache_out, n_cache_clear, n_equal) = args
    (t_startup, t_right_op, t_right_cache, t_match_group,
     t_equal_fail, t_left, t_right) = params

    left_rows = n_left * t_left
    right_rows = (n_right - n_cache_in) * t_right
    cache_in = n_cache_in * t_right_op
    cache_out = n_cache_out * t_right_cache
    cache_clear = n_cache_clear * t_match_group
    # NOTE(review): this subtracts 2 * Tmatch_group (a parameter) from row
    # counts; the disabled older variant of the model does the same with its
    # own parameter, so it is reproduced as-is -- confirm it is intended.
    equal_fail = (n_equal - n_res - 2 * t_match_group) * t_equal_fail

    # Same left-to-right accumulation order as before (a per-result
    # "assemble row" term remains disabled).
    return (t_startup + left_rows + right_rows + cache_in
            + cache_out + cache_clear + equal_fail)
#
# def merge_model_form(args,
# params
# ):
# (
# Nrow_res,
# Nrow_left,
# Nrow_right,
# Nright_cache_in,
# Nright_cache_out,
# Nright_cache_clear,
# Nequal_cond,
# ) = args
#
# (
# Tstartup,
# Tright_cache_in,
# Tright_cache_out,
# Tright_cache_clear,
# Tassemble_row,
# Tequal_fail,
# Trow_left,
# #Trow_right
# ) = params
#
# total_cost = Tstartup
# total_cost += Nright_cache_in * Tright_cache_in
# total_cost += (Nright_cache_out - Nright_cache_clear) * Tright_cache_out
# total_cost += Nright_cache_clear * Tright_cache_clear
# total_cost += Nrow_res * Tassemble_row
# total_cost += (Nequal_cond - Nrow_res - 2 * Tright_cache_clear) * Tequal_fail
# total_cost += Nrow_left * Trow_left
# #total_cost += (Nrow_right - Nright_cache_in) * Trow_right
#
# return total_cost
def extract_info_from_line(line):
    """Parse one comma-separated line into a list of floats."""
    return [float(field) for field in line.split(",")]
# Apply the merge-join cost model to every sample line of the input file and
# append predicted cost, measured time and relative error (in percent).
#
# Options:
#   -i <file>  input samples (default "get_total.data.prep")
#   -o <file>  output file (required)
#   -m <file>  model file; its first line holds the comma-separated
#              model parameters (required)
file_name = "get_total.data.prep"
out_file_name = None
model_file_name = None
output_fit_res = False
wrong_arg = False
try:
    opts, args = getopt.getopt(sys.argv[1:], "i:o:m:")
except getopt.GetoptError:
    # getopt raises on unknown options instead of returning them.
    opts, args = [], []
    wrong_arg = True
for op, value in opts:
    if "-i" == op:
        file_name = value
    elif "-o" == op:
        output_fit_res = True
        out_file_name = value
    elif "-m" == op:
        model_file_name = value
    else:
        wrong_arg = True
# Both files are opened unconditionally below, so both options are mandatory;
# without this check a missing option surfaced as a NameError.
if out_file_name is None or model_file_name is None:
    wrong_arg = True
if wrong_arg:
    print("wrong arg")
    sys.exit(1)
with open(model_file_name, "r") as model_file:
    model_params = [float(p) for p in model_file.readline().split(",")]
with open(file_name, "r") as input_file:
    with open(out_file_name, "w") as out_file:
        for line in input_file:
            # Comment lines are copied through instead of being fed to float().
            if line.startswith('#'):
                out_file.write(line)
                continue
            case_param = extract_info_from_line(line)
            args = (case_param[6],   # Nrow_res
                    case_param[0],   # Nrow_left
                    case_param[1],   # Nrow_right
                    case_param[-3],  # Nright_cache_in
                    case_param[-2],  # Nright_cache_out
                    case_param[-1],  # Nright_cache_clear
                    case_param[8])   # Nequal_cond
            time = case_param[7]
            cost_val = merge_model_form(args, model_params)
            percent = (cost_val - time) / time
            new_line = ",".join([line.strip(), "\t", str(cost_val), "\t",
                                 str(time), "\t\t", str(percent * 100)])
            out_file.write(new_line + "\n")

View File

@ -0,0 +1,101 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import getopt
import sys
import math
def mg_model_form(args,
                  params
                  ):
    """Evaluate the merge-group-by cost model (variant without startup term).

    args   -- (Nrow_input, Nrow_res, Ncol_input, Ncol_aggr, Ncol_group)
    params -- (Trow_once, Tres_once, Taggr_prepare_result, Taggr_process,
               Tgroup_cmp_col, Tcopy_col)

    Returns the sum of the per-result-row, per-input-row, group-comparison,
    column-copy and aggregation terms.
    """
    n_in, n_res, n_col_in, n_col_aggr, n_col_group = args
    (t_row, t_res, t_aggr_prepare,
     t_aggr_process, t_group_cmp, t_copy) = params

    per_result = n_res * t_res
    per_input = n_in * t_row
    # cost for judging the group: one comparison weight per result row, plus
    # Ncol_group comparison weights for every remaining input row
    judge_result_rows = n_res * t_group_cmp
    judge_other_rows = (n_in - n_res) * n_col_group * t_group_cmp
    # costs paid once per result row
    copy_cols = n_res * (n_col_in * t_copy)
    prepare_aggr = n_res * (n_col_aggr * t_aggr_prepare)
    # cost for processing each input row
    process_aggr = n_in * (n_col_aggr * t_aggr_process)

    # Terms are added in the same left-to-right order as the accumulation
    # they replace, so floating-point results are unchanged.
    return (per_result + per_input + judge_result_rows + judge_other_rows
            + copy_cols + prepare_aggr + process_aggr)
def extract_info_from_line(line):
    """Split *line* on commas and convert every field to float."""
    return list(map(float, line.split(",")))
# Apply the merge-group-by cost model to every sample line of the input file
# and append predicted cost, measured time and relative error (in percent).
#
# Options:
#   -i <file>  input samples (default "get_total.data.prep")
#   -o <file>  output file (required)
#   -m <file>  model file; its first line holds the comma-separated
#              model parameters (required)
file_name = "get_total.data.prep"
out_file_name = None
model_file_name = None
output_fit_res = False
wrong_arg = False
try:
    opts, args = getopt.getopt(sys.argv[1:], "i:o:m:")
except getopt.GetoptError:
    # getopt raises on unknown options instead of returning them.
    opts, args = [], []
    wrong_arg = True
for op, value in opts:
    if "-i" == op:
        file_name = value
    elif "-o" == op:
        output_fit_res = True
        out_file_name = value
    elif "-m" == op:
        model_file_name = value
    else:
        wrong_arg = True
# Both files are opened unconditionally below, so both options are mandatory;
# without this check a missing option surfaced as a NameError.
if out_file_name is None or model_file_name is None:
    wrong_arg = True
if wrong_arg:
    print("wrong arg")
    sys.exit(1)
with open(model_file_name, "r") as model_file:
    model_params = [float(p) for p in model_file.readline().split(",")]
with open(file_name, "r") as input_file:
    with open(out_file_name, "w") as out_file:
        for line in input_file:
            # Comment lines are copied through instead of being fed to float().
            if line.startswith('#'):
                out_file.write(line)
                continue
            case_param = extract_info_from_line(line)
            # Reorder the sample columns into the model's argument order:
            # (Nrow_input, Nrow_res, Ncol_input, Ncol_aggr, Ncol_group).
            args = (case_param[0],
                    case_param[5],
                    case_param[4],
                    case_param[2],
                    case_param[3])
            time = case_param[6]
            cost_val = mg_model_form(args, model_params)
            percent = (cost_val - time) / time
            new_line = ",".join([line.strip(), "\t", str(cost_val), "\t",
                                 str(time), "\t\t", str(percent * 100)])
            out_file.write(new_line + "\n")

View File

@ -0,0 +1,97 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import getopt
import sys
import math
def nl_model_form(args,
                  params
                  ):
    """Evaluate the nested-loop join cost model.

    args   -- (Nrow_res, Nrow_left, Nrow_right, Nright_cache_in,
               Nright_cache_out, Nright_cache_clear, Nequal_cond); the three
               cache counters are accepted but not used by this model.
    params -- (Tstartup, Tres, Tfail, Tleft_row, Tright_row)

    Returns the startup term plus the weighted result, failed-condition,
    left-row and right-row counts.
    """
    (n_res, n_left, n_right,
     _cache_in, _cache_out, _cache_clear, n_equal) = args
    t_startup, t_res, t_fail, t_left, t_right = params

    matched = n_res * t_res
    # conditions evaluated that did not produce a result row
    failed = (n_equal - n_res) * t_fail
    left_rows = n_left * t_left
    right_rows = n_right * t_right

    # Same left-to-right accumulation order as before (a per-condition
    # "Tqual" term remains disabled).
    return t_startup + matched + failed + left_rows + right_rows
def extract_info_from_line(line):
    """Return the comma-separated fields of *line* as floats."""
    fields = line.split(",")
    return [float(text) for text in fields]
# Apply the nested-loop join cost model to every sample line of the input
# file and append predicted cost, measured time and relative error (percent).
#
# Options:
#   -i <file>  input samples (default "get_total.data.prep")
#   -o <file>  output file (required)
#   -m <file>  model file; its first line holds the comma-separated
#              model parameters (required)
file_name = "get_total.data.prep"
out_file_name = None
model_file_name = None
output_fit_res = False
wrong_arg = False
try:
    opts, args = getopt.getopt(sys.argv[1:], "i:o:m:")
except getopt.GetoptError:
    # getopt raises on unknown options instead of returning them.
    opts, args = [], []
    wrong_arg = True
for op, value in opts:
    if "-i" == op:
        file_name = value
    elif "-o" == op:
        output_fit_res = True
        out_file_name = value
    elif "-m" == op:
        model_file_name = value
    else:
        wrong_arg = True
# Both files are opened unconditionally below, so both options are mandatory;
# without this check a missing option surfaced as a NameError.
if out_file_name is None or model_file_name is None:
    wrong_arg = True
if wrong_arg:
    print("wrong arg")
    sys.exit(1)
with open(model_file_name, "r") as model_file:
    model_params = [float(p) for p in model_file.readline().split(",")]
with open(file_name, "r") as input_file:
    with open(out_file_name, "w") as out_file:
        for line in input_file:
            # Comment lines are copied through instead of being fed to float().
            if line.startswith('#'):
                out_file.write(line)
                continue
            case_param = extract_info_from_line(line)
            args = (case_param[6],   # Nrow_res
                    case_param[0],   # Nrow_left
                    case_param[1],   # Nrow_right
                    case_param[-3],  # Nright_cache_in
                    case_param[-2],  # Nright_cache_out
                    case_param[-1],  # Nright_cache_clear
                    case_param[8])   # Nequal_cond
            time = case_param[7]
            cost_val = nl_model_form(args, model_params)
            percent = (cost_val - time) / time
            new_line = ",".join([line.strip(), "\t", str(cost_val), "\t",
                                 str(time), "\t\t", str(percent * 100)])
            out_file.write(new_line + "\n")

View File

@ -0,0 +1,213 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import getopt
import sys
import math
def material_model_form(args):
    """Evaluate the material cost model with its built-in coefficients.

    args -- (Nrow, Ncol)

    Returns ``Nrow * (Trow_once + Ncol * Trow_col)`` using the constants
    hard-coded below.
    """
    row_count, col_count = args
    per_row = 0.07931677   # Trow_once
    per_cell = 0.02674675  # Trow_col
    # The startup term is disabled in this model, hence the zero base.
    base_cost = 0
    return base_cost + row_count * (per_row + col_count * per_cell)
def array_model_form(args):
    """Evaluate the array cost model with its built-in coefficients.

    args -- the element count ``Nelem`` (a single number, not a tuple).

    Returns ``Telem_ence * Nelem + Telem_copy * copy_cnt`` where
    ``copy_cnt = ELEM_PER_PAGE * (2 ** extend_cnt - 1)`` and ``extend_cnt``
    is ``ceil(log2(Nelem / ELEM_PER_PAGE))`` clamped at zero.
    """
    Telem_ence = 0.00898860
    Telem_copy = 0.00631888
    Nelem = args
    ELEM_PER_PAGE = 1024
    # Up to one page the extension count is zero. Testing this first also
    # keeps math.log() away from zero/negative ratios, which raise ValueError
    # (e.g. an element count of 0).
    if Nelem > ELEM_PER_PAGE:
        extend_cnt = math.ceil(math.log(float(Nelem) / ELEM_PER_PAGE, 2))
    else:
        extend_cnt = 0
    copy_cnt = ELEM_PER_PAGE * (math.pow(2, extend_cnt) - 1)
    total_cost = Telem_ence * Nelem
    # A per-extension allocation term (Tmem_alloc * extend_cnt) is disabled.
    total_cost += Telem_copy * copy_cnt
    return total_cost
def get_row_size(reserve, col):
    """Return the row-size estimate used by the miss-probability model.

    reserve -- count weighted by 16 on top of the fixed base of 16.
    col     -- column count; it is divided by 8 before being weighted.
    """
    col_groups = col / 8
    group_weight = 3 + 8 + 4 + 8 + 16 + 32 + 64 + 128
    # Added in the original accumulation order: base, reserve part,
    # weighted column groups, then one more unit per column group.
    return 16 + reserve * 16 + col_groups * group_weight + col_groups
def get_miss_prob(Nrow, Ncol, Nord, Turn):
    """Return the modelled miss probability ``1 - hit``.

    The hit ratio is ``Turn / total_size`` with
    ``total_size = Nrow * get_row_size(Nord, Ncol)``; once ``Turn`` reaches
    90% of the total size the hit ratio is fixed at 0.9.
    """
    total_size = Nrow * get_row_size(Nord, Ncol)
    if Turn >= 0.9 * total_size:
        return 1 - 0.9
    return 1 - Turn / total_size
def sort_model_form(args,
                    params
                    ):
    """Evaluate the sort cost model.

    args   -- (Nrow, Ncol, Nordering)
    params -- (Tcompare, Tmiss_K1, Turn)

    Returns ``Nrow * compare_cost * log2(Nrow)`` where ``compare_cost`` is
    ``Tcompare`` times the ordering count capped at 1, plus ``Tmiss_K1``
    times the miss probability from get_miss_prob().
    """
    n_row, n_col, n_ordering = args
    t_compare, t_miss_k1, turn = params

    # Only the sorting term is active; the startup, row-store and
    # array-push terms of the model are disabled.
    total_cost = 0

    # Ordering counts of 1 or more are all treated as a single comparison.
    ordering_cmp = 1 if n_ordering >= 1 else n_ordering
    miss_prob = get_miss_prob(n_row, n_col, n_ordering, turn)
    compare_cost = t_compare * ordering_cmp + t_miss_k1 * miss_prob
    total_cost += n_row * compare_cost * math.log(n_row, 2)
    return total_cost
#
# def sort_model_form(args,
# params):
# (
# Nrow,
# Nordering,
# Ncol,
# ) = args
#
# (
# Tstartup,
# Trowstore_once,
# Trowstore_col,
# # Tarray_once,
# # Tarray_elem_copy,
# Treserve_cell,
# Tcompare
# ) = params
#
#
# total_cost = Tstartup
#
# #cost for row store
# total_cost += Nrow * (Trowstore_once + Ncol * Trowstore_col)
# total_cost += Treserve_cell * Nrow * Ncol * Nordering
#
# #cost for array
# # ELEM_PER_PAGE = 1024
# # extend_cnt = math.ceil(math.log(float(Nrow)/ELEM_PER_PAGE, 2))
# # copy_cnt = ELEM_PER_PAGE * (math.pow(2, extend_cnt) - 1)
# #total_cost += Tarray_once * Nrow + Tarray_elem_copy * copy_cnt
#
# #cost for sorting
# if Nordering > 2:
# Nordering_cmp = 2
# else:
# Nordering_cmp = Nordering
# compare_cost = Tcompare * Nordering_cmp
# total_cost += Nrow * compare_cost * math.log(Nrow, 2)
#
# return total_cost
def extract_info_from_line(line):
    """Parse one comma-separated line into a list of floats."""
    return [float(field) for field in line.split(",")]
# sys.argv.extend('-i sort.prep.double -o sort.fit.double -m sort.model.double'.split())
# Apply the sort cost model to every sample line of the input file and
# append the predicted cost and the relative error (in percent).
#
# Options:
#   -i <file>  input samples (default "get_total.data.prep")
#   -o <file>  output file (required)
#   -m <file>  model file; its first line holds the comma-separated
#              model parameters (required)
file_name = "get_total.data.prep"
out_file_name = None
model_file_name = None
output_fit_res = False
wrong_arg = False
try:
    opts, args = getopt.getopt(sys.argv[1:], "i:o:m:")
except getopt.GetoptError:
    # getopt raises on unknown options instead of returning them.
    opts, args = [], []
    wrong_arg = True
for op, value in opts:
    if "-i" == op:
        file_name = value
    elif "-o" == op:
        output_fit_res = True
        out_file_name = value
    elif "-m" == op:
        model_file_name = value
    else:
        wrong_arg = True
# Both files are opened unconditionally below, so both options are mandatory;
# without this check a missing option surfaced as a NameError.
if out_file_name is None or model_file_name is None:
    wrong_arg = True
if wrong_arg:
    print("wrong arg")
    sys.exit(1)
with open(model_file_name, "r") as model_file:
    model_params = [float(p) for p in model_file.readline().split(",")]
with open(file_name, "r") as input_file:
    with open(out_file_name, "w") as out_file:
        for line in input_file:
            # Comment lines are copied through unchanged.
            if line.startswith('#'):
                out_file.write(line)
                continue
            case_param = extract_info_from_line(line)
            # Model arguments are (Nrow, Ncol, Nordering); column 4 is the
            # measured time.
            args = (case_param[0],
                    case_param[1],
                    case_param[2])
            time = case_param[4]
            cost_val = sort_model_form(args, model_params)
            percent = (cost_val - time) / time
            new_line = ",".join([line.strip(), str(cost_val), str(percent * 100)])
            out_file.write(new_line + "\n")

View File

@ -0,0 +1,33 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import subprocess as sp
import os
# Benchmark driver for the "array" test of cost_model_util: sweeps the row
# count and appends, for every run, "<row_count>," followed by the tool's
# output to the file array_result.
if os.path.exists("array_result"):
    # start from a clean result file
    os.remove("array_result")
#cmd_form = 'LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -GB -s c10k1.schema -t array -r 1000000'
cmd_form = 'LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -G -s c10k1.schema -t array -r 1000000'
cmd_elements = cmd_form.split(" ")
minrc = 1
maxrc = 1100001
step = 1000
case_run_time = 5
row_counts = xrange(minrc, maxrc + 1, step)
# Count the cases from the range that is actually iterated;
# (maxrc - minrc) / step was one short because both end points are visited.
total_case_count = len(row_counts)
case_count = 0
print("Total case count %s ..." % (total_case_count))
for row_count in row_counts:
    # the last element is the value of the trailing "-r" option
    cmd_elements[-1] = str(row_count)
    case_count += 1
    prompt = "Running case %s / %s ... : %s " % (case_count, total_case_count, " ".join(cmd_elements))
    print(prompt)
    sp.check_call('echo "### %s" >> array_result' % prompt, shell=True)
    for _ in xrange(case_run_time):
        # prefix each measurement with the row count it belongs to
        sp.check_call("echo -n '%s,' >> array_result" % (row_count), shell=True)
        sp.check_call(" ".join(cmd_elements) + ' >> array_result', shell=True)

View File

@ -0,0 +1,36 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import subprocess as sp
import os
# Benchmark driver for the "material" test of cost_model_util: sweeps row and
# column counts and appends "<row_count>,<column_count>," followed by the
# tool's output to material_result for every run.

# Remove result files left over from a previous run.
for stale_file in ("material_result", "material_final_result"):
    if os.path.exists(stale_file):
        os.remove(stale_file)

# cmd_form = "./cost_model_util -B -t material -s c10k1.schema -r 1000 -p 1 >> material_result"
cmd_form = "./cost_model_util -G -t material -s c10k1.schema -r 1000 -p 1 >> material_result"
cmd_elements = cmd_form.split(" ")

row_count_max = 10001
row_count_step = 100
column_counts = [3, 5, 8]
case_run_time = 7

total_case_count = (row_count_max // row_count_step + 1) * len(column_counts) * case_run_time
case_count = 0
print("Total case count %s ..." % (total_case_count))

for row_count in xrange(1, row_count_max + 1, row_count_step):
    for column_count in column_counts:
        write_row_prefix = "echo -n '%s,' >> material_result" % (row_count)
        write_col_prefix = "echo -n '%s,' >> material_result" % (column_count)
        for _ in xrange(case_run_time):
            case_count += 1
            # elements 7 and 9 are the values of the -r and -p options
            cmd_elements[7] = str(row_count)
            cmd_elements[9] = str(column_count)
            sp.check_call(write_row_prefix, shell=True)
            sp.check_call(write_col_prefix, shell=True)
            command = " ".join(cmd_elements)
            print("Running case %s / %s ... : %s " % (case_count, total_case_count, command))
            sp.check_call(command, shell=True)

View File

@ -0,0 +1,102 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import subprocess as sp
import os
import sys
import getopt
import time
# Benchmark driver for the "merge" test of cost_model_util: runs every
# combination of the parameter lists below case_run_time times and appends a
# "#<prompt>" line, the six parameters and the tool's output to the result
# file (default "merge_result", overridable with -o).
ISOTIMEFORMAT = '%Y-%m-%d %X'

#cmd_form = "./cost_model_util -B -s c10k1x2.schema -t merge -r 900 -r 900 -Z1 -Z1 -C 2 -C 2 -V 3 -V 3 >> res"
cmd_form = "./cost_model_util -G -s c10k1x2.schema -t merge -r 900 -r 900 -Z1 -Z1 -C 2 -C 2 -V 3 -V 3 >> res"
cmd_elements = cmd_form.split(" ")

row_count_max = 10001
row_count_step = 100
left_row_counts = [5000, 10000, 50000, 100000]
right_row_counts = [5000, 10000, 50000, 100000]
left_steps = [1, 3, 4, 5, 7, 10]
right_steps = [1, 3, 4, 5, 7, 10]
left_step_lengths = [1, 2, 4, 5, 10]
right_step_lengths = [1, 2, 4, 5, 10]
case_run_time = 7

# Every combination of the six lists is run case_run_time times.
total_case_count = 1
for value_list in (left_row_counts, right_row_counts, left_steps,
                   right_steps, left_step_lengths, right_step_lengths):
    total_case_count *= len(value_list)
total_case_count *= case_run_time

wrong_arg = False
out_file_name = "merge_result"
opts, args = getopt.getopt(sys.argv[1:], "o:")
for op, value in opts:
    if "-o" != op:
        wrong_arg = True
    else:
        out_file_name = value
if wrong_arg:
    print("wrong arg")
    sys.exit(1)

case_count = 0
# the last command element is the redirection target
cmd_elements[-1] = out_file_name
if os.path.exists(out_file_name):
    os.remove(out_file_name)

# Positions in cmd_elements of the values following the two -r, the two -C
# and the two -V options, in the order the parameters are listed per case.
VALUE_SLOTS = (7, 9, 13, 15, 17, 19)

print("Total case count %s ..." % (total_case_count))
for left_row_count in left_row_counts:
    for right_row_count in right_row_counts:
        for left_step in left_steps:
            for right_step in right_steps:
                for left_step_length in left_step_lengths:
                    for right_step_length in right_step_lengths:
                        params = [str(p) for p in [left_row_count, right_row_count,
                                                   left_step, right_step,
                                                   left_step_length, right_step_length]]
                        for i in xrange(case_run_time):
                            case_count += 1
                            for slot, text in zip(VALUE_SLOTS, params):
                                cmd_elements[slot] = text
                            command = " ".join(cmd_elements)
                            prompt = "%s Running case %s / %s ... : %s " % (
                                time.strftime(ISOTIMEFORMAT, time.localtime()),
                                case_count, total_case_count, command)
                            print(prompt)
                            sp.check_call("echo '#%s' >> %s" % (prompt, out_file_name), shell=True)
                            sp.check_call("echo -n '%s,' >> %s" % (",".join(params), out_file_name), shell=True)
                            sp.check_call(command, shell=True)
#
# total_case_count = (row_count_max / row_count_step + 1) * len(column_counts) * case_run_time
# case_count = 0
#
# print "Total case count %s ..." % (total_case_count)
# for row_count in xrange(1, row_count_max + 1, row_count_step):
# for column_count in column_counts:
# for time in xrange(case_run_time):
# case_count += 1
# cmd_elements[7] = str(row_count)
# cmd_elements[9] = str(column_count)
# sp.check_call("echo -n '%s,' >> material_result"%(row_count), shell=True)
# sp.check_call("echo -n '%s,' >> material_result"%(column_count), shell=True)
# print "Running case %s / %s ... : %s " % (case_count, total_case_count, " ".join(cmd_elements))
# sp.check_call(" ".join(cmd_elements), shell=True)
#

View File

@ -0,0 +1,87 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import subprocess as sp
import os
import sys
import getopt
import time
# Benchmark driver for the "mg" (merge group by) test of cost_model_util:
# runs every combination of the parameter lists below case_run_time times.
# For each run a "#<prompt>" line is appended to the result file (default
# "mergegroupby_result", overridable with -o), followed either by the five
# parameters plus the tool's output, or by a "#... skipped" line when
# group_col exceeds non_group_col.
ISOTIMEFORMAT = '%Y-%m-%d %X'
# cmd_form = "./cost_model_util -t mg -B -s c10k1.schema -r 10000 -Z 1 -V 10 -e 1 -o 10 -p 1 >> out_file"
cmd_form = "./cost_model_util -t mg -G -s c10k1.schema -r 10000 -Z 1 -V 10 -e 1 -o 10 -p 1 >> mergegroupby_result"
cmd_elements = cmd_form.split(" ")
row_counts = [10, 30, 50, 70, 100, 1000, 5000, 10000]
steps = [1, 3, 5, 10, 20]
aggr_funcs = [1, 4, 7, 10]
group_cols = [1, 4, 7, 10]
non_group_cols = [10]
case_run_time = 7
total_case_count = len(row_counts)
total_case_count *= len(steps)
total_case_count *= len(aggr_funcs)
total_case_count *= len(group_cols)
total_case_count *= len(non_group_cols)
total_case_count *= case_run_time
print(total_case_count)
wrong_arg = False
out_file_name = "mergegroupby_result"
opts, args = getopt.getopt(sys.argv[1:], "o:")
for op, value in opts:
    if "-o" == op:
        out_file_name = value
    else:
        wrong_arg = True
if wrong_arg:
    print("wrong arg")
    sys.exit(1)
case_count = 0
# the last command element (index 19) is the redirection target
cmd_elements[-1] = out_file_name
# Only the file this run writes to is cleared. The default file used to be
# deleted before -o was parsed, i.e. even when output went elsewhere.
if os.path.exists(out_file_name):
    os.remove(out_file_name)
print("Total case count %s ..." % (total_case_count))
for row_count in row_counts:
    for step in steps:
        for aggr_func in aggr_funcs:
            for group_col in group_cols:
                for non_group_col in non_group_cols:
                    for run_time in xrange(case_run_time):
                        # count first so progress reads 1/N .. N/N
                        case_count += 1
                        # values of the -r, -V, -e, -o and -p options
                        cmd_elements[7] = str(row_count)
                        cmd_elements[11] = str(step)
                        cmd_elements[13] = str(aggr_func)
                        cmd_elements[15] = str(group_col)
                        cmd_elements[17] = str(non_group_col)
                        param = ",".join([cmd_elements[7],
                                          cmd_elements[11],
                                          cmd_elements[13],
                                          cmd_elements[15],
                                          cmd_elements[17]]) + ","
                        prompt = "%s Running case %s / %s ... : %s " % (
                            time.strftime(ISOTIMEFORMAT, time.localtime()), case_count, total_case_count,
                            " ".join(cmd_elements))
                        print(prompt)
                        sp.check_call("echo '#%s' >> %s" % (prompt, out_file_name), shell=True)
                        if group_col <= non_group_col:
                            sp.check_call("echo -n '%s' >> %s" % (param, out_file_name), shell=True)
                            sp.check_call(" ".join(cmd_elements), shell=True)
                        else:
                            sp.check_call("echo '#%s skipped' >> %s" % (param, out_file_name), shell=True)

View File

@ -0,0 +1,101 @@
#!/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'dongyun.zdy'
import datetime
import multiprocessing
import MySQLdb
import Queue
import signal
import re
import argparse
import time
import sys
import subprocess as sp
import os
# File the schema is generated into, and file the results are appended to.
schema_file = 'miss.schema'
outfile = 'miss.result'
# Results are appended with ">>", so drop the file from any previous run.
if os.path.exists(outfile):
    os.remove(outfile)
def remove_schema():
    """Delete the schema file named by ``schema_file`` if it exists."""
    if not os.path.exists(schema_file):
        return
    os.remove(schema_file)
def write_schema(s):
    """Write the string *s* to the schema file, replacing its content."""
    # "with" closes the file even when write() raises.
    with open(schema_file, 'w') as of:
        of.write(s)
def make_seq(t, cnt):
    """Return a new list containing *t* repeated *cnt* times."""
    return [t] * cnt
def make_schema(types):
    """Generate the schema file for a table with one column per entry.

    Columns are named c1, c2, ... with the given types and c1 is the primary
    key. The statement is also appended to the result file as a "# " line.
    """
    remove_schema()
    column_defs = ["c%d %s, " % (col_id, t) for col_id, t in enumerate(types, 1)]
    # [:-2] drops the ", " left behind by the last column definition
    s = ("create table t1 (" + "".join(column_defs))[:-2]
    s += ', primary key (c1))'
    run_cmd('echo "# %s" >> ' % s + outfile)
    write_schema(s)
def run_cmd(cmd):
    """Run *cmd* through the shell and return its combined stdout/stderr.

    Output is read line by line until end of stream, then the process is
    waited for.
    """
    proc = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.STDOUT)
    collected = []
    while True:
        line = proc.stdout.readline()
        if not line:
            # an empty read means end of stream
            break
        collected.append(line)
        sys.stdout.flush()
    proc.wait()
    return ''.join(collected)
#cmd_form1 = 'LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -BGK -t material -s miss.schema -r 500000'.split()
# Benchmark driver: for every column multiplier and row count, run the
# "material" test case_run_time times with -i3 and then case_run_time times
# with -i4, appending "<row_count>,<col_count>,<tool output>" per run.
cmd_form1 = 'LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -GK -t material -s miss.schema -r 500000'.split()

types_to_test = {'bigint': 'bigint', 'double': 'double', 'float': 'float', 'timestamp': 'timestamp',
                 'number': 'number(20,3)', 'v32': 'varchar(32)', 'v64': 'varchar(64)', 'v128': 'varchar(128)'}
row_counts = [1000, 2000, 4000, 7000, 8000, 10000, 20000, 50000]
input_col_cnts = [1, 2, 3, 6]
case_run_time = 7

total_case_count = len(row_counts) * len(input_col_cnts)
case_count = 0
print("Total case count %s ..." % (total_case_count))

for col_count in input_col_cnts:
    # the schema holds col_count copies of the sorted type list
    make_schema(sorted(types_to_test.values()) * col_count)
    for row_count in row_counts:
        # the last element is the value of the trailing "-r" option
        cmd_form1[-1] = str(row_count)
        case_count += 1
        prompt = "Running case %s / %s ... : %s " % (case_count, total_case_count, " ".join(cmd_form1))
        print(prompt)
        sp.check_call('echo "### %s" >> ' % prompt + outfile, shell=True)
        caseinfo = '%d,%d,' % (row_count, col_count)
        # all -i3 runs first, then all -i4 runs
        for extra_flag in (" -i3", " -i4"):
            for t in xrange(case_run_time):
                print(t)
                res = caseinfo + run_cmd(" ".join(cmd_form1) + extra_flag).strip()
                run_cmd('echo "%s" >> ' % (res) + outfile)

View File

@ -0,0 +1,103 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import subprocess as sp
import os
import sys
import getopt
import time
# Benchmark driver for the "nestloop" test of cost_model_util: runs every
# combination of the parameter lists below case_run_time times and appends a
# "#<prompt>" line, the six parameters and the tool's output to the result
# file (default "nl_result", overridable with -o).
ISOTIMEFORMAT = '%Y-%m-%d %X'

# cmd_form = "./cost_model_util -B -s c10k1x2.schema -t nestloop -r 900 -r 900 -Z1 -Z1 -C 2 -C 2 -V 3 -V 3 >> res"
cmd_form = "./cost_model_util -G -s c10k1x2.schema -t nestloop -r 900 -r 900 -Z1 -Z1 -C 2 -C 2 -V 3 -V 3 >> nl_result"
cmd_elements = cmd_form.split(" ")

row_count_max = 10001
row_count_step = 100
left_row_counts = [10, 100, 500, 1000]
right_row_counts = [10, 100, 500, 1000]
left_steps = [1, 3, 4, 5, 7, 10]
right_steps = [1, 3, 4, 5, 7, 10]
left_step_lengths = [1, 2, 4, 5, 10]
right_step_lengths = [1, 2, 4, 5, 10]
case_run_time = 7

# Every combination of the six lists is run case_run_time times.
total_case_count = 1
for value_list in (left_row_counts, right_row_counts, left_steps,
                   right_steps, left_step_lengths, right_step_lengths):
    total_case_count *= len(value_list)
total_case_count *= case_run_time

wrong_arg = False
#out_file_name = "nestloop_result"
out_file_name = "nl_result"
opts, args = getopt.getopt(sys.argv[1:], "o:")
for op, value in opts:
    if "-o" != op:
        wrong_arg = True
    else:
        out_file_name = value
if wrong_arg:
    print("wrong arg")
    sys.exit(1)

case_count = 0
# the last command element is the redirection target
cmd_elements[-1] = out_file_name
if os.path.exists(out_file_name):
    os.remove(out_file_name)

# Positions in cmd_elements of the values following the two -r, the two -C
# and the two -V options, in the order the parameters are listed per case.
VALUE_SLOTS = (7, 9, 13, 15, 17, 19)

print("Total case count %s ..." % (total_case_count))
for left_row_count in left_row_counts:
    for right_row_count in right_row_counts:
        for left_step in left_steps:
            for right_step in right_steps:
                for left_step_length in left_step_lengths:
                    for right_step_length in right_step_lengths:
                        params = [str(p) for p in [left_row_count, right_row_count,
                                                   left_step, right_step,
                                                   left_step_length, right_step_length]]
                        for i in xrange(case_run_time):
                            case_count += 1
                            for slot, text in zip(VALUE_SLOTS, params):
                                cmd_elements[slot] = text
                            command = " ".join(cmd_elements)
                            prompt = "%s Running case %s / %s ... : %s " % (
                                time.strftime(ISOTIMEFORMAT, time.localtime()),
                                case_count, total_case_count, command)
                            print(prompt)
                            sp.check_call("echo '#%s' >> %s" % (prompt, out_file_name), shell=True)
                            sp.check_call("echo -n '%s,' >> %s" % (",".join(params), out_file_name), shell=True)
                            sp.check_call(command, shell=True)
#
# total_case_count = (row_count_max / row_count_step + 1) * len(column_counts) * case_run_time
# case_count = 0
#
# print "Total case count %s ..." % (total_case_count)
# for row_count in xrange(1, row_count_max + 1, row_count_step):
# for column_count in column_counts:
# for time in xrange(case_run_time):
# case_count += 1
# cmd_elements[7] = str(row_count)
# cmd_elements[9] = str(column_count)
# sp.check_call("echo -n '%s,' >> material_result"%(row_count), shell=True)
# sp.check_call("echo -n '%s,' >> material_result"%(column_count), shell=True)
# print "Running case %s / %s ... : %s " % (case_count, total_case_count, " ".join(cmd_elements))
# sp.check_call(" ".join(cmd_elements), shell=True)
#

View File

@ -0,0 +1,76 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import subprocess as sp
import os
# Path of the schema file that remove_schema()/write_schema() operate on.
schema_file = 'rowstore.schema'
# Path of the result file; it is deleted at start-up if it already exists.
outfile = 'rowstore.result'
def remove_schema():
    """Delete the schema file named by ``schema_file`` if it exists."""
    if not os.path.exists(schema_file):
        return
    os.remove(schema_file)
def write_schema(s):
    """Write the string *s* to the schema file, replacing its content."""
    # "with" closes the file even when write() raises.
    with open(schema_file, 'w') as of:
        of.write(s)
def make_seq(t, cnt):
    """Return a new list containing *t* repeated *cnt* times."""
    return [t] * cnt
def make_schema(types):
    """Generate the schema file for a table with one column per entry.

    Columns are named c1, c2, ... with the given types and c1 is the primary
    key. The statement is printed before it is written.
    """
    remove_schema()
    column_defs = ["c%d %s, " % (col_id, t) for col_id, t in enumerate(types, 1)]
    # [:-2] drops the ", " left behind by the last column definition
    s = ("create table t1 (" + "".join(column_defs))[:-2]
    s += ', primary key (c1))'
    print(s)
    write_schema(s)
# Row-store benchmark set-up. The parameter sweep is currently disabled
# (kept below as a comment); the only work done is removing an old result
# file and generating a 50-column bigint schema.
cmdform = 'LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -RBGK -t material -s rowstore.schema -r 10 -i1'.split()

types_to_test = {
    'bigint': 'bigint',
    'double': 'double',
    'float': 'float',
    'timestamp': 'timestamp',
    'number': 'number(20,3)',
    'v32': 'varchar(32)',
    'v64': 'varchar(64)',
    'v128': 'varchar(128)',
}
# 1000 .. 10000 in steps of 1000, then two larger sizes
row_counts = list(range(1000, 10001, 1000)) + [20000, 50000]
col_nums = [1, 3, 20, 50]
case_run_time = 7

total_case_count = len(types_to_test) * len(row_counts) * len(col_nums)
case_count = 0

if os.path.exists(outfile):
    os.remove(outfile)

# Disabled sweep over types, column counts and row counts:
#
# for t in types_to_test:
#     outfile = 'rowstore.result.' + t
#     if os.path.exists(outfile):
#         os.remove(outfile)
#     for n in col_nums:
#         make_schema(make_seq(types_to_test[t], n))
#         for rc in row_counts:
#             cmdform[8] = str(rc)
#             case_count += 1
#             prompt = "# %d / %d %s col_cnt = %d rc = %d \n# %s" % (case_count, total_case_count, t, n, rc, ' '.join(cmdform))
#             print prompt
#             sp.check_call('echo "%s" >> ' % prompt + outfile, shell=True)
#             for times in xrange(0, case_run_time):
#                 print times
#                 sp.check_call("echo -n '%s,' >> " % str(rc) + outfile, shell=True)
#                 sp.check_call("echo -n '%s,' >> " % str(n) + outfile, shell=True)
#                 sp.check_call(" ".join(cmdform) + ' >> ' + outfile, shell=True)

make_schema(make_seq('bigint', 50))

View File

@ -0,0 +1,44 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import subprocess as sp
import os
# Benchmark driver for the "sort" test of cost_model_util: sweeps row count,
# sort-column count and input-column count, appending
# "<row_count>,<column_count>,<input_col>," plus the tool's output to
# sort_result for every run.
if os.path.exists("sort_result"):
    os.remove("sort_result")

#cmd_form = "LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -GBR -t sort -s c20.schema -r 1000 -c 10 -p 10 >> sort_result"
cmd_form = "LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -GR -t sort -s sort.schema -r 1000 -c 10 -p 10 >> sort_result"
cmd_elements = cmd_form.split(" ")

row_counts = [1, 100, 500, 800, 1000, 3000, 5000, 8000, 9000, 10000, 20000, 40000, 60000, 70000, 100000, 300000]
column_counts = [1, 2, 3, 4, 5]
#input_col_cnts = [15, 30, 45]
input_col_cnts = [3, 5, 9]  # schema file related, col counts should not be less than projector count
case_run_time = 7

total_case_count = len(row_counts) * len(column_counts) * len(input_col_cnts)
case_count = 0
print("Total case count %s ..." % (total_case_count))

for row_count in row_counts:
    for column_count in column_counts:
        for input_col in input_col_cnts:
            # elements 8, 10 and 12 are the values of the -r, -c and -p options
            cmd_elements[8] = str(row_count)
            cmd_elements[10] = str(column_count)
            cmd_elements[12] = str(input_col)
            case_count += 1
            prompt = "Running case %s / %s ... : %s " % (case_count, total_case_count, " ".join(cmd_elements))
            print(prompt)
            sp.check_call('echo "### %s" >> sort_result' % prompt, shell=True)
            if column_count <= input_col:
                for run_idx in xrange(case_run_time):
                    print("running the %d time" % run_idx)
                    # prefix the measurement with the three case parameters
                    for field in (row_count, column_count, input_col):
                        sp.check_call("echo -n '%s,' >> sort_result" % (field), shell=True)
                    sp.check_call(" ".join(cmd_elements), shell=True)
            else:
                # more sort columns than input columns: record and skip
                print("### PASS")
                sp.check_call('echo "### PASS" >> sort_result', shell=True)

View File

@ -0,0 +1,97 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import subprocess as sp
import os
from cost_test_conf import Config
schema_file = 'sort.schema'
outfile = 'sort.result'
def remove_schema():
    """Delete the file named by the module-level ``schema_file`` if it exists.

    A missing file is not an error; the function then does nothing.
    """
    if not os.path.exists(schema_file):
        return
    os.remove(schema_file)
def write_schema(s):
    """Write the schema text ``s`` to the module-level ``schema_file``.

    The file is opened in 'w' mode, so any previous content is replaced.
    """
    # ``with`` closes the handle even if write() raises.
    with open(schema_file, 'w') as schema_out:
        schema_out.write(s)
def make_seq(t, cnt):
    """Return a new list holding ``cnt`` copies of ``t``."""
    return [t] * cnt
def make_schema(types):
    """Build a ``create table t1`` statement for ``types`` and write it to the schema file.

    Columns are named c1..cN in the order given and c1 is declared as the
    primary key. The previous schema file is removed, the statement is printed
    and then written through write_schema().

    Raises:
        ValueError: if ``types`` is empty. The primary key refers to c1, so a
            table without columns cannot be described; the old code silently
            wrote a mangled statement in that case.
    """
    if not types:
        raise ValueError("make_schema needs at least one column type")
    remove_schema()
    columns = ", ".join("c%d %s" % (col_id, t) for col_id, t in enumerate(types, 1))
    s = "create table t1 (" + columns + ', primary key (c1))'
    print(s)
    write_schema(s)
# Remove the result file of the older flat sort benchmark, if present.
if os.path.exists("sort_result"):
    os.remove("sort_result")

#cmd_form = "LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -GBR -t sort -s c20.schema -r 1000 -c 10 -i4".split()
cmd_form = "LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -GR -t sort -s c20.schema -r 1000 -c 10 -i4".split()
# Token 6 follows "-s": point the tool at the schema file this script generates.
cmd_form[6] = schema_file

# Short type tag -> column type text used in generated schemas.
types_to_test = {
    'bigint': 'bigint',
    'double': 'double',
    'float': 'float',
    'timestamp': 'timestamp',
    'number': 'number(20,3)',
    'v32': 'varchar(32)',
    'v64': 'varchar(64)',
    'v128': 'varchar(128)',
}
keys = sorted(types_to_test)

row_counts = [1000, 2000, 4000, 8000, 10000, 20000, 50000]
sort_column_counts = [1, 2, 3, 5]
input_col_cnts = [1, 2, 6]
case_run_time = 7

total_case_count = len(row_counts) * len(sort_column_counts) * len(input_col_cnts) * len(keys)
case_count = 0
def make_headed_seq(head, arr):
    """Return the column types for ``arr`` with ``head`` moved to the front.

    ``head`` and the items of ``arr`` are keys of the module-level
    ``types_to_test``. The first occurrence of ``head`` in ``arr`` is moved to
    position 0, the other keys keep their order, and every key is mapped to
    its type text.

    Raises:
        ValueError: if ``head`` is not in ``arr`` (from ``list.index``).
    """
    head_pos = arr.index(head)
    ordered_keys = [head] + arr[0:head_pos] + arr[head_pos + 1:]
    return [types_to_test[key] for key in ordered_keys]
#for t in keys:
# Only the single type selected through Config is benchmarked.
if Config.u_to_test_type is not None:
    #outfile = 'sort.result.' + t
    t = Config.u_to_test_type
    outfile = 'sort_add_' + t + '_' + 'result'
    if os.path.exists(outfile):
        os.remove(outfile)

    for col_repeat in input_col_cnts:
        # Schema: the tested type first, then the remaining types, repeated col_repeat times.
        make_schema(make_headed_seq(t, keys) * col_repeat)
        total_cols = col_repeat * len(keys)

        for row_count in row_counts:
            cmd_form[8] = str(row_count)

            for order_count in sort_column_counts:
                cmd_form[-2] = str(order_count)
                cmd_line = ' '.join(cmd_form)
                case_count += 1

                prompt = "# %d / %d %s col_cnt = %d rc = %d order_cnt = %d\n# %s" % (
                    case_count, total_case_count, t, total_cols, row_count, order_count, cmd_line)
                print(prompt)
                sp.check_call('echo "%s" >> ' % prompt + outfile, shell=True)

                # Cannot sort on more columns than the table has; record a marker.
                if order_count > total_cols:
                    print('PASS')
                    sp.check_call('echo "# PASS" >> ' + outfile, shell=True)
                    continue

                for run_idx in xrange(case_run_time):
                    print(run_idx)
                    # Prefix the tool's output line with the case parameters.
                    for field in (row_count, col_repeat, order_count):
                        sp.check_call("echo -n '%s,' >> " % str(field) + outfile, shell=True)
                    sp.check_call(cmd_line + ' >> ' + outfile, shell=True)

View File

@ -0,0 +1,94 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import subprocess as sp
import os
schema_file = 'sort.schema'
outfile = 'sort.result'
def remove_schema():
    """Remove the module-level ``schema_file`` from disk when it is present."""
    schema_present = os.path.exists(schema_file)
    if schema_present:
        os.remove(schema_file)
def write_schema(s):
    """Replace the content of the module-level ``schema_file`` with ``s``."""
    # The context manager closes the file even when the write fails.
    with open(schema_file, 'w') as of:
        of.write(s)
def make_seq(t, cnt):
    """Build a list that repeats the column type ``t`` ``cnt`` times."""
    repeated = [t] * cnt
    return repeated
def make_schema(types):
    """Generate and store the ``create table t1`` statement for ``types``.

    The i-th entry of ``types`` becomes column ``c<i>`` (1-based) and c1 is
    the primary key. The old schema file is removed, the statement is printed
    and then written through write_schema().

    Raises:
        ValueError: if ``types`` is empty. Previously this produced the
            invalid text "create table t1, primary key (c1))".
    """
    if not types:
        raise ValueError("make_schema needs at least one column type")
    remove_schema()
    column_defs = ["c%d %s" % (col_id, t) for col_id, t in enumerate(types, 1)]
    s = "create table t1 (" + ", ".join(column_defs) + ', primary key (c1))'
    print(s)
    write_schema(s)
# Remove the result file of the older flat sort benchmark, if present.
if os.path.exists("sort_result"):
    os.remove("sort_result")

cmd_form = "LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -GBR -t sort -s c20.schema -r 1000 -c 10 -i1".split()
# Token 6 follows "-s": use the schema file generated by this script.
cmd_form[6] = schema_file

# Short type tag -> column type text used in generated schemas.
types_to_test = {
    'bigint': 'bigint',
    'double': 'double',
    'float': 'float',
    'timestamp': 'timestamp',
    'number': 'number(20,3)',
    'v32': 'varchar(32)',
    'v64': 'varchar(64)',
    'v128': 'varchar(128)',
}
keys = sorted(types_to_test)

row_counts = [1000, 2000, 4000, 7000, 8000, 10000, 20000, 50000]
sort_column_counts = [1, 2, 3]
input_col_cnts = [1, 2, 3, 6]
case_run_time = 7

total_case_count = len(row_counts) * len(sort_column_counts) * len(input_col_cnts) * len(keys)
case_count = 0
def make_headed_seq(head, arr):
    """Map ``arr`` to column types, with the type for ``head`` placed first.

    Keys are looked up in the module-level ``types_to_test``. Apart from
    moving the first occurrence of ``head`` to the front, the order of
    ``arr`` is preserved. Raises ValueError if ``head`` is not in ``arr``.
    """
    pos = arr.index(head)
    before, after = arr[0:pos], arr[pos + 1:]
    return [types_to_test[key] for key in [head] + before + after]
# One result file per tested type: sort.result.<type tag>.
for t in keys:
    outfile = 'sort.result.' + t
    if os.path.exists(outfile):
        os.remove(outfile)

    for col_repeat in input_col_cnts:
        # Schema: the tested type first, then the remaining types, repeated col_repeat times.
        make_schema(make_headed_seq(t, keys) * col_repeat)
        total_cols = col_repeat * len(keys)

        for row_count in row_counts:
            cmd_form[8] = str(row_count)

            for order_count in sort_column_counts:
                cmd_form[-2] = str(order_count)
                cmd_line = ' '.join(cmd_form)
                case_count += 1

                prompt = "# %d / %d %s col_cnt = %d rc = %d order_cnt = %d\n# %s" % (
                    case_count, total_case_count, t, total_cols, row_count, order_count, cmd_line)
                print(prompt)
                sp.check_call('echo "%s" >> ' % prompt + outfile, shell=True)

                # Cannot sort on more columns than the table has; record a marker.
                if order_count > total_cols:
                    print('PASS')
                    sp.check_call('echo "# PASS" >> ' + outfile, shell=True)
                    continue

                for run_idx in xrange(case_run_time):
                    print(run_idx)
                    # Prefix the tool's output line with the case parameters.
                    for field in (row_count, col_repeat, order_count):
                        sp.check_call("echo -n '%s,' >> " % str(field) + outfile, shell=True)
                    sp.check_call(cmd_line + ' >> ' + outfile, shell=True)

View File

@ -0,0 +1 @@
create table t1 (c1 bigint, c2 bigint, c3 bigint, c4 bigint, c5 bigint, c6 bigint, c7 bigint, c8 bigint, c9 bigint, c10 bigint, primary key(c1))

View File

@ -0,0 +1,2 @@
create table t1(c1 bigint,c2 bigint,c3 bigint,c4 bigint,c5 bigint,c6 bigint,c7 bigint,c8 bigint,c9 bigint,c10 bigint,primary key(c1))
create table t2(c1 bigint,c2 bigint,c3 bigint,c4 bigint,c5 bigint,c6 bigint,c7 bigint,c8 bigint,c9 bigint,c10 bigint,primary key(c1))

View File

@ -0,0 +1,42 @@
from mylog.mylog import MyLogger
import subprocess as sp
# NOTE: the Tester driver below is disabled. It sits inside a bare string
# literal, so Python evaluates and discards it and no class is defined.
# NOTE(review): do_data_process mixes `u_to_test_op_c` / `u_to_test_type_c`
# with `u_to_test_op` / `u_to_test_type` (no `_c` suffix); confirm the
# attribute names against the Config class before re-enabling.
'''
class Tester(object):
bench_script = "python benchmaster_{0}.py"
data_process_script = 'python preprocess.py -i {0} -o {1} -d'
fit_script = 'python fit_{0}.py'
def __init__(self, conf):
self.conf = conf
def do_all(self):
# MyLogger.log('try to do all test fit plot')
pass
def do_bench(self):
# MyLogger.log('try to do bench')
sp.check_call(Tester.bench_script.format(self.conf.u_to_test_op_c), shell=True)
def do_fit(self):
# MyLogger.log('try to do fit')
sp.check_call(Tester.fit_script.format(self.conf.u_to_test_op_c), shell=True)
def do_plot(self):
# MyLogger.log('try to do plot')
pass
def do_data_process(self):
if self.conf.u_to_test_type_c is None:
sp.check_call(Tester.data_process_script.format(self.conf.u_to_test_op_c + '_result',
self.conf.u_to_test_op_c + '_result_final'), shell=True)
else:
sp.check_call(
Tester.data_process_script.format(self.conf.u_to_test_op + '_' + self.conf.u_to_test_type + '_result',
self.conf.u_to_test_op + '_' + self.conf.u_to_test_type + '_result_final'
), shell=True)
'''
if __name__ == '__main__':
    # Entry point: log the start, then run material.py in a child interpreter.
    # check_call raises CalledProcessError if that script exits non-zero.
    MyLogger.info("start to do cost model unittest")
    script_name = 'material.py'
    sp.check_call('python %s' % script_name, shell=True)

View File

@ -0,0 +1,118 @@
class Config(object):
    '''
    user input info

    Holds the options passed to cost_model_util. Every instance attribute
    whose name ends in ``_c`` corresponds to one entry of ``config_map_dict``,
    which gives the command-line flag for it. Attributes starting with ``is``
    are boolean switches (flag only); all others carry a value and are
    emitted only when they are not None.
    '''
    ################
    operators = {
        'array': 'array',
        'material': 'material',
        'mergegroupby': 'mergegroupby',
        'merge': 'merge',
        'hash': 'hash',
        'miss': 'miss',
        'nl': 'nl',
        'rowstore': 'rowstore',
        'sort_add': 'sort_add',
        'sort': 'sort',
        'sort_with_type': 'sort_with_type'
    }
    types_to_test = {'bigint': 'bigint', 'double': 'double', 'float': 'float', 'timestamp': 'timestamp',
                     'number': 'number(20,3)', 'v32': 'varchar(32)', 'v64': 'varchar(64)', 'v128': 'varchar(128)'}
    # Attribute name -> command-line flag. Several attributes share one flag
    # (for example the three *row_count_c attributes all map to -r).
    config_map_dict = {
        'is_printing_help_c': ' -h ',
        'schema_file_c': ' -s ',
        'row_count_c': ' -r ',
        'left_row_count_c': ' -r ',
        'right_row_count_c': ' -r ',
        'sort_col_count_c': ' -c ',
        'input_projector_count_c': ' -p ',
        'left_pj_c': ' -p ',
        'right_pj_c': ' -p ',
        'is_printing_output_c': ' -O ',
        'equal_cond_count_c': ' -e ',
        'other_cond_count_c': ' -o ',
        'u_to_test_op_c': ' -t ',
        'u_to_test_type_c': '',
        'is_binding_cpu_c': ' -B ',
        'seed_min_c': ' -Z ',
        'left_min_c': ' -Z ',
        'right_min_c': ' -Z ',
        'seed_max_c': ' -X ',
        # Padded with spaces like every other flag (was '-X').
        'left_max_c': ' -X ',
        'right_max_c': ' -X ',
        'seed_step_c': ' -C ',
        'left_seed_step_c': ' -C ',
        'right_seed_step_c': ' -C ',
        'seed_step_len_c': ' -V ',
        'left_seed_step_len_c': ' -V ',
        'right_seed_step_len_c': ' -V ',
        'limit_c': ' -L ',
        'is_random_c': ' -R ',
        'is_experimental_c': ' -K ',
        'sleep_before_test_c': ' -S ',
        'add_sort_column_c': ' -T ',
        'info_type_c': ' -i ',
        'common_prefix_len_c': ' -l ',
        'is_not_running_as_unittest_c': ' -G '
    }

    def __init__(self):
        """Initialise every option as unset (False for switches, None for values)."""
        # config info based on cost_model_util.cpp
        self.is_printing_help_c = False
        self.schema_file_c = None
        self.row_count_c = None
        self.left_row_count_c = None
        self.right_row_count_c = None
        self.sort_col_count_c = None
        self.input_projector_count_c = None
        # Was left_pj / right_pj: without the _c suffix gen_params() never
        # emitted them although config_map_dict has left_pj_c / right_pj_c.
        self.left_pj_c = None
        self.right_pj_c = None
        self.is_printing_output_c = False
        self.equal_cond_count_c = None
        self.other_cond_count_c = None
        self.u_to_test_op_c = None
        self.u_to_test_type_c = None  # special
        self.is_binding_cpu_c = False
        self.seed_min_c = None
        self.left_min_c = None
        # Was misspelled rigt_min_c, which has no config_map_dict entry and
        # raised KeyError in gen_params() as soon as it was set.
        self.right_min_c = None
        self.seed_max_c = None
        self.left_max_c = None
        self.right_max_c = None
        self.seed_step_c = None
        self.left_seed_step_c = None
        self.right_seed_step_c = None
        self.seed_step_len_c = None
        self.left_seed_step_len_c = None
        self.right_seed_step_len_c = None
        self.limit_c = None
        self.is_random_c = False
        self.is_experimental_c = False
        self.sleep_before_test_c = None
        self.add_sort_column_c = None
        self.info_type_c = None
        self.common_prefix_len_c = None
        self.is_not_running_as_unittest_c = False

    def gen_params(self):
        """Return the command-line argument string for the options currently set.

        If help is requested only " -h " is returned. Otherwise the ``_c``
        attributes are visited in the sorted order given by dir(): switches
        add their flag when they are exactly True, valued options add
        "<flag> <value> " when the value is not None.

        Raises:
            KeyError: if a ``_c`` attribute without a config_map_dict entry
                has been set on the instance.
        """
        if self.is_printing_help_c:
            return " -h "
        args = " "
        for key in dir(self):
            if not key.endswith('_c') or key == 'is_printing_help_c':
                continue
            val = getattr(self, key)
            # MyLogger.info("config object %s %s", key, val)
            if key.startswith('is'):
                if val is True:
                    args = args + Config.config_map_dict[key]
            elif val is not None:
                args = args + Config.config_map_dict[key]
                args = args + " " + str(val) + " "
        return args
if __name__ == '__main__':
    # Smoke check: with help requested, gen_params() yields only the help flag.
    conf = Config()
    conf.is_printing_help_c = True
    print(conf.gen_params())

View File

@ -0,0 +1,148 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import math
import numpy as np
from scipy.optimize import leastsq
from scipy.optimize import curve_fit
import sys
from lmfit import Model
import getopt
import os
#
# def array_model_form(args):
# # (
# # Nelem,
# # ) = args
#
# Telem_ence = 0.00898860
# Telem_copy = 0.00631888
#
# Nelem = args
#
# ELEM_PER_PAGE = 1024
# extend_cnt = math.ceil(math.log(float(Nelem)/ELEM_PER_PAGE, 2))
# if extend_cnt < 0:
# extend_cnt = 0
# copy_cnt = ELEM_PER_PAGE * (math.pow(2, extend_cnt) - 1)
#
# total_cost = Telem_ence * Nelem
# #total_cost += Tmem_alloc * extend_cnt
# total_cost += Telem_copy * copy_cnt
#
# return total_cost
def array_model_form(args,
                     Telem_ence,
                     Telem_copy):
    """Predict the cost of one array case.

    cost = Telem_ence * Nelem + Telem_copy * copy_cnt, where
    extend_cnt = max(0, ceil(log2(Nelem / 1024))) and
    copy_cnt = 1024 * (2 ** extend_cnt - 1).
    Presumably this models an array that starts at ELEM_PER_PAGE elements and
    doubles on every extension - TODO confirm against cost_model_util.

    Args:
        args: the element count Nelem (a single number).
        Telem_ence: cost coefficient applied once per element.
        Telem_copy: cost coefficient applied once per copied element.

    Returns:
        The modelled total cost. A non-positive element count is treated as
        "no extension"; it used to raise a math domain error in math.log.
    """
    Nelem = args
    ELEM_PER_PAGE = 1024
    extend_cnt = 0
    if Nelem > 0:
        # The log is negative while the data fits in the first page; clamp to 0.
        extend_cnt = max(0, math.ceil(math.log(float(Nelem) / ELEM_PER_PAGE, 2)))
    copy_cnt = ELEM_PER_PAGE * (math.pow(2, extend_cnt) - 1)
    total_cost = Telem_ence * Nelem
    # The Tmem_alloc * extend_cnt term (and Tstartup) are currently not part of the model.
    total_cost += Telem_copy * copy_cnt
    return total_cost
def material_model_arr(arg_sets,
                       Telem_ence,
                       Telem_copy):
    """Evaluate array_model_form for every case in ``arg_sets``.

    Returns a numpy array with one predicted cost per case; this is the
    function wrapped by the lmfit Model below.
    """
    return np.array([array_model_form(one_case, Telem_ence, Telem_copy)
                     for one_case in arg_sets])


# Fit model around the vectorised cost function; both coefficients are
# constrained to be non-negative. The Tstartup and Tmem_alloc terms are disabled.
material_model = Model(material_model_arr)
# material_model.set_param_hint("Tstartup", min=0.0)
material_model.set_param_hint("Telem_ence", min=0.0)
material_model.set_param_hint("Telem_copy", min=0.0)
#material_model.set_param_hint("Tmem_alloc", min=0.0)
def extract_info_from_line(line):
    """Parse one comma-separated record into a list of floats.

    Raises ValueError if any field is not a valid float literal.
    """
    return [float(field) for field in line.split(",")]
if __name__ == '__main__':
    # Usage: [-i <preprocessed data file>] [-o <model output file>]
    #file_name = "scan_model.res.formal.prep"
    #out_file_name = "scan_model.fit"
    file_name = "array_result_final"
    out_file_name = "array_model"
    # Drop a stale default model file before fitting.
    if os.path.exists(out_file_name):
        os.remove(out_file_name)
    #sys.argv.extend("-i arr.prep -o arr.model".split(" "))
    output_fit_res = True
    wrong_arg = False
    opts, args = getopt.getopt(sys.argv[1:], "i:o:")
    for op, value in opts:
        if "-i" == op:
            file_name = value
        elif "-o" == op:
            output_fit_res = True
            out_file_name = value
        else:
            wrong_arg = True
    if wrong_arg:
        print("wrong arg")
        sys.exit(1)

    arg_sets = []
    times = []
    # ``with`` closes the input even if a record fails to parse.
    with open(file_name, "r") as in_file:
        for line in in_file:
            # Skip comment headers and blank lines.
            if not line.strip() or line.startswith('#'):
                continue
            case_param = extract_info_from_line(line)
            arg_sets.append((case_param[0]))  # element count
            times.append(case_param[1])       # measured time

    arg_sets_np = np.array(arg_sets)
    times_np = np.array(times)
    result = material_model.fit(times_np, arg_sets=arg_sets_np,
                                # Tstartup=10.0,
                                Telem_ence=1.0,
                                Telem_copy=1.0,
                                #Tmem_alloc=1.0
                                )
    # Model file format: the fitted coefficients on one comma-separated line.
    res_line = str(result.best_values["Telem_ence"]) + ","
    res_line += str(result.best_values["Telem_copy"])
    print(result.fit_report())
    if output_fit_res:
        with open(out_file_name, "w") as out_file:
            out_file.write(res_line)

View File

@ -0,0 +1,162 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import math
import numpy as np
from scipy.optimize import leastsq
from scipy.optimize import curve_fit
import sys
from lmfit import Model
import getopt
def mg_model_form(args,
                  Tstartup,
                  Trow_once,
                  Tres_once,
                  Taggr_prepare_result,
                  Taggr_process,
                  Tgroup_hash_col,
                  Tcopy_col):
    """Predict the cost of one group-by case from its counts and the cost coefficients.

    ``args`` is the 5-tuple (Nrow_input, Nrow_res, Ncol_input, Ncol_aggr,
    Ncol_group); the remaining parameters are the coefficients being fitted.
    """
    Nrow_input, Nrow_res, Ncol_input, Ncol_aggr, Ncol_group = args

    # Fixed start-up cost plus per-result-row and per-input-row overhead.
    base_cost = Tstartup + Nrow_res * Tres_once + Nrow_input * Trow_once
    # cost for judge group
    group_judge_cost = Nrow_input * Ncol_group * Tgroup_hash_col
    # cost for group related operation
    copy_cost = Nrow_res * (Ncol_input * Tcopy_col)
    prepare_cost = Nrow_res * (Ncol_aggr * Taggr_prepare_result)
    # cost for input row process
    process_cost = Nrow_input * (Ncol_aggr * Taggr_process)

    return base_cost + group_judge_cost + copy_cost + prepare_cost + process_cost
# Number of times the fitter has evaluated the model function.
eval_count = 0


def mg_model_arr(arg_sets,
                 Tstartup,
                 Trow_once,
                 Tres_once,
                 Taggr_prepare_result,
                 Taggr_process,
                 Tgroup_hash_col,
                 Tcopy_col):
    """Evaluate mg_model_form for every case in ``arg_sets``.

    Increments the module-level ``eval_count``, prints it as a progress
    marker and returns the predicted costs as a numpy array.
    """
    global eval_count
    predicted = [mg_model_form(one_case,
                               Tstartup,
                               Trow_once,
                               Tres_once,
                               Taggr_prepare_result,
                               Taggr_process,
                               Tgroup_hash_col,
                               Tcopy_col)
                 for one_case in arg_sets]
    eval_count += 1
    print("eval " + str(eval_count))
    return np.array(predicted)


# Fit model around the vectorised cost function; every coefficient is
# constrained to be non-negative.
mg_model = Model(mg_model_arr)
mg_model.set_param_hint("Tstartup", min=0.0)
mg_model.set_param_hint("Trow_once", min=0.0)
mg_model.set_param_hint("Tres_once", min=0.0)
mg_model.set_param_hint("Taggr_prepare_result", min=0.0)
mg_model.set_param_hint("Taggr_process", min=0.0)
mg_model.set_param_hint("Tgroup_hash_col", min=0.0)
mg_model.set_param_hint("Tcopy_col", min=0.0)
def extract_info_from_line(line):
    """Split ``line`` on commas and convert every field to float.

    Raises ValueError when a field cannot be converted.
    """
    return [float(item) for item in line.split(",")]
if __name__ == '__main__':
    # Usage: [-i <preprocessed data file>] [-o <model output file>]
    # The model file is written only when -o is given.
    file_name = "scan_model.res.formal.prep"
    out_file_name = "scan_model.fit"
    output_fit_res = False
    wrong_arg = False
    opts, args = getopt.getopt(sys.argv[1:], "i:o:")
    for op, value in opts:
        if "-i" == op:
            file_name = value
        elif "-o" == op:
            output_fit_res = True
            out_file_name = value
        else:
            wrong_arg = True
    if wrong_arg:
        print("wrong arg")
        sys.exit(1)

    arg_sets = []
    times = []
    # ``with`` closes the input even if a record fails to parse.
    with open(file_name, "r") as in_file:
        for line in in_file:
            # Skip comment headers and blank lines, as the sibling fit
            # scripts do; they used to crash float() here.
            if not line.strip() or line.startswith('#'):
                continue
            case_param = extract_info_from_line(line)
            arg_sets.append((case_param[0],   # Nrow_input
                             case_param[5],   # Nrow_res
                             case_param[4],   # Ncol_input
                             case_param[2],   # Ncol_aggr
                             case_param[3]))  # Ncol_group
            times.append(case_param[6])       # measured time

    arg_sets_np = np.array(arg_sets)
    times_np = np.array(times)
    result = mg_model.fit(times_np, arg_sets=arg_sets_np,
                          Tstartup=0.1,
                          Trow_once=0.1,
                          Tres_once=0.1,
                          Taggr_prepare_result=0.1,
                          Taggr_process=0.1,
                          Tgroup_hash_col=0.1,
                          Tcopy_col=0.1)
    # Model file format: the fitted coefficients on one comma-separated line.
    res_line = str(result.best_values["Tstartup"]) + ","
    res_line += str(result.best_values["Trow_once"]) + ","
    res_line += str(result.best_values["Tres_once"]) + ","
    res_line += str(result.best_values["Taggr_prepare_result"]) + ","
    res_line += str(result.best_values["Taggr_process"]) + ","
    res_line += str(result.best_values["Tgroup_hash_col"]) + ","
    res_line += str(result.best_values["Tcopy_col"])
    print(result.fit_report())
    if output_fit_res:
        with open(out_file_name, "w") as out_file:
            out_file.write(res_line)

View File

@ -0,0 +1,111 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import math
import numpy as np
from scipy.optimize import leastsq
from scipy.optimize import curve_fit
import sys
from lmfit import Model
import getopt
import os
def material_model_form(args,
                        # Tstartup,
                        Trow_once,
                        Trow_col):
    """Predict the cost of one material case.

    ``args`` is the pair (Nrow, Ncol). The cost is
    Nrow * (Trow_once + Ncol * Trow_col); the start-up term is disabled.
    """
    Nrow, Ncol = args
    startup_cost = 0  # Tstartup
    return startup_cost + Nrow * (Trow_once + Ncol * Trow_col)
def material_model_arr(arg_sets,
                       # Tstartup,
                       Trow_once,
                       Trow_col):
    """Evaluate material_model_form for every (Nrow, Ncol) case in ``arg_sets``.

    Returns the predicted costs as a numpy array, one entry per case.
    """
    return np.array([material_model_form(one_case, Trow_once, Trow_col)
                     for one_case in arg_sets])


# Fit model around the vectorised cost function; both coefficients are
# constrained to be non-negative. The Tstartup term is disabled.
material_model = Model(material_model_arr)
# material_model.set_param_hint("Tstartup", min=0.0)
material_model.set_param_hint("Trow_once", min=0.0)
material_model.set_param_hint("Trow_col", min=0.0)
def extract_info_from_line(line):
    """Return the comma-separated fields of ``line`` as floats.

    Raises ValueError if a field is not numeric.
    """
    fields = line.split(",")
    return [float(field) for field in fields]
if __name__ == '__main__':
    # Usage: [-i <preprocessed data file>] [-o <model output file>]
    # file_name = "scan_model.res.formal.prep"
    file_name = "material_result_final"
    # out_file_name = "scan_model.fit"
    out_file_name = "material_model"
    # Drop a stale default model file before fitting.
    if os.path.exists(out_file_name):
        os.remove(out_file_name)
    # sys.argv.extend("-i rowstore.prepare.bigint -o rowstore.model".split(" "))
    output_fit_res = True
    wrong_arg = False
    opts, args = getopt.getopt(sys.argv[1:], "i:o:")
    for op, value in opts:
        if "-i" == op:
            file_name = value
        elif "-o" == op:
            output_fit_res = True
            out_file_name = value
        else:
            wrong_arg = True
    if wrong_arg:
        print("wrong arg")
        sys.exit(1)

    arg_sets = []
    times = []
    # ``with`` closes the input even if a record fails to parse.
    with open(file_name, "r") as in_file:
        for line in in_file:
            # Skip comment headers and blank lines.
            if not line.strip() or line.startswith('#'):
                continue
            case_param = extract_info_from_line(line)
            arg_sets.append((case_param[0], case_param[1]))  # (Nrow, Ncol)
            times.append(case_param[3])                      # measured time

    arg_sets_np = np.array(arg_sets)
    times_np = np.array(times)
    result = material_model.fit(times_np, arg_sets=arg_sets_np,
                                # Tstartup=10.0,
                                Trow_once=10.0,
                                Trow_col=1.0)
    # Model file format: the fitted coefficients on one comma-separated line.
    res_line = str(result.best_values["Trow_once"]) + ","
    res_line += str(result.best_values["Trow_col"])
    print(result.fit_report())
    if output_fit_res:
        with open(out_file_name, "w") as out_file:
            out_file.write(res_line)

View File

@ -0,0 +1,167 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import math
import numpy as np
from scipy.optimize import leastsq
from scipy.optimize import curve_fit
import sys
from lmfit import Model
import getopt
def merge_model_form(args,
                     Tstartup,
                     Tres_right_op,
                     Tres_right_cache,
                     Tmatch_group,
                     #Tassemble_row,
                     Tequal_fail,
                     Trow_left,
                     Trow_right):
    """Predict the cost of one merge-join case.

    ``args`` is the 7-tuple (Nrow_res, Nrow_left, Nrow_right,
    Nright_cache_in, Nright_cache_out, Nright_cache_clear, Nequal_cond);
    the remaining parameters are the cost coefficients being fitted.
    """
    (Nrow_res,
     Nrow_left,
     Nrow_right,
     Nright_cache_in,
     Nright_cache_out,
     Nright_cache_clear,
     Nequal_cond) = args

    left_cost = Nrow_left * Trow_left
    # Right rows that did not go into the cache.
    right_cost = (Nrow_right - Nright_cache_in) * Trow_right
    cache_in_cost = Nright_cache_in * Tres_right_op
    cache_out_cost = Nright_cache_out * Tres_right_cache
    # The Nrow_res * Tassemble_row term is currently disabled.
    cache_clear_cost = Nright_cache_clear * Tmatch_group
    # NOTE(review): "2 * Tmatch_group" subtracts a cost coefficient from row
    # counts; a count such as Nright_cache_clear may have been intended -
    # confirm before changing, since it alters the fitted model.
    equal_fail_cost = (Nequal_cond - Nrow_res - 2 * Tmatch_group) * Tequal_fail

    return (Tstartup + left_cost + right_cost + cache_in_cost
            + cache_out_cost + cache_clear_cost + equal_fail_cost)
# Number of times the fitter has evaluated the model function.
eval_count = 0


def merge_model_arr(arg_sets,
                    Tstartup,
                    Tres_right_op,
                    Tres_right_cache,
                    Tmatch_group,
                    #Tassemble_row,
                    Tequal_fail,
                    Trow_left,
                    Trow_right):
    """Evaluate merge_model_form for every case in ``arg_sets``.

    Increments the module-level ``eval_count`` and returns the predicted
    costs as a numpy array.
    """
    global eval_count
    predicted = [merge_model_form(one_case,
                                  Tstartup,
                                  Tres_right_op,
                                  Tres_right_cache,
                                  Tmatch_group,
                                  Tequal_fail,
                                  Trow_left,
                                  Trow_right)
                 for one_case in arg_sets]
    eval_count += 1
    return np.array(predicted)


# Fit model around the vectorised cost function; every coefficient is
# constrained to be non-negative. The Tassemble_row term is disabled.
merge_model = Model(merge_model_arr)
merge_model.set_param_hint("Tstartup", min=0.0)
merge_model.set_param_hint("Tres_right_op", min=0.0)
merge_model.set_param_hint("Tres_right_cache", min=0.0)
merge_model.set_param_hint("Tmatch_group", min=0.0)
#merge_model.set_param_hint("Tassemble_row", min=0.0)
merge_model.set_param_hint("Tequal_fail", min=0.0)
merge_model.set_param_hint("Trow_left", min=0.0)
merge_model.set_param_hint("Trow_right", min=0.0)
def extract_info_from_line(line):
    """Convert a comma-separated record into a list of float values.

    Raises ValueError for a non-numeric field.
    """
    return [float(token) for token in line.split(",")]
if __name__ == '__main__':
    # Usage: [-i <preprocessed data file>] [-o <model output file>]
    #
    # The script used to append "-i merge.prep.1 -o merge.model" to sys.argv.
    # Those trailing options were parsed after the user's and always won, so
    # -i / -o on the command line had no effect. The same values are now plain
    # defaults: a run without options behaves as before, explicit options work.
    file_name = "merge.prep.1"
    out_file_name = "merge.model"
    output_fit_res = True
    wrong_arg = False
    opts, args = getopt.getopt(sys.argv[1:], "i:o:")
    for op, value in opts:
        if "-i" == op:
            file_name = value
        elif "-o" == op:
            output_fit_res = True
            out_file_name = value
        else:
            wrong_arg = True
    if wrong_arg:
        print("wrong arg")
        sys.exit(1)

    arg_sets = []
    times = []
    # ``with`` closes the input even if a record fails to parse.
    with open(file_name, "r") as in_file:
        for line in in_file:
            # Skip comment headers and blank lines, as the sibling fit scripts do.
            if not line.strip() or line.startswith('#'):
                continue
            case_param = extract_info_from_line(line)
            arg_sets.append((case_param[6],    # Nrow_res
                             case_param[0],    # Nrow_left
                             case_param[1],    # Nrow_right
                             case_param[-3],   # Nright_cache_in
                             case_param[-2],   # Nright_cache_out
                             case_param[-1],   # Nright_cache_clear
                             case_param[8]))   # Nequal_cond
            times.append(case_param[7])        # measured time

    arg_sets_np = np.array(arg_sets)
    times_np = np.array(times)
    result = merge_model.fit(times_np, arg_sets=arg_sets_np,
                             Tstartup=0.1,
                             Tres_right_op=0.1,
                             Tres_right_cache=0.1,
                             Tmatch_group=1.0,
                             #Tassemble_row=0.5,
                             Tequal_fail=1.0,
                             Trow_left=0.05,
                             Trow_right=0.05)
    # Model file format: the fitted coefficients on one comma-separated line.
    res_line = str(result.best_values["Tstartup"]) + ","
    res_line += str(result.best_values["Tres_right_op"]) + ","
    res_line += str(result.best_values["Tres_right_cache"]) + ","
    res_line += str(result.best_values["Tmatch_group"]) + ","
    res_line += str(result.best_values["Tequal_fail"]) + ","
    res_line += str(result.best_values["Trow_left"]) + ","
    res_line += str(result.best_values["Trow_right"])
    print(result.fit_report())
    if output_fit_res:
        with open(out_file_name, "w") as out_file:
            out_file.write(res_line)

View File

@ -0,0 +1,166 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import math
import numpy as np
from scipy.optimize import leastsq
from scipy.optimize import curve_fit
import sys
from lmfit import Model
import getopt
def mg_model_form(args,
                  #Tstartup,
                  Trow_once,
                  Tres_once,
                  Taggr_prepare_result,
                  Taggr_process,
                  Tgroup_cmp_col,
                  Tcopy_col):
    """Predict the cost of one merge group-by case.

    ``args`` is the 5-tuple (Nrow_input, Nrow_res, Ncol_input, Ncol_aggr,
    Ncol_group); the remaining parameters are the coefficients being fitted.
    There is no start-up term in this model.
    """
    Nrow_input, Nrow_res, Ncol_input, Ncol_aggr, Ncol_group = args

    # Per-result-row and per-input-row overhead.
    base_cost = Nrow_res * Tres_once + Nrow_input * Trow_once
    # cost for judge group: one comparison per result row, Ncol_group
    # comparisons for every remaining input row.
    new_group_cost = Nrow_res * Tgroup_cmp_col
    same_group_cost = (Nrow_input - Nrow_res) * Ncol_group * Tgroup_cmp_col
    # cost for group related operation
    copy_cost = Nrow_res * (Ncol_input * Tcopy_col)
    prepare_cost = Nrow_res * (Ncol_aggr * Taggr_prepare_result)
    # cost for input row process
    process_cost = Nrow_input * (Ncol_aggr * Taggr_process)

    return (base_cost + new_group_cost + same_group_cost
            + copy_cost + prepare_cost + process_cost)
# Number of times the fitter has evaluated the model function.
eval_count = 0


def mg_model_arr(arg_sets,
                 #Tstartup,
                 Trow_once,
                 Tres_once,
                 Taggr_prepare_result,
                 Taggr_process,
                 Tgroup_cmp_col,
                 Tcopy_col):
    """Evaluate mg_model_form for every case in ``arg_sets``.

    Increments the module-level ``eval_count``, prints it as a progress
    marker and returns the predicted costs as a numpy array.
    """
    global eval_count
    predicted = [mg_model_form(one_case,
                               Trow_once,
                               Tres_once,
                               Taggr_prepare_result,
                               Taggr_process,
                               Tgroup_cmp_col,
                               Tcopy_col)
                 for one_case in arg_sets]
    eval_count += 1
    print("eval " + str(eval_count))
    return np.array(predicted)


# Fit model around the vectorised cost function; every coefficient is
# constrained to be non-negative. The Tstartup term is disabled.
mg_model = Model(mg_model_arr)
#mg_model.set_param_hint("Tstartup", min=0.0)
mg_model.set_param_hint("Trow_once", min=0.0)
mg_model.set_param_hint("Tres_once", min=0.0)
mg_model.set_param_hint("Taggr_prepare_result", min=0.0)
mg_model.set_param_hint("Taggr_process", min=0.0)
mg_model.set_param_hint("Tgroup_cmp_col", min=0.0)
mg_model.set_param_hint("Tcopy_col", min=0.0)
def extract_info_from_line(line):
    """Parse a comma-separated data line into floats, in field order.

    Raises ValueError when a field is not numeric.
    """
    return [float(part) for part in line.split(",")]
if __name__ == '__main__':
    # Usage: [-i <preprocessed data file>] [-o <model output file>]
    #file_name = "scan_model.res.formal.prep"
    #out_file_name = "scan_model.fit"
    file_name = "mergegroupby_result_final"
    out_file_name = "mergegroupby_model"
    output_fit_res = True
    wrong_arg = False
    opts, args = getopt.getopt(sys.argv[1:], "i:o:")
    for op, value in opts:
        if "-i" == op:
            file_name = value
        elif "-o" == op:
            output_fit_res = True
            out_file_name = value
        else:
            wrong_arg = True
    if wrong_arg:
        print("wrong arg")
        sys.exit(1)

    arg_sets = []
    times = []
    # ``with`` closes the input even if a record fails to parse.
    with open(file_name, "r") as in_file:
        for line in in_file:
            # Skip comment headers and blank lines.
            if not line.strip() or line.startswith('#'):
                continue
            case_param = extract_info_from_line(line)
            arg_sets.append((case_param[0],   # Nrow_input
                             case_param[5],   # Nrow_res
                             case_param[4],   # Ncol_input
                             case_param[2],   # Ncol_aggr
                             case_param[3]))  # Ncol_group
            times.append(case_param[6])       # measured time

    arg_sets_np = np.array(arg_sets)
    times_np = np.array(times)
    result = mg_model.fit(times_np, arg_sets=arg_sets_np,
                          #Tstartup = 0.1,
                          Trow_once=0.1,
                          Tres_once=0.1,
                          Taggr_prepare_result=0.1,
                          Taggr_process=0.1,
                          Tgroup_cmp_col=0.1,
                          Tcopy_col=0.1)
    # Model file format: the fitted coefficients on one comma-separated line.
    res_line = str(result.best_values["Trow_once"]) + ","
    res_line += str(result.best_values["Tres_once"]) + ","
    res_line += str(result.best_values["Taggr_prepare_result"]) + ","
    res_line += str(result.best_values["Taggr_process"]) + ","
    res_line += str(result.best_values["Tgroup_cmp_col"]) + ","
    res_line += str(result.best_values["Tcopy_col"])
    print(result.fit_report())
    if output_fit_res:
        with open(out_file_name, "w") as out_file:
            out_file.write(res_line)

View File

@ -0,0 +1,164 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import math
import numpy as np
from scipy.optimize import leastsq
from scipy.optimize import curve_fit
import sys
from lmfit import Model
import getopt
def mg_model_form(args,
                  #Tstartup,
                  Trow_once,
                  Tres_once,
                  Taggr_prepare_result,
                  Taggr_process,
                  Tgroup_cmp_col,
                  Tcopy_col):
    """Return the modelled cost of one merge group-by case.

    ``args`` unpacks to (Nrow_input, Nrow_res, Ncol_input, Ncol_aggr,
    Ncol_group); the other parameters are the fitted cost coefficients.
    The model has no start-up term.
    """
    (Nrow_input, Nrow_res, Ncol_input, Ncol_aggr, Ncol_group) = args

    cost = Nrow_res * Tres_once + Nrow_input * Trow_once
    # cost for judge group: one comparison per result row, Ncol_group
    # comparisons for every remaining input row.
    cost += Nrow_res * Tgroup_cmp_col
    repeated_rows = Nrow_input - Nrow_res
    cost += repeated_rows * Ncol_group * Tgroup_cmp_col
    # cost for group related operation
    per_result_copy = Ncol_input * Tcopy_col
    per_result_prepare = Ncol_aggr * Taggr_prepare_result
    cost += Nrow_res * per_result_copy
    cost += Nrow_res * per_result_prepare
    # cost for input row process
    per_input_process = Ncol_aggr * Taggr_process
    cost += Nrow_input * per_input_process
    return cost
# Counts how often the fitter has called the model function.
eval_count = 0


def mg_model_arr(arg_sets,
                 #Tstartup,
                 Trow_once,
                 Tres_once,
                 Taggr_prepare_result,
                 Taggr_process,
                 Tgroup_cmp_col,
                 Tcopy_col):
    """Apply mg_model_form to each case of ``arg_sets``.

    Bumps the module-level ``eval_count``, prints it, and returns a numpy
    array with one predicted cost per case.
    """
    global eval_count
    coefficients = (Trow_once,
                    Tres_once,
                    Taggr_prepare_result,
                    Taggr_process,
                    Tgroup_cmp_col,
                    Tcopy_col)
    predicted = [mg_model_form(one_case, *coefficients) for one_case in arg_sets]
    eval_count += 1
    print("eval " + str(eval_count))
    return np.array(predicted)


# Fit model around the vectorised cost function; every coefficient is
# constrained to be non-negative. The Tstartup term is disabled.
mg_model = Model(mg_model_arr)
#mg_model.set_param_hint("Tstartup", min=0.0)
mg_model.set_param_hint("Trow_once", min=0.0)
mg_model.set_param_hint("Tres_once", min=0.0)
mg_model.set_param_hint("Taggr_prepare_result", min=0.0)
mg_model.set_param_hint("Taggr_process", min=0.0)
mg_model.set_param_hint("Tgroup_cmp_col", min=0.0)
mg_model.set_param_hint("Tcopy_col", min=0.0)
def extract_info_from_line(line):
    """Turn one comma-separated record into a list of floats.

    Raises ValueError if a field cannot be parsed as a float.
    """
    return [float(value) for value in line.split(",")]
if __name__ == '__main__':
    # Usage: [-i <preprocessed data file>] [-o <model output file>]
    #file_name = "scan_model.res.formal.prep"
    #out_file_name = "scan_model.fit"
    file_name = "mergegroupby_result_final"
    out_file_name = "mergegroupby_model"
    output_fit_res = True
    wrong_arg = False
    opts, args = getopt.getopt(sys.argv[1:], "i:o:")
    for op, value in opts:
        if "-i" == op:
            file_name = value
        elif "-o" == op:
            output_fit_res = True
            out_file_name = value
        else:
            wrong_arg = True
    if wrong_arg:
        print("wrong arg")
        sys.exit(1)

    arg_sets = []
    times = []
    # ``with`` closes the input even if a record fails to parse.
    with open(file_name, "r") as in_file:
        for line in in_file:
            # Skip comment headers and blank lines. This copy lacked the '#'
            # check its sibling script has, so header lines crashed float().
            if not line.strip() or line.startswith('#'):
                continue
            case_param = extract_info_from_line(line)
            arg_sets.append((case_param[0],   # Nrow_input
                             case_param[5],   # Nrow_res
                             case_param[4],   # Ncol_input
                             case_param[2],   # Ncol_aggr
                             case_param[3]))  # Ncol_group
            times.append(case_param[6])       # measured time

    arg_sets_np = np.array(arg_sets)
    times_np = np.array(times)
    result = mg_model.fit(times_np, arg_sets=arg_sets_np,
                          #Tstartup = 0.1,
                          Trow_once=0.1,
                          Tres_once=0.1,
                          Taggr_prepare_result=0.1,
                          Taggr_process=0.1,
                          Tgroup_cmp_col=0.1,
                          Tcopy_col=0.1)
    # Model file format: the fitted coefficients on one comma-separated line.
    res_line = str(result.best_values["Trow_once"]) + ","
    res_line += str(result.best_values["Tres_once"]) + ","
    res_line += str(result.best_values["Taggr_prepare_result"]) + ","
    res_line += str(result.best_values["Taggr_process"]) + ","
    res_line += str(result.best_values["Tgroup_cmp_col"]) + ","
    res_line += str(result.best_values["Tcopy_col"])
    print(result.fit_report())
    if output_fit_res:
        with open(out_file_name, "w") as out_file:
            out_file.write(res_line)

View File

@ -0,0 +1,140 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import math
import numpy as np
from scipy.optimize import leastsq
from scipy.optimize import curve_fit
import sys
from lmfit import Model
import getopt
def get_row_size(col):
    """Return the modelled row size for ``col`` column groups.

    The size is 16 plus, per unit of ``col``, the sum
    3 + 8 + 4 + 8 + 16 + 32 + 64 + 128 and one additional unit.
    NOTE(review): the constants presumably are byte sizes of the row header
    and the tested column types - confirm against the benchmark schema.
    """
    per_col = 3 + 8 + 4 + 8 + 16 + 32 + 64 + 128
    return 16 + col * per_col + col
def round_wasted_spave(rsize, psize):
    """Return ``rsize`` plus an even share of the space left over in ``psize``.

    floor(psize / rsize) rows fit; the remainder of ``psize`` is divided
    evenly among those rows and added to ``rsize``.
    NOTE(review): raises ZeroDivisionError when rsize > psize (no row fits).
    """
    rows_that_fit = math.floor(float(psize / rsize))
    leftover = psize - rows_that_fit * rsize
    return rsize + leftover / rows_that_fit
def get_miss_prob(Nrow, Ncol, Turn):
    """Return the modelled miss probability for a table of Nrow x Ncol.

    The hit ratio is Turn / total_size, capped at 0.9 once ``Turn`` reaches
    90% of the total size; the result is 1 - hit.

    Args:
        Nrow: number of rows.
        Ncol: column count passed on to get_row_size().
        Turn: covered size, in the same unit as get_row_size().
            NOTE(review): the original local name ``TLBcovered`` suggests
            TLB coverage - confirm.
    """
    total_size = Nrow * get_row_size(Ncol)
    TLBcovered = Turn
    if TLBcovered >= 0.9 * total_size:
        hit = 0.9
    else:
        # float() forces true division; with integer arguments Python 2
        # floor-divided here and the hit ratio collapsed to 0.
        hit = float(TLBcovered) / total_size
    return 1 - hit
def sort_model_form(args,
                    Tmiss,
                    Turn):
    """Predict the miss cost of one case.

    ``args`` is the pair (Nrow, Ncol). The cost is
    Nrow * Tmiss * Ncol * get_miss_prob(Nrow, Ncol, Turn).
    """
    Nrow, Ncol = args
    miss_cost = Nrow * Tmiss * Ncol * get_miss_prob(Nrow, Ncol, Turn)
    return 0 + miss_cost
def sort_model_arr(arg_sets,
                   Tmiss,
                   Turn):
    """Evaluate sort_model_form for every (Nrow, Ncol) case in ``arg_sets``.

    Returns the predicted costs as a numpy array, one entry per case.
    """
    return np.array([sort_model_form(one_case, Tmiss, Turn)
                     for one_case in arg_sets])


# Fit model around the vectorised cost function. Tmiss must be non-negative;
# Turn is bounded to [2097152, 2097153], which effectively pins it.
sort_model = Model(sort_model_arr)
sort_model.set_param_hint("Tmiss", min=0.0)
sort_model.set_param_hint("Turn", min=2097152.0, max=2097153.0)
# sort_model.set_param_hint("Tmiss_K2", min=0.0)
def extract_info_from_line(line):
    """Parse one comma-separated line into a list of floats."""
    return [float(field) for field in line.split(",")]
# Script entry point: read preprocessed measurements, fit Tmiss/Turn and report.
# NOTE(review): leading indentation is missing in this copy, so the nesting of the
# statements below (in particular the final comparison loop) must be confirmed
# against the original file.
if __name__ == '__main__':
# Defaults used when -i / -o are not given.
file_name = "miss.prep.1"
out_file_name = "miss.model"
# sys.argv.extend("-i sort.prep.bigint -o sort.model".split(" "))
# The fitted parameters are written to disk only when -o is passed.
output_fit_res = False
wrong_arg = False
# -i input file, -o output model file, -R / -C numeric overrides.
opts,args = getopt.getopt(sys.argv[1:],"i:o:R:C:")
for op, value in opts:
if "-i" == op:
file_name = value
elif "-o" == op:
output_fit_res = True
out_file_name = value
elif "-R" == op:
# NOTE(review): MATERIAL_ROW_ONCE / MATERIAL_ROW_COL are not read anywhere in the
# visible part of this script -- looks like a leftover from the sort fit; confirm.
MATERIAL_ROW_ONCE = float(value)
elif "-C" == op:
MATERIAL_ROW_COL = float(value)
else:
wrong_arg = True
if wrong_arg:
print "wrong arg"
sys.exit(1)
file = open(file_name, "r")
arg_sets = []
times = []
case_params = []
for line in file:
# Lines starting with '#' are skipped.
if line.startswith('#'):
continue
case_param = extract_info_from_line(line)
case_params.append(case_param)
# Columns 0 and 1 are the model inputs (Nrow, Ncol); column 3 is the measured time.
arg_sets.append((case_param[0], case_param[1]))
times.append(case_param[3])
file.close()
arg_sets_np = np.array(arg_sets)
times_np = np.array(times)
#10, 0.20406430879623488, 0.016618100054245379, 14.0, 4.5, 37.0, -0.005, 0.5, -7.0
# Fit the measured times; the keyword values are the initial guesses.
result = sort_model.fit(times_np, arg_sets=arg_sets_np,
Tmiss=1.0,
Turn=2097152,
)
Tmiss = result.best_values["Tmiss"]
Turn = result.best_values["Turn"]
# Model file format: "<Tmiss>,<Turn>" on a single line.
res_line = str(Tmiss) + ","
res_line += str(Turn)
# res_line += str(result.best_values["Tmiss_K2"])
print result.fit_report()
if output_fit_res:
out_file = open(out_file_name, "w")
out_file.write(res_line)
out_file.close()
# Print each case next to its measured time and the cost predicted by the fitted model.
for i, args in enumerate(arg_sets):
cost = sort_model_form(args, Tmiss, Turn)
time = times[i]
print "\t".join([str(args), str(time), str(cost)])

View File

@ -0,0 +1,147 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import math
import numpy as np
from scipy.optimize import leastsq
from scipy.optimize import curve_fit
import sys
from lmfit import Model
import getopt
def nl_model_form(args, Tstartup, Tres, Tfail, Tleft_row, Tright_row):
    """Evaluate the nested-loop join cost model for one case.

    ``args`` is a 7-tuple ``(Nrow_res, Nrow_left, Nrow_right,
    Nright_cache_in, Nright_cache_out, Nright_cache_clear, Nequal_cond)``;
    the three cache counters are unpacked but do not enter the formula.

    cost = Tstartup + Nrow_res * Tres + (Nequal_cond - Nrow_res) * Tfail
           + Nrow_left * Tleft_row + Nrow_right * Tright_row
    """
    (Nrow_res, Nrow_left, Nrow_right,
     _cache_in, _cache_out, _cache_clear,
     Nequal_cond) = args
    failed_conds = Nequal_cond - Nrow_res
    return (Tstartup
            + Nrow_res * Tres
            + failed_conds * Tfail
            + Nrow_left * Tleft_row
            + Nrow_right * Tright_row)
# Number of times the fitter has evaluated nl_model_arr.
eval_count = 0


def nl_model_arr(arg_sets, Tstartup, Tres, Tfail, Tleft_row, Tright_row):
    """Evaluate ``nl_model_form`` for every case and count the evaluation.

    Returns a numpy array of modelled costs, one per entry of ``arg_sets``;
    the module-level ``eval_count`` is incremented once per call.
    """
    global eval_count
    costs = []
    for case in arg_sets:
        costs.append(nl_model_form(case, Tstartup, Tres, Tfail,
                                   Tleft_row, Tright_row))
    eval_count += 1
    return np.array(costs)
# Wrap the vectorised nested-loop model for lmfit and constrain its parameters.
nl_model = Model(nl_model_arr)
# Tstartup is limited to the range [0, 50]; every other coefficient is only non-negative.
nl_model.set_param_hint("Tstartup", min=0.0, max = 50)
#nl_model.set_param_hint("Tqual", min=0.0)
nl_model.set_param_hint("Tres", min=0.0)
nl_model.set_param_hint("Tfail", min=0.0)
nl_model.set_param_hint("Tleft_row", min=0.0)
nl_model.set_param_hint("Tright_row", min=0.0)
def extract_info_from_line(line):
    """Split a comma-separated record and convert every field to float."""
    return [float(token) for token in line.split(",")]
if __name__ == '__main__':
    # Fit the nested-loop join cost model to preprocessed benchmark data.
    #
    # Options: -i <input data file>  (default "nl.prep")
    #          -o <output model file> (default "nl.model")
    #
    # The original appended "-i nl.prep -o nl.model" to sys.argv, which made
    # those values win over anything the user passed.  They are now plain
    # defaults, so a run without options behaves exactly as before while
    # explicit -i / -o are honoured.
    file_name = "nl.prep"
    out_file_name = "nl.model"
    output_fit_res = True
    wrong_arg = False
    try:
        opts, args = getopt.getopt(sys.argv[1:], "i:o:")
    except getopt.GetoptError:
        # getopt raises on unknown options; report it like any other bad argument.
        opts, args = [], []
        wrong_arg = True
    for op, value in opts:
        if "-i" == op:
            file_name = value
        elif "-o" == op:
            output_fit_res = True
            out_file_name = value
        else:
            wrong_arg = True
    if wrong_arg:
        print("wrong arg")
        sys.exit(1)

    arg_sets = []
    times = []
    case_params = []
    with open(file_name, "r") as data_file:
        for line in data_file:
            # Skip blank lines and '#' header lines instead of failing in float().
            if not line.strip() or line.startswith('#'):
                continue
            case_param = extract_info_from_line(line)
            case_params.append(case_param)
            arg_sets.append((case_param[6],    # Nrow_res
                             case_param[0],    # Nrow_left
                             case_param[1],    # Nrow_right
                             case_param[-3],   # Nright_cache_in
                             case_param[-2],   # Nright_cache_out
                             case_param[-1],   # Nright_cache_clear
                             case_param[8]))   # Nequal_cond
            times.append(case_param[7])        # measured time
    arg_sets_np = np.array(arg_sets)
    times_np = np.array(times)

    # Keyword values are the initial guesses for the fit.
    result = nl_model.fit(times_np, arg_sets=arg_sets_np,
                          Tstartup=50.0,
                          Tres=0.3,
                          Tfail=0.3,
                          Tleft_row=0.3,
                          Tright_row=0.3)

    # Model file format: the five coefficients on one comma-separated line.
    param_order = ("Tstartup", "Tres", "Tfail", "Tleft_row", "Tright_row")
    res_line = ",".join(str(result.best_values[name]) for name in param_order)
    print(result.fit_report())
    if output_fit_res:
        with open(out_file_name, "w") as out_file:
            out_file.write(res_line)

View File

@ -0,0 +1,60 @@
import math
import numpy as np
from scipy.optimize import leastsq
from scipy.optimize import curve_fit
import sys
from lmfit import Model
import getopt
import subprocess
import os
import re
# Maps the short tag used as a file-name suffix (rowstore.<kind>.<tag>) to the
# column type label printed in front of the fitted values.
types_to_test = {'bigint':'bigint', 'double':'double', 'float':'float', 'timestamp':'timestamp', 'number':'number(20,3)','v1':'varchar(1)','v32':'varchar(32)', 'v64':'varchar(64)', 'v128':'varchar(128)'}
def run_cmd(cmd):
    """Run ``cmd`` through the shell and return its combined output.

    stdout and stderr are merged; the text is returned with surrounding
    whitespace stripped.  The exit status is not inspected.
    """
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    # communicate() drains the pipe and waits for the process, replacing the
    # manual readline loop and its repeated string concatenation.
    output, _ = p.communicate()
    if not isinstance(output, str):
        # The pipe yields bytes on Python 3; the old ``'' + line`` raised TypeError there.
        output = output.decode("utf-8", "replace")
    return output.strip()
def rm_if_exist(filename):
    """Delete ``filename`` when it exists; do nothing otherwise."""
    if not os.path.exists(filename):
        return
    os.remove(filename)
def extract_kv(k, src):
    """Return the first numeric value that follows ``k:`` in ``src`` as a float.

    ``k`` is inserted into the pattern as-is.  Raises IndexError when
    ``k:`` followed by a number does not occur in ``src`` (same exception
    type the previous implementation produced).
    """
    # Capture the number directly instead of re-splitting the match on
    # whitespace, which failed for "key:value" written without a space.
    found = re.search(k + r':\s*(?P<value>[\d\.e\-\+]+)', src)
    if found is None:
        raise IndexError("no value found for key %r" % (k,))
    return float(found.group('value'))
# For every column type: preprocess the raw results, fit the material model,
# apply it, then print the two fitted coefficients.
for t in sorted(types_to_test):
    result_file_name = 'rowstore.result.' + t
    prep_file_name = 'rowstore.prep.' + t
    model_file = 'rowstore.model.' + t
    fit_file = 'rowstore.fit.' + t

    rm_if_exist(prep_file_name)
    preprocess_cmd = "./preprocess.py -i %s -o %s -t 7 -C 3 -d" % (result_file_name, prep_file_name)
    run_cmd(preprocess_cmd)

    fit_cmd = "./fit_material.py -i " + prep_file_name + " -o " + model_file
    fitres = run_cmd(fit_cmd)

    apply_cmd = "./apply_material_model.py -i %s -o %s -m %s" % (prep_file_name, fit_file, model_file)
    run_cmd(apply_cmd)

    # The coefficients are read back from the fit script's printed output.
    Trow_col = extract_kv('Trow_col', fitres)
    Trow_once = extract_kv('Trow_once', fitres)
    print(types_to_test[t] + ":")
    print(" " + str(Trow_col))
    print(" " + str(Trow_once))

View File

@ -0,0 +1,268 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import math
import numpy as np
from scipy.optimize import leastsq
from scipy.optimize import curve_fit
import sys
from lmfit import Model
import getopt
import os
# Default material-model coefficients; the script entry point may replace
# MATERIAL_ROW_ONCE / MATERIAL_ROW_COL from the -R / -C options.
MATERIAL_ROW_COL = 0.02674675
MATERIAL_ROW_ONCE = 0.07931677
RESERVE_CELL = 0.0044


def material_model_form(args):
    """Return the material cost for ``args = (Nrow, Ncol)``.

    cost = Nrow * (MATERIAL_ROW_ONCE + Ncol * MATERIAL_ROW_COL), using the
    module-level coefficients current at call time.
    """
    Nrow, Ncol = args
    per_row = MATERIAL_ROW_ONCE + Ncol * MATERIAL_ROW_COL
    return 0 + Nrow * per_row
def array_model_form(args):
    """Return the modelled cost of pushing ``args`` elements into an array.

    The array is modelled as starting with 1024 elements and doubling
    ``ceil(log2(Nelem / 1024))`` times (never fewer than zero); each doubling
    copies the elements present at that point.  The cost is a per-element
    term plus a per-copied-element term with fixed coefficients.
    """
    Telem_ence = 0.00898860
    Telem_copy = 0.00631888
    ELEM_PER_PAGE = 1024
    Nelem = args

    doublings = math.ceil(math.log(float(Nelem) / ELEM_PER_PAGE, 2))
    doublings = max(0, doublings)
    copied = ELEM_PER_PAGE * (math.pow(2, doublings) - 1)
    return Telem_ence * Nelem + Telem_copy * copied
def get_row_size(reserve, col):
    """Return the size estimate of one row.

    The estimate is 16, plus 16 per reserved cell, plus for every group of
    eight columns (``col / 8``) the summed widths
    3 + 8 + 4 + 8 + 16 + 32 + 64 + 128 and one extra unit.
    (presumably bytes, with columns cycling through eight types -- TODO confirm)
    """
    col_groups = col / 8
    widths_per_group = 3 + 8 + 4 + 8 + 16 + 32 + 64 + 128
    return 16 + reserve * 16 + col_groups * widths_per_group + col_groups
def round_wasted_spave(rsize, psize):
    """Return ``rsize`` plus an equal share of the space left over in ``psize``.

    ``floor(psize / rsize)`` rows fit; the remainder is spread evenly over
    them.  Raises ZeroDivisionError when no whole row fits.
    """
    rows_that_fit = math.floor(float(psize / rsize))
    leftover = psize - rows_that_fit * rsize
    return rsize + leftover / rows_that_fit
def get_miss_prob(Nrow, Ncol, Nord, Turn):
    """Return the modelled miss probability for a data set.

    The data set size is ``Nrow * get_row_size(Nord, Ncol)``.  The hit ratio
    is ``Turn / size`` and is fixed at 0.9 once ``Turn`` reaches 90% of the
    size (the original names ``Turn`` "TLBcovered").
    """
    total_size = Nrow * get_row_size(Nord, Ncol)
    if Turn >= 0.9 * total_size:
        return 1 - 0.9
    return 1 - Turn / total_size
def sort_model_form(args, Tcompare, Tmiss_K1, Turn):
    """Evaluate the sort cost model for one ``(Nrow, Ncol, Nordering)`` case.

    Per comparison the cost is ``Tcompare`` times the ordering-column count
    capped at 1, plus ``Tmiss_K1`` times the miss probability; the total is
    that amount times ``Nrow * log2(Nrow)``.
    """
    Nrow, Ncol, Nordering = args
    # Any number of ordering columns >= 1 is charged as a single comparison.
    Nordering_cmp = 1 if Nordering >= 1 else Nordering
    miss_prob = get_miss_prob(Nrow, Ncol, Nordering, Turn)
    compare_cost = Tcompare * Nordering_cmp + Tmiss_K1 * miss_prob
    return 0 + Nrow * compare_cost * math.log(Nrow, 2)
def sort_model_arr(arg_sets, Tcompare, Tmiss_K1, Turn):
    """Evaluate ``sort_model_form`` for every case in ``arg_sets``.

    Returns a numpy array with one modelled cost per case, in input order.
    """
    return np.array([sort_model_form(case, Tcompare, Tmiss_K1, Turn)
                     for case in arg_sets])
# Wrap the vectorised sort model for lmfit; only the hints that are not
# commented out below are active (Tcompare, Tmiss_K1, Turn).
sort_model = Model(sort_model_arr)
# Hints for parameters of earlier model variants, kept commented out:
# #sort_model.set_param_hint("Tstartup", min=0.0)
# #sort_model.set_param_hint("Trow_startup", min=0.0)
# sort_model.set_param_hint("Trow_col", min=0.0)
# #sort_model.set_param_hint("Tcmp_startup", min=0.0)
# sort_model.set_param_hint("Trow_once", min=0.0)
# sort_model.set_param_hint("Tcompare", min=0.0)
# sort_model.set_param_hint("Talloc", min=0.0)
# sort_model.set_param_hint("Treserve_cell", min=0.0)
# sort_model.set_param_hint("Tstartup", min=0)
# sort_model.set_param_hint("Trowstore_once", min=0.0)
# sort_model.set_param_hint("Trowstore_col", min=0.0)
# sort_model.set_param_hint("Tarray_once", min=0.0)
# sort_model.set_param_hint("Tarray_elem_copy", min=0.0)
# sort_model.set_param_hint("Tordercol", min=0.0)
# sort_model.set_param_hint("Treserve_cell", min=0.0)
sort_model.set_param_hint("Tcompare", min=0.0)
# sort_model.set_param_hint("Trow_once", min=0.0)
sort_model.set_param_hint("Tmiss_K1", min=0.0)
# The bounds confine Turn to [2097152, 2097153], i.e. it is effectively pinned at 2 * 1024 * 1024.
sort_model.set_param_hint("Turn", min=2097152.0, max=2097153.0)
# sort_model.set_param_hint("Tmiss_K2", min=0.0)
def extract_info_from_line(line):
    """Convert a comma-separated text line into a list of floats."""
    return [float(part) for part in line.split(",")]
if __name__ == '__main__':
    # Fit the sort cost model.  Defaults below are overridden by
    # -i <input>, -o <output>, -R <MATERIAL_ROW_ONCE>, -C <MATERIAL_ROW_COL>.
    file_name = "sort_result_final"
    out_file_name = "sort_model"
    # The default output file is removed up front, before options are parsed.
    if os.path.exists(out_file_name):
        os.remove(out_file_name)
    output_fit_res = True
    wrong_arg = False

    opts, args = getopt.getopt(sys.argv[1:], "i:o:R:C:")
    for op, value in opts:
        if op == "-i":
            file_name = value
        elif op == "-o":
            output_fit_res = True
            out_file_name = value
        elif op == "-R":
            MATERIAL_ROW_ONCE = float(value)
        elif op == "-C":
            MATERIAL_ROW_COL = float(value)
        else:
            wrong_arg = True
    if wrong_arg:
        print("wrong arg")
        sys.exit(1)

    # Load the measurements: columns 0-2 are the model inputs
    # (Nrow, Ncol, Nordering), column 4 is the measured time.
    arg_sets = []
    times = []
    case_params = []
    file = open(file_name, "r")
    for line in file:
        if line.startswith('#'):
            continue
        case_param = extract_info_from_line(line)
        case_params.append(case_param)
        arg_sets.append((case_param[0], case_param[1], case_param[2]))
        times.append(case_param[4])
    file.close()
    arg_sets_np = np.array(arg_sets)
    times_np = np.array(times)

    # Keyword values are the initial guesses for the fit.
    result = sort_model.fit(times_np, arg_sets=arg_sets_np,
                            Tcompare=1.0,
                            Tmiss_K1=1.0,
                            Turn=2097152)

    # Model file format: "<Tcompare>,<Tmiss_K1>,<Turn>".
    best = result.best_values
    res_line = ",".join([str(best["Tcompare"]),
                         str(best["Tmiss_K1"]),
                         str(best["Turn"])])
    print(result.fit_report())
    if output_fit_res:
        out_file = open(out_file_name, "w")
        out_file.write(res_line)
        out_file.close()

View File

@ -0,0 +1,271 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import math
import numpy as np
from scipy.optimize import leastsq
from scipy.optimize import curve_fit
import sys
from lmfit import Model
import getopt
import os
from cost_test_conf import Config
# Default material-model coefficients; the script entry point may replace
# MATERIAL_ROW_ONCE / MATERIAL_ROW_COL from the -R / -C options.
MATERIAL_ROW_COL = 0.02674675
MATERIAL_ROW_ONCE = 0.07931677
RESERVE_CELL = 0.0044


def material_model_form(args):
    """Return the material cost for ``args = (Nrow, Ncol)``.

    cost = Nrow * (MATERIAL_ROW_ONCE + Ncol * MATERIAL_ROW_COL), read from
    the module-level coefficients at call time.
    """
    Nrow, Ncol = args
    cost_per_row = MATERIAL_ROW_ONCE + Ncol * MATERIAL_ROW_COL
    return 0 + Nrow * cost_per_row
def array_model_form(args):
    """Return the modelled cost of pushing ``args`` elements into an array.

    The array is modelled as 1024 elements that double
    ``ceil(log2(Nelem / 1024))`` times (clamped at zero); every doubling
    copies the elements held so far.  Fixed coefficients weight the
    per-element and per-copied-element terms.
    """
    Telem_ence = 0.00898860
    Telem_copy = 0.00631888
    ELEM_PER_PAGE = 1024
    Nelem = args

    growth_steps = max(0, math.ceil(math.log(float(Nelem) / ELEM_PER_PAGE, 2)))
    copied_elems = ELEM_PER_PAGE * (math.pow(2, growth_steps) - 1)
    return Telem_ence * Nelem + Telem_copy * copied_elems
def get_row_size(reserve, col):
    """Return the size estimate of one row.

    16, plus 16 per reserved cell, plus for each group of eight columns
    (``col / 8``) the widths 3 + 8 + 4 + 8 + 16 + 32 + 64 + 128 and one
    extra unit.  (presumably bytes -- TODO confirm)
    """
    groups = col / 8
    group_width = 3 + 8 + 4 + 8 + 16 + 32 + 64 + 128
    return 16 + reserve * 16 + groups * group_width + groups
def round_wasted_spave(rsize, psize):
    """Return ``rsize`` plus an equal share of the space left over in ``psize``.

    ``floor(psize / rsize)`` rows fit; the remainder is spread evenly over
    them.  Raises ZeroDivisionError when no whole row fits.
    """
    rows_that_fit = math.floor(float(psize / rsize))
    leftover = psize - rows_that_fit * rsize
    return rsize + leftover / rows_that_fit
def get_miss_prob(Nrow, Ncol, Nord, Turn):
    """Return the modelled miss probability for a data set.

    Size is ``Nrow * get_row_size(Nord, Ncol)``; the hit ratio is
    ``Turn / size`` and is fixed at 0.9 once ``Turn`` covers at least 90%
    of the size.
    """
    total_size = Nrow * get_row_size(Nord, Ncol)
    if Turn >= 0.9 * total_size:
        return 1 - 0.9
    return 1 - Turn / total_size
def sort_model_form(args, Tcompare, Tmiss_K1, Turn):
    """Evaluate the sort cost model for one ``(Nrow, Ncol, Nordering)`` case.

    The per-comparison cost is ``Tcompare`` times the ordering-column count
    capped at 1, plus ``Tmiss_K1`` times the miss probability.  It is
    multiplied by ``Nrow * log2(Nrow)``.
    """
    Nrow, Ncol, Nordering = args
    # One or more ordering columns are charged as exactly one comparison.
    Nordering_cmp = 1 if Nordering >= 1 else Nordering
    miss_prob = get_miss_prob(Nrow, Ncol, Nordering, Turn)
    compare_cost = Tcompare * Nordering_cmp + Tmiss_K1 * miss_prob
    return 0 + Nrow * compare_cost * math.log(Nrow, 2)
def sort_model_arr(arg_sets, Tcompare, Tmiss_K1, Turn):
    """Apply ``sort_model_form`` to each case and return the costs as a numpy array."""
    return np.array([sort_model_form(case, Tcompare, Tmiss_K1, Turn)
                     for case in arg_sets])
# Wrap the vectorised sort model for lmfit; only the hints that are not
# commented out below are active (Tcompare, Tmiss_K1, Turn).
sort_model = Model(sort_model_arr)
# Hints for parameters of earlier model variants, kept commented out:
# #sort_model.set_param_hint("Tstartup", min=0.0)
# #sort_model.set_param_hint("Trow_startup", min=0.0)
# sort_model.set_param_hint("Trow_col", min=0.0)
# #sort_model.set_param_hint("Tcmp_startup", min=0.0)
# sort_model.set_param_hint("Trow_once", min=0.0)
# sort_model.set_param_hint("Tcompare", min=0.0)
# sort_model.set_param_hint("Talloc", min=0.0)
# sort_model.set_param_hint("Treserve_cell", min=0.0)
# sort_model.set_param_hint("Tstartup", min=0)
# sort_model.set_param_hint("Trowstore_once", min=0.0)
# sort_model.set_param_hint("Trowstore_col", min=0.0)
# sort_model.set_param_hint("Tarray_once", min=0.0)
# sort_model.set_param_hint("Tarray_elem_copy", min=0.0)
# sort_model.set_param_hint("Tordercol", min=0.0)
# sort_model.set_param_hint("Treserve_cell", min=0.0)
sort_model.set_param_hint("Tcompare", min=0.0)
# sort_model.set_param_hint("Trow_once", min=0.0)
sort_model.set_param_hint("Tmiss_K1", min=0.0)
# The bounds confine Turn to [2097152, 2097153], i.e. it is effectively pinned at 2 * 1024 * 1024.
sort_model.set_param_hint("Turn", min=2097152.0, max=2097153.0)
# sort_model.set_param_hint("Tmiss_K2", min=0.0)
def extract_info_from_line(line):
    """Return the comma-separated fields of ``line`` as floats."""
    return [float(cell) for cell in line.split(",")]
if __name__ == '__main__':
    # Fit the sort cost model for the type configured in Config.u_to_test_type.
    # Options: -i <input>, -o <output>, -R <MATERIAL_ROW_ONCE>, -C <MATERIAL_ROW_COL>.
    file_name = "sort_add_" + Config.u_to_test_type + "_result_final"
    out_file_name = "sort_add_" + Config.u_to_test_type + "_model"
    # The default model file is always removed first.
    # NOTE(review): without -o nothing is written afterwards, so a previous
    # default model is deleted and not replaced -- confirm this is intended.
    if os.path.exists(out_file_name):
        os.remove(out_file_name)
    output_fit_res = False
    wrong_arg = False

    opts, args = getopt.getopt(sys.argv[1:], "i:o:R:C:")
    for op, value in opts:
        if op == "-i":
            file_name = value
        elif op == "-o":
            output_fit_res = True
            out_file_name = value
        elif op == "-R":
            MATERIAL_ROW_ONCE = float(value)
        elif op == "-C":
            MATERIAL_ROW_COL = float(value)
        else:
            wrong_arg = True
    if wrong_arg:
        print("wrong arg")
        sys.exit(1)

    # Columns 0-2 are the model inputs (Nrow, Ncol, Nordering); column 4 is
    # the measured time.
    arg_sets = []
    times = []
    case_params = []
    file = open(file_name, "r")
    for line in file:
        if line.startswith('#'):
            continue
        case_param = extract_info_from_line(line)
        case_params.append(case_param)
        arg_sets.append((case_param[0], case_param[1], case_param[2]))
        times.append(case_param[4])
    file.close()
    arg_sets_np = np.array(arg_sets)
    times_np = np.array(times)

    # Keyword values are the initial guesses for the fit.
    result = sort_model.fit(times_np, arg_sets=arg_sets_np,
                            Tcompare=1.0,
                            Tmiss_K1=1.0,
                            Turn=2097152)

    # Model file format: "<Tcompare>,<Tmiss_K1>,<Turn>".
    best = result.best_values
    res_line = ",".join([str(best["Tcompare"]),
                         str(best["Tmiss_K1"]),
                         str(best["Turn"])])
    print(result.fit_report())
    if output_fit_res:
        out_file = open(out_file_name, "w")
        out_file.write(res_line)
        out_file.close()

View File

@ -0,0 +1,75 @@
import math
import numpy as np
from scipy.optimize import leastsq
from scipy.optimize import curve_fit
import sys
from lmfit import Model
import getopt
import subprocess
import os
import re
# Maps the short tag used as a file-name suffix (sort.<kind>.<tag>) to
# [column type label, value passed to fit_sort.py as -C, value passed as -R].
types_to_test = {'bigint':['bigint', 0.0266846, 0.07364082], 'double': ['double', 0.02970336, 0.07228732], 'float':['float', 0.02512819, 0.07295116], 'timestamp':['timestamp', 0.02998249, 0.07265038],
'number':['number(20,3)', 0.08238981, 0.15730252], 'v32':['varchar(32)', 0.08476897, 0.07518651], 'v64':['varchar(64)', 0.13678196, 0.05033624], 'v128':['varchar(128)', 0.22601192, 2.2963e-08]}
def run_cmd(cmd):
    """Echo ``cmd``, run it through the shell and return its combined output.

    stdout and stderr are merged; the text is returned with surrounding
    whitespace stripped.  The exit status is not inspected.
    """
    print(cmd)
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    # communicate() drains the pipe and waits for the process, replacing the
    # manual readline loop and its repeated string concatenation.
    output, _ = p.communicate()
    if not isinstance(output, str):
        # The pipe yields bytes on Python 3; the old ``'' + line`` raised TypeError there.
        output = output.decode("utf-8", "replace")
    return output.strip()
def rm_if_exist(filename):
    """Remove ``filename`` if it is present; otherwise do nothing."""
    if not os.path.exists(filename):
        return
    os.remove(filename)
def extract_kv(k, src):
    """Return the first numeric value that follows ``k:`` in ``src`` as a float.

    ``k`` is inserted into the pattern as-is.  Raises IndexError when
    ``k:`` followed by a number does not occur in ``src`` (same exception
    type the previous implementation produced).
    """
    # Capture the number directly instead of re-splitting the match on
    # whitespace, which failed for "key:value" written without a space.
    found = re.search(k + r':\s*(?P<value>[\d\.e\-\+]+)', src)
    if found is None:
        raise IndexError("no value found for key %r" % (k,))
    return float(found.group('value'))
# For every column type that has a raw result file: preprocess it, fit the
# sort model with that type's material coefficients, then apply the model.
for t in sorted(types_to_test):
    result_file_name = 'sort.result.' + t
    prep_file_name = 'sort.prep.' + t
    model_file = 'sort.model.' + t
    fit_file = 'sort.fit.' + t
    if not os.path.exists(result_file_name):
        continue

    # Start from a clean slate for this type.
    for stale in (prep_file_name, model_file, fit_file):
        rm_if_exist(stale)

    run_cmd("./preprocess.py -i %s -o %s -t 7 -C 4 -d" % (result_file_name, prep_file_name))

    row_once = str(types_to_test[t][2])
    row_col = str(types_to_test[t][1])
    cmd = "./fit_sort.py -i %s -R %s -C %s -o %s" % (prep_file_name, row_once, row_col, model_file)
    print(cmd)
    fitres = run_cmd(cmd)

    appres = run_cmd("./apply_sort_model.py -i %s -o %s -m %s" % (prep_file_name, fit_file, model_file))
    print(appres)

View File

@ -0,0 +1,155 @@
from mylog.mylog import MyLogger
from op_generator import op_generator
from cost_test_conf import Config
import subprocess as sp
import os
from lmfit import Model
import numpy as np
# step 1 build the hash-join operator wrapper and its benchmark configuration
hash_cls = op_generator.gen_operator("hash_join")
conf = Config()
conf.u_to_test_op_c = 'hash'
conf.is_not_running_as_unittest_c = True
conf.schema_file_c = 'c10k1x2.schema'
conf.left_row_count_c = 1000
conf.right_row_count_c = 1000
conf.left_min_c = 1
conf.right_min_c = 1
conf.is_random_c = True
conf.left_pj_c = 10
conf.right_pj_c = 10
hash_op = hash_cls(conf)
# Raw results are appended to this file, so any previous run is removed first.
result_file_name = "hash_join_result"
if os.path.exists(result_file_name):
os.remove(result_file_name)
# step 2 do bench and gen data
# NOTE(review): case_run_time is assigned but not used in the visible loops.
case_run_time = 7
case_count = 0
row_count_max = 100000;
row_count_step = 2000;
# Number of (left, right) combinations: (max / step) squared.
total_case_count = row_count_max/row_count_step
total_case_count *= total_case_count
print "Total case count %s ..." % (total_case_count)
for left_row_count in xrange(1000, row_count_max + 1, row_count_step):
for right_row_count in xrange(1000, row_count_max + 1, row_count_step):
case_count+=1
hash_op.conf.left_row_count_c = left_row_count
hash_op.conf.right_row_count_c = right_row_count
# Both sides share the same upper bound: three times the larger row count.
hash_op.conf.left_max_c = max(left_row_count, right_row_count) * 3
hash_op.conf.right_max_c = hash_op.conf.left_max_c
# Write "left,right," without a newline so the benchmark output completes the same line.
sp.check_call("echo -n '%s,%s,' >> %s" % (left_row_count, right_row_count, result_file_name), shell=True)
print "Running case %s / %s ... : %s " % (case_count, total_case_count, hash_op.get_bench_cmd())
print "%s >> %s" % (hash_op.get_bench_cmd(), result_file_name)
sp.check_call("%s >> %s" % (hash_op.get_bench_cmd(), result_file_name), shell=True)
# step 3 process data
final_file_name = "hash_join_result_final"
if os.path.exists(final_file_name):
os.remove(final_file_name)
data_cmd = hash_op.get_data_preprocess_cmd()
sp.check_call(data_cmd, shell=True)
# step 4 fit and output
out_model_file_name = "hash_model"
if os.path.exists(out_model_file_name):
os.remove(out_model_file_name)
def hash_model_form(args, Tstart_up, Tbuild_htable, Tright_row_once,
                    Tconvert_tuple, Tjoin_row):
    """Evaluate the hash join cost model for one case.

    ``args`` is ``(Nres_row, Nleft_row, Nright_row, Nequal_cond)``.

    cost = Tstart_up + Nleft_row * Tbuild_htable + Nright_row * Tright_row_once
           + Nequal_cond * Tconvert_tuple + Nres_row * Tjoin_row
    """
    Nres_row, Nleft_row, Nright_row, Nequal_cond = args
    return (Tstart_up
            + Nleft_row * Tbuild_htable
            + Nright_row * Tright_row_once
            + Nequal_cond * Tconvert_tuple
            + Nres_row * Tjoin_row)
def hash_model_arr(arg_sets, Tstart_up, Tbuild_htable, Tright_row_once,
                   Tconvert_tuple, Tjoin_row):
    """Evaluate ``hash_model_form`` for every case and return a numpy array of costs."""
    return np.array([hash_model_form(case,
                                     Tstart_up,
                                     Tbuild_htable,
                                     Tright_row_once,
                                     Tconvert_tuple,
                                     Tjoin_row)
                     for case in arg_sets])
def extract_info_from_line(line):
    """Parse a comma-separated line into a list of floats."""
    return [float(value) for value in line.split(",")]
# Wrap the vectorised hash-join model for lmfit; every coefficient must be non-negative.
hash_model = Model(hash_model_arr)
hash_model.set_param_hint("Tstart_up", min=0.0)
hash_model.set_param_hint("Tbuild_htable", min=0.0)
hash_model.set_param_hint("Tright_row_once", min=0.0)
hash_model.set_param_hint("Tconvert_tuple", min=0.0)
hash_model.set_param_hint("Tjoin_row", min=0.0)

# Load the preprocessed measurements.  Model inputs are ordered
# (Nres_row, Nleft_row, Nright_row, Nequal_cond) = columns 2, 0, 1, 3;
# column 4 is the measured time.
arg_sets = []
times = []
case_params = []
with open(final_file_name, "r") as data_file:
    for line in data_file:
        if line.startswith('#'):
            continue
        case_param = extract_info_from_line(line)
        case_params.append(case_param)
        arg_sets.append((case_param[2], case_param[0], case_param[1], case_param[3]))
        times.append(case_param[4])
arg_sets_np = np.array(arg_sets)
times_np = np.array(times)

# Initial guesses.  The start-up term must be passed as ``Tstart_up`` -- the
# name used in hash_model_arr's signature; the former ``Tstartup`` keyword
# matched no model parameter and left Tstart_up without an initial value.
result = hash_model.fit(times_np, arg_sets=arg_sets_np,
                        Tstart_up=0.0,
                        Tbuild_htable=0.0,
                        Tright_row_once=0.0,
                        Tconvert_tuple=0.0,
                        Tjoin_row=0.0)

# Model file format: the five coefficients on one comma-separated line.
param_order = ("Tstart_up", "Tbuild_htable", "Tright_row_once",
               "Tconvert_tuple", "Tjoin_row")
res_line = ",".join(str(result.best_values[name]) for name in param_order)
print(result.fit_report())
if out_model_file_name:
    with open(out_model_file_name, "w") as out_file:
        out_file.write(res_line)

View File

@ -0,0 +1,132 @@
from mylog.mylog import MyLogger
from op_generator import op_generator
from cost_test_conf import Config
import subprocess as sp
import os
from lmfit import Model
import numpy as np
# step 1 gen op and conf
material_cls = op_generator.gen_operator("material")
conf = Config()
conf.u_to_test_op_c = 'material'
conf.is_not_running_as_unittest_c = True
conf.schema_file_c = 'c10k1.schema'
conf.row_count_c = 1000
conf.input_projector_count_c = 1
material_op = material_cls(conf)
# Raw results are appended to this file, so any previous run is removed first.
result_file_name = 'material_result'
if os.path.exists(result_file_name):
    os.remove(result_file_name)

# step 2 do_bench and gen data
row_count_max = 1001
row_count_step = 100
column_counts = [3, 5, 8]
case_run_time = 7  # repetitions per (row_count, column_count) combination
total_case_count = (row_count_max // row_count_step + 1) * len(column_counts) * case_run_time
case_count = 0
print("Total case count %s ..." % (total_case_count))
for row_count in xrange(1, row_count_max + 1, row_count_step):
    for column_count in column_counts:
        for _run in xrange(case_run_time):
            case_count += 1
            material_op.conf.row_count_c = row_count
            material_op.conf.input_projector_count_c = column_count
            # Write "row_count,column_count," without a newline so the
            # benchmark output completes the same line.
            sp.check_call("echo -n '%s,' >> %s" % (row_count, result_file_name), shell=True)
            sp.check_call("echo -n '%s,' >> %s" % (column_count, result_file_name), shell=True)
            print("Running case %s / %s ... : %s " % (case_count, total_case_count, material_op.get_bench_cmd()))
            print("%s >> %s" % (material_op.get_bench_cmd(), result_file_name))
            sp.check_call("%s >> %s" % (material_op.get_bench_cmd(), result_file_name), shell=True)

# step 3 preprocess data
final_file_name = "material_result_final"
# Remove the stale preprocessed file that is read back in step 4.  The
# previous check used the misspelled name "material_final_result", so the
# real file was never cleaned up.
if os.path.exists(final_file_name):
    os.remove(final_file_name)
data_cmd = material_op.get_data_preprocess_cmd()
sp.check_call(data_cmd, shell=True)

# step 4 fit and output
# given model form, do fit using previous result data
# case param should be considered with cost_model_util.cpp output format
# eg: material_test() in cost_model_util.cpp
# output row_count, cost_time
out_model_file_name = "material_model"
if os.path.exists(out_model_file_name):
    os.remove(out_model_file_name)
def material_model_form(args, Trow_once, Trow_col):
    """Evaluate the material cost model for ``args = (Nrow, Ncol)``.

    cost = Nrow * (Trow_once + Ncol * Trow_col)
    """
    Nrow, Ncol = args
    per_row = Trow_once + Ncol * Trow_col
    return 0 + Nrow * per_row
def material_model_arr(arg_sets, Trow_once, Trow_col):
    """Evaluate ``material_model_form`` for every case and return a numpy array of costs."""
    return np.array([material_model_form(case, Trow_once, Trow_col)
                     for case in arg_sets])
def extract_info_from_line(line):
    """Turn a comma-separated line into a list of floats."""
    return [float(entry) for entry in line.split(",")]
# Wrap the vectorised material model for lmfit; both coefficients are non-negative.
material_model = Model(material_model_arr)
for hinted_param in ("Trow_once", "Trow_col"):
    material_model.set_param_hint(hinted_param, min=0.0)

# Columns 0 and 1 are the model inputs (Nrow, Ncol); column 3 is the measured time.
arg_sets = []
times = []
case_params = []
file = open(final_file_name, "r")
for line in file:
    if line.startswith('#'):
        continue
    case_param = extract_info_from_line(line)
    case_params.append(case_param)
    arg_sets.append((case_param[0], case_param[1]))
    times.append(case_param[3])
file.close()
arg_sets_np = np.array(arg_sets)
times_np = np.array(times)

# result is the fitting result model; keyword values are the initial guesses
result = material_model.fit(times_np, arg_sets=arg_sets_np,
                            Trow_once=10.0,
                            Trow_col=1.0)

# Model file format: "<Trow_once>,<Trow_col>".
best = result.best_values
res_line = ",".join([str(best["Trow_once"]), str(best["Trow_col"])])
print(result.fit_report())
if out_model_file_name:
    out_file = open(out_model_file_name, "w")
    out_file.write(res_line)
    out_file.close()

View File

@ -0,0 +1,43 @@
import logging
import sys
class Singleton(object):
    """Base class whose subclasses each have exactly one instance.

    The first construction of a class creates the instance; later
    constructions return that same object.  ``__init__`` of the subclass
    still runs on every construction.
    """

    def __new__(cls, *args, **kw):
        # Look only at this class's own namespace: ``hasattr`` would also see
        # an instance created for a parent class and hand that one out.
        if '_instance' not in cls.__dict__:
            # Constructor arguments belong to __init__; object.__new__ rejects
            # them on Python 3 when __new__ is overridden.
            cls._instance = super(Singleton, cls).__new__(cls)
        return cls._instance
class MyLogger(Singleton):
    """Process-wide logger that writes INFO and above to stdout.

    The logger and its handler are configured once, when the class body is
    executed; all access goes through the static helpers below.
    """

    log = logging.getLogger(__name__)
    ##set to stdout
    fmt = '%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(name)s - %(message)s'
    formatter = logging.Formatter(fmt)
    out_hdlr = logging.StreamHandler(sys.stdout)
    #handler = logging.handlers.RotatingFileHandler(LOG_FILE, maxBytes=1024 * 1024, backupCount=5)
    out_hdlr.setFormatter(formatter)
    out_hdlr.setLevel(logging.INFO)
    log.addHandler(out_hdlr)
    log.setLevel(logging.INFO)

    @staticmethod
    def get_logger():
        """Return the underlying ``logging.Logger``."""
        return MyLogger.log

    @staticmethod
    def info(str, *args, **kargs):
        """Log ``str % args`` at INFO level."""
        MyLogger.log.info(str, *args, **kargs)

    @staticmethod
    def warn(str, *args, **kargs):
        """Log ``str % args`` at WARNING level."""
        # Logger.warn is a deprecated alias; Logger.warning is the supported name.
        MyLogger.log.warning(str, *args, **kargs)

    @staticmethod
    def error(str, *args, **kargs):
        """Log ``str % args`` at ERROR level."""
        MyLogger.log.error(str, *args, **kargs)
if __name__ == '__main__':
    # Smoke test: emit one message per level through both access paths.
    MyLogger.get_logger().info("test")
    # Logger.warning replaces the deprecated Logger.warn alias.
    MyLogger.get_logger().warning("test warn %s", 'test')
    MyLogger.error("test error")

View File

@ -0,0 +1,59 @@
from cost_test_conf import Config
from mylog.mylog import MyLogger
import subprocess as sp
def init_func(self, conf):
    """Constructor installed on generated operator classes: store ``conf`` on the instance."""
    self.conf = conf
def get_bench_cmd(self):
    """Return the shell command that runs ./cost_model_util with this operator's parameters."""
    return './cost_model_util ' + self.conf.gen_params()
def get_data_preprocess_cmd(self):
    """Return the preprocess.py command for this operator's result files.

    Input and output file names are derived from the instance's class name:
    ``<ClassName>_result`` and ``<ClassName>_result_final``.
    """
    op_name = self.__class__.__name__
    raw_file = op_name + '_result'
    final_file = op_name + '_result_final'
    return 'python preprocess.py -i {0} -o {1} -d'.format(raw_file, final_file)
def do_bench(self):
    """Log the configuration, run the benchmark command, then the preprocess command.

    Both commands run through the shell; a non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    MyLogger.info(self.conf)
    bench_cmd = self.get_bench_cmd()
    MyLogger.info(bench_cmd)
    sp.check_call(bench_cmd, shell=True)
    sp.check_call(self.get_data_preprocess_cmd(), shell=True)
class op_generator(object):
    """Factory that builds one operator class per name and caches it."""

    # name -> generated operator class
    op_dict = {}

    # name if type is not None name = operatorname + test_type_name
    @staticmethod
    def gen_operator(name):
        """Return the operator class registered under ``name``, creating it on first use.

        The generated class is named ``name`` and carries ``__init__``,
        ``do_bench``, ``get_bench_cmd`` and ``get_data_preprocess_cmd``.
        """
        # ``in`` replaces dict.has_key(), which does not exist on Python 3.
        if name in op_generator.op_dict:
            return op_generator.op_dict[name]
        cls = type(name, (object,), {'__init__': init_func, 'do_bench': do_bench,
                                     'get_bench_cmd': get_bench_cmd,
                                     'get_data_preprocess_cmd': get_data_preprocess_cmd})
        op_generator.op_dict[name] = cls
        return cls
if __name__ == '__main__':
    # Example run: benchmark the material operator with a fixed configuration.
    material_cls = op_generator.gen_operator('material')

    ##mat related conf
    conf = Config()
    conf.u_to_test_op_c = 'material'
    conf.is_not_running_as_unittest_c = True
    conf.schema_file_c = 'c10k1.schema'
    conf.row_count_c = 1000
    conf.input_projector_count_c = 1

    material_op = material_cls(conf)
    material_op.do_bench()

View File

@ -0,0 +1,81 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import sys
import numpy as np
import matplotlib as mpl
from matplotlib import cm
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import math
import getopt
def extract_int_info_from_line(line):
    """Parse a comma-separated line into ints, truncating fractional parts toward zero."""
    return [int(float(field)) for field in line.split(",")]
def case_cmp(a, b, c):
    """Three-way compare ``a`` and ``b`` on element ``c``: -1, 0 or 1."""
    left, right = a[c], b[c]
    if left < right:
        return -1
    if left > right:
        return 1
    return 0
# One comparator per column index 0..9; the default argument ``z = count``
# binds the index when the lambda is created rather than when it is called.
cmp_n = [lambda x, y, z = count: case_cmp(x, y, z) for count in range(10)]
#cmp_n = [lambda x, y: cmp(x[count], y[count]) for count in range(10)]
# Line colours; do_plot picks one by the position of the input file.
colors = ["red", "green", "blue", "yellow", "purple", "black", "pink" , "brown", "cyan" ,"orange"]
def do_plot(file_cases):
    """Draw one line per input file on a single 20x10 inch figure and show it.

    Args:
        file_cases: sequence with one entry per file; entry[0] holds the
            x values and entry[1] the y values.
    """
    fig = plt.figure()
    fig.set_size_inches((20, 10))
    ax1 = fig.add_subplot(111)
    for i, series in enumerate(file_cases):
        # Wrap around the palette so more files than colours reuse colours
        # instead of raising IndexError.
        ax1.plot(series[0], series[1], color=colors[i % len(colors)])
    plt.show()
if __name__ == '__main__':
    # Usage: -f FILE (repeatable) -h X_COLUMN -d Y_COLUMN
    # Plots column Y against column X for every file, one line per file.
    file_names = []
    horizen = 0
    demension = 0
    wrong_arg = False
    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:h:d:")
    except getopt.GetoptError:
        # getopt raises on unknown options; report it like any other bad
        # argument instead of dying with a traceback.
        opts, args = [], []
        wrong_arg = True
    for op, value in opts:
        if "-f" == op:
            file_names.append(value)
        elif "-h" == op:
            horizen = int(value)
        elif "-d" == op:
            demension = int(value)
        else:
            wrong_arg = True
    if horizen == demension or len(file_names) == 0 or wrong_arg:
        print("wrong arg")
        # Non-zero status so callers can detect the failure.
        sys.exit(1)
    file_cases = []
    for name in file_names:
        cases = []
        with open(name) as data_file:
            for line in data_file:
                # Skip bracketed lines and comment lines.
                if line[0] == '[' or line.startswith('#'):
                    continue
                cases.append(extract_int_info_from_line(line))
        # Stable sort on the x column; a key function gives the same order
        # as the cmp-based sort, without its 10-column limit.
        cases.sort(key=lambda case: case[horizen])
        horizens = [case[horizen] for case in cases]
        demensions = [case[demension] for case in cases]
        file_cases.append([np.array(horizens), np.array(demensions)])
    do_plot(file_cases)

View File

@ -0,0 +1,81 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import sys
import numpy as np
import matplotlib as mpl
from matplotlib import cm
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import math
def extract_int_info_from_line(line):
    """Turn a comma-separated line of numbers into a list of ints.

    Fields are read as floats and then truncated, so inputs such as
    ``"3.0"`` are accepted.
    """
    return [int(float(token)) for token in line.split(",")]
def case_cmp(a, b, c):
    """Three-way comparison of rows ``a`` and ``b`` on column ``c``.

    Returns -1, 1 or 0 when ``a[c]`` is smaller than, larger than or
    neither smaller nor larger than ``b[c]``.
    """
    if c > 1251:
        # NOTE(review): looks like leftover debug output for unexpectedly
        # large column indexes -- confirm whether it is still needed.
        print(c)
    left, right = a[c], b[c]
    if left < right:
        return -1
    return 1 if left > right else 0
# One comparator per column index 0..9, used as the cmp function of
# list.sort(). ``z = count`` freezes the column index as a default argument
# when each lambda is created.
cmp_n = [lambda x, y, z = count: case_cmp(x, y, z) for count in range(10)]
# Disabled alternative: here the lambda would look up ``count`` when it is
# called rather than when it is defined.
#cmp_n = [lambda x, y: cmp(x[count], y[count]) for count in range(10)]
# Line colours; do_plot takes them in order, one per plotted column.
colors = ["red", "green", "blue", "yellow", "purple", "black", "pink", "cyan", "brown", "gray"]
def do_plot(arg, horizen, need_columns_id, label):
    """Plot selected columns of ``arg`` against column ``horizen`` and show it.

    Args:
        arg: non-empty list of rows; the first row fixes the number of
            columns.
        horizen: index of the column used for the x axis.
        need_columns_id: container of column indexes to draw.
        label: value handed to ``ax1.set_label``.
    """
    # Transpose the rows into one list per column.
    arrs = [[] for _ in arg[0]]
    for case in arg:
        for i, value in enumerate(case):
            arrs[i].append(value)
    np_arrs = [np.array(a) for a in arrs]
    fig = plt.figure()
    fig.set_size_inches((20, 10))
    ax1 = fig.add_subplot(111)
    ax1.set_label(label)
    color_id = 0
    for i, column in enumerate(np_arrs):
        if i == horizen or i not in need_columns_id:
            continue
        # Wrap around the palette so drawing more columns than there are
        # colours reuses colours instead of raising IndexError.
        ax1.plot(np_arrs[horizen], column, color=colors[color_id % len(colors)])
        color_id += 1
    plt.show()
if __name__ == '__main__':
    # Usage: <file_name> <x column index> <columns to plot: "all" or "1,2,3">
    if len(sys.argv) < 4:
        print("wrong arg")
        # Non-zero status so callers can detect the failure.
        sys.exit(1)
    file_name = sys.argv[1]
    horizen = int(sys.argv[2])
    need_columns = sys.argv[3]
    if need_columns == "all":
        need_columns_id = list(range(100))
    else:
        need_columns_id = [int(i) for i in need_columns.split(",")]
    cases = []
    with open(file_name, "r") as data_file:
        for line in data_file:
            # Skip bracketed lines and comment lines.
            if line[0] == '[' or line.startswith('#'):
                continue
            cases.append(extract_int_info_from_line(line))
    # Stable sort on the x column; a key function gives the same order as
    # the cmp-based sort, without its 10-column limit.
    cases.sort(key=lambda case: case[horizen])
    # Label the plot with the file name rather than the file object.
    do_plot(cases, horizen, need_columns_id, file_name)

View File

@ -0,0 +1,175 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import sys
import os
import numpy as np
import getopt
# Defaults; each may be overridden by the options parsed below.
file_name = "scan_model.res.formal"
if len(sys.argv) >= 2:
    file_name = sys.argv[1]
out_file_name = file_name + ".prep"
time_per_case = 2            # number of consecutive lines averaged together
use_delete_min_max = False   # -d: drop the largest samples before averaging
filters = []                 # -f: [column, op, threshold] triples
out_columns = list(range(100))
cols_supplied = False
wrong_arg = False
target_column_id = 0         # -C: column inspected when dropping samples

# Example: -i sort_result -o sort.prep -t 5 -C 4 -f 0,g,1 -f 0,le,100000
try:
    opts, args = getopt.getopt(sys.argv[1:], "i:o:t:f:a:dc:C:")
except getopt.GetoptError:
    # getopt raises on unknown options; route that to the "wrong arg"
    # report below instead of a traceback.
    opts, args = [], []
    wrong_arg = True
for op, value in opts:
    if "-i" == op:
        file_name = value
    elif "-o" == op:
        out_file_name = value
    elif op in ("-t", "-a"):
        # Both options set the group size.
        time_per_case = int(value)
    elif "-f" == op:
        filter_str = value
        filter_elements = filter_str.split(",")
        # A filter needs column, operator and threshold; reject short specs
        # here rather than failing with IndexError while filtering.
        if len(filter_elements) < 3 or \
                filter_elements[1] not in ["g", "l", "ge", "le", "e", "ne"]:
            print("invalid filter type")
            sys.exit(1)
        filters.append(filter_elements)
    elif "-d" == op:
        use_delete_min_max = True
    elif "-C" == op:
        target_column_id = int(value)
    elif "-c" == op:
        # The first -c replaces the default column list; later ones extend it.
        if not cols_supplied:
            cols_supplied = True
            out_columns = []
        out_columns.extend([int(c) for c in value.split(",")])
    else:
        wrong_arg = True

if wrong_arg:
    print("wrong arg")
    sys.exit(1)

# Dropping samples is disabled for groups of fewer than five lines.
if time_per_case < 5:
    use_delete_min_max = False

if os.path.exists(out_file_name):
    os.remove(out_file_name)

# Both files stay open for the processing loop and are closed at the end of
# the script.
origin_file = open(file_name, "r")
out_file = open(out_file_name, "w")

# State of the group currently being accumulated.
i = 0
column_nums = []
avgs = []
avg_strs = []
def delete(li, index):
    """Return a new list equal to ``li`` with position ``index`` left out.

    ``li`` itself is not modified. Intended for non-negative indexes.
    """
    head = li[:index]
    tail = li[index + 1:]
    return head + tail
def find_max_index(l):
    """Return the index of the first largest element of ``l``.

    Returns -1 when ``l`` is empty (or holds nothing greater than negative
    infinity). Using -inf as the starting value means arbitrarily small
    finite numbers are still found, which a finite sentinel does not
    guarantee.
    """
    best = float("-inf")
    max_i = -1
    for idx, value in enumerate(l):
        # Strict comparison keeps the first occurrence on ties.
        if value > best:
            best = value
            max_i = idx
    return max_i
def find_min_index(l):
    """Return the index of the first smallest element of ``l``.

    Returns -1 when ``l`` is empty (or holds nothing smaller than positive
    infinity). Using +inf as the starting value means arbitrarily large
    finite numbers are still found, which a finite sentinel does not
    guarantee.
    """
    best = float("inf")
    min_i = -1
    for idx, value in enumerate(l):
        # Strict comparison keeps the first occurrence on ties.
        if value < best:
            best = value
            min_i = idx
    return min_i
def delete_max_min_case(column_nums, column_id):
    """Drop the two samples with the largest value in column ``column_id``.

    ``column_nums`` holds one list of values per column. The chosen sample
    position is removed from every column so the columns stay aligned; the
    outer list is updated in place. Despite the name, only maxima are
    removed -- minima are left alone.
    """
    for _ in range(2):
        max_i = find_max_index(column_nums[column_id])
        for j, column in enumerate(column_nums):
            column_nums[j] = delete(column, max_i)
def do_filter(column_strs):
    """Return True when the row must be dropped according to ``filters``.

    Each entry of the module-level ``filters`` list is
    ``[column_index, op, threshold]``. The row is dropped as soon as one
    filter's relation (``g``, ``l``, ``ge``, ``le``, ``e``, ``ne``) does not
    hold between ``float(column)`` and ``int(threshold)``. Entries with an
    unknown operator are ignored.
    """
    # For every operator: the condition under which the row is rejected,
    # i.e. the negation of the relation the filter asks for.
    rejects = {
        "g": lambda v, t: v <= t,
        "l": lambda v, t: v >= t,
        "ge": lambda v, t: v < t,
        "le": lambda v, t: v > t,
        "e": lambda v, t: v != t,
        "ne": lambda v, t: v == t,
    }
    for f in filters:
        reject = rejects.get(f[1])
        if reject is None:
            continue
        if reject(float(column_strs[int(f[0])]), int(f[2])):
            return True
    return False
# Average every run of ``time_per_case`` accepted data lines into one output
# line. Comment lines are copied through; filtered lines do not count
# towards the run.
for line in origin_file:
    if line.startswith("#"):
        out_file.write(line)
        continue  # comment lines are passed through untouched
    column_strs_raw = line.split(",")
    if do_filter(column_strs_raw):
        continue
    column_count = len(column_strs_raw)

    if i == 0:
        # First line of a new run: reset the accumulators.
        avg_strs = []
        avgs = []
        column_nums = [[] for _ in range(column_count)]

    # Append this line's values, as floats, to their columns.
    for n, raw_value in enumerate(column_strs_raw):
        column_nums[n].append(float(raw_value))

    if i == time_per_case - 1:
        # Last line of the run: optionally trim, then average and emit.
        if use_delete_min_max:
            delete_max_min_case(column_nums, target_column_id)
        avgs.extend([np.mean(column_nums[n]) for n in range(column_count)])
        avg_strs = [str(a) for a in avgs]
        # Keep only the requested output columns.
        real_avg_strs = [text for cid, text in enumerate(avg_strs)
                         if cid in out_columns]
        out_file.write(",".join(real_avg_strs) + "\n")

    i = (i + 1) % time_per_case

origin_file.close()
out_file.close()

View File

@ -0,0 +1,114 @@
__author__ = 'canfang.scf'
from op_generator import op_generator
from cost_test_conf import Config
import subprocess as sp
import os
from lmfit import Model
import numpy as np
# Operator class and configuration for the hash join benchmark.
hash_cls = op_generator.gen_operator("hash_join")
conf = Config()
conf.u_to_test_op_c = 'hash'
conf.is_not_running_as_unittest_c = True
conf.schema_file_c = 'c10k1x2.schema'
conf.left_row_count_c = 1000
conf.right_row_count_c = 1000
conf.left_min_c = 1
conf.right_min_c = 1
conf.is_random_c = True
hash_op = hash_cls(conf)
# NOTE: do_bench() is not called here; only the preprocessing command of the
# operator is used below.
# step 3 process data
final_file_name = "hash_join_result_final"
# Remove output left over from a previous run before regenerating it.
if os.path.exists(final_file_name):
    os.remove(final_file_name)
data_cmd = hash_op.get_data_preprocess_cmd()
sp.check_call(data_cmd, shell=True)
# step 4 fit and output
out_model_file_name = "hash_model"
# Remove a model file left over from a previous run.
if os.path.exists(out_model_file_name):
    os.remove(out_model_file_name)
def hash_model_form(args, Tstart_up, Tright_outer_once, Tleft_outer_once):
    """Evaluate the linear hash join cost model for one benchmark case.

    Args:
        args: 6-tuple ``(Nres_row, Nleft_row, Nright_row, Nequal_cond,
            Nno_matched_right, Nno_matched_left)``.
        Tstart_up: constant start-up cost.
        Tright_outer_once: cost multiplied by ``Nno_matched_left``.
        Tleft_outer_once: cost multiplied by ``Nno_matched_right``.

    Returns:
        The modelled total cost. The per-row coefficients for left rows,
        right rows, equal conditions and result rows are fixed constants.
    """
    (Nres_row, Nleft_row, Nright_row,
     Nequal_cond, Nno_matched_right, Nno_matched_left) = args
    # Terms are added in a fixed order so floating-point results stay stable.
    return (Tstart_up
            + Nleft_row * 0.74497774
            + Nright_row * 0.26678144
            + Nequal_cond * 0.86340381
            + Nres_row * 0.28939532
            + Nno_matched_left * Tright_outer_once
            + Nno_matched_right * Tleft_outer_once)
def hash_model_arr(arg_sets, Tstart_up, Tright_outer_once, Tleft_outer_once):
    """Evaluate ``hash_model_form`` for every case in ``arg_sets``.

    Returns a numpy array with one modelled cost per entry of ``arg_sets``,
    in the same order.
    """
    return np.array([
        hash_model_form(single_arg_set,
                        Tstart_up,
                        Tright_outer_once,
                        Tleft_outer_once)
        for single_arg_set in arg_sets
    ])
def extract_info_from_line(line):
    """Parse a comma-separated line of numbers into a list of floats."""
    return [float(field) for field in line.split(",")]
# Fit the three free parameters of the hash join cost model; all of them are
# constrained to be non-negative.
hash_model = Model(hash_model_arr)
hash_model.set_param_hint("Tstart_up", min=0.0)
hash_model.set_param_hint("Tright_outer_once", min=0.0)
hash_model.set_param_hint("Tleft_outer_once", min=0.0)

arg_sets = []
times = []
case_params = []
with open(final_file_name, "r") as result_file:
    for line in result_file:
        if line.startswith('#'):
            continue
        case_param = extract_info_from_line(line)
        case_params.append(case_param)
        # Reorder the columns into the tuple layout hash_model_form unpacks
        # (column 2 first, then 0, 1, 3, 4, 5); column 6 is the measured time.
        arg_sets.append((case_param[2], case_param[0], case_param[1],
                         case_param[3], case_param[4], case_param[5]))
        times.append(case_param[6])

arg_sets_np = np.array(arg_sets)
times_np = np.array(times)
# The keyword names must match the model function's parameter names exactly;
# the start-up parameter is "Tstart_up" (a keyword spelled "Tstartup" would
# not give it an initial value).
result = hash_model.fit(times_np, arg_sets=arg_sets_np,
                        Tstart_up=0.0,
                        Tright_outer_once=0.0,
                        Tleft_outer_once=0.0)

res_line = ",".join(str(result.best_values[param_name])
                    for param_name in ("Tstart_up",
                                       "Tright_outer_once",
                                       "Tleft_outer_once"))
print(result.fit_report())

if out_model_file_name:
    with open(out_model_file_name, "w") as out_file:
        out_file.write(res_line)

View File

@ -0,0 +1,172 @@
#!/bin/env python
__author__ = 'dongyun.zdy'
import sys
import os
import numpy as np
import getopt
# Defaults; each may be overridden by the options parsed below.
file_name = "scan_model.res.formal"
if len(sys.argv) >= 2:
    file_name = sys.argv[1]
out_file_name = file_name + ".prep"
time_per_case = 5
use_delete_min_max = False
filters = []
out_columns = [c for c in xrange(100)]
cols_supplied = False
wrong_arg = False
target_column_id = 0
#sys.argv.extend("-i sort_result -o sort.8.test -t 7 -C 2 -f 1,e,8".split(" "))
# NOTE(review): these hard-coded options are appended after the real command
# line, and the loop below applies options in order, so they replace any
# -i/-o the caller passed -- confirm this is intended.
sys.argv.extend("-i nestloop_result -o nl_result".split(" "))
opts,args = getopt.getopt(sys.argv[1:],"i:o:t:f:a:dc:C:")
for op, value in opts:
    if "-i" == op:
        file_name = value
    elif "-o" == op:
        out_file_name = value
    elif "-t" == op:
        time_per_case = int(value)
    elif "-f" == op:
        # Filter spec is "column,op,threshold"; only the operator is checked here.
        filter_str = value
        filter_elements = filter_str.split(",")
        if not filter_elements[1] in ["g","l","ge","le","e","ne"]:
            print "invalid filter type"
            sys.exit(1)
        filters.append(filter_str.split(","))
    elif "-a" == op:
        # Same effect as -t.
        time_per_case = int(value)
    elif "-d" == op:
        use_delete_min_max = True
    elif "-C" == op:
        target_column_id = int(value)
    elif "-c" == op:
        # The first -c replaces the default column list; later ones extend it.
        if not cols_supplied:
            cols_supplied = True
            out_columns = []
        out_columns.extend([int(c) for c in value.split(",")])
    else:
        wrong_arg = True
if wrong_arg:
    print "wrong arg"
    sys.exit(1)
# Dropping samples is disabled for groups of fewer than five lines.
if time_per_case < 5:
    use_delete_min_max = False
if os.path.exists(out_file_name):
    os.remove(out_file_name)
# Both files are closed at the end of the script.
origin_file = open(file_name, "r")
out_file = open(out_file_name,"w")
i = 0
column_nums = []
avgs = []
avg_strs = []
def delete(li, index):
    """Return a copy of ``li`` that skips position ``index``.

    The input list is left unchanged. Intended for non-negative indexes.
    """
    return li[:index] + li[index + 1:]
def find_max_index(l):
    """Return the index of the first largest element of ``l``.

    Returns -1 when ``l`` is empty (or holds nothing greater than negative
    infinity). Starting from -inf instead of a finite sentinel keeps the
    result correct for arbitrarily small finite values.
    """
    best = float("-inf")
    max_i = -1
    for idx, value in enumerate(l):
        # Strict comparison keeps the first occurrence on ties.
        if value > best:
            best = value
            max_i = idx
    return max_i
def find_min_index(l):
    """Return the index of the first smallest element of ``l``.

    Returns -1 when ``l`` is empty (or holds nothing smaller than positive
    infinity). Starting from +inf instead of a finite sentinel keeps the
    result correct for arbitrarily large finite values.
    """
    best = float("inf")
    min_i = -1
    for idx, value in enumerate(l):
        # Strict comparison keeps the first occurrence on ties.
        if value < best:
            best = value
            min_i = idx
    return min_i
def delete_max_min_case(column_nums, column_id):
    """Drop the two samples with the largest value in column ``column_id``.

    ``column_nums`` holds one list of values per column. The chosen sample
    position is removed from every column so the columns stay aligned; the
    outer list is updated in place. Despite the name, only maxima are
    removed.
    """
    for _ in range(2):
        max_i = find_max_index(column_nums[column_id])
        for j, column in enumerate(column_nums):
            column_nums[j] = delete(column, max_i)
def do_filter(column_strs):
    """Return True when the row must be dropped according to ``filters``.

    Each entry of the module-level ``filters`` list is
    ``[column_index, op, threshold]``. The row is dropped as soon as one
    filter's relation (``g``, ``l``, ``ge``, ``le``, ``e``, ``ne``) does not
    hold between ``float(column)`` and ``int(threshold)``. Entries with an
    unknown operator are ignored.
    """
    # Operator -> test that is True when the row violates the filter.
    violates = {
        "g": lambda v, t: v <= t,
        "l": lambda v, t: v >= t,
        "ge": lambda v, t: v < t,
        "le": lambda v, t: v > t,
        "e": lambda v, t: v != t,
        "ne": lambda v, t: v == t,
    }
    for f in filters:
        check = violates.get(f[1])
        if check is None:
            continue
        if check(float(column_strs[int(f[0])]), int(f[2])):
            return True
    return False
# The input is read as records of 12 consecutive lines; ``state`` is the
# position of the current line inside its record. Line 0 is copied through,
# line 1 seeds the output fields, and the lines listed below each contribute
# the text that follows their marker. Lines 2, 3, 6 and 8 are ignored.
_FIELD_MARKERS = {
    4: "join_time except conds : ",
    5: "equal_eval : ",
    7: "other_eval : ",
    9: "right_cache_put : ",
    10: "right_cache_acc : ",
    11: "match_group_count : ",
}
state = 0 #comment line
elements = []
for line in origin_file:
    line = line.strip()
    if state == 0:
        out_file.write(line + "\n")
    elif state == 1:
        elements = line.split(",row_count : ")
    elif state in _FIELD_MARKERS:
        elements.append(line.split(_FIELD_MARKERS[state])[1])
        if state == 11:
            # Last line of the record: emit the collected fields.
            out_file.write(",".join(elements) + "\n")
    elif state not in (2, 3, 6, 8):
        print("wrong state")
    state = (state + 1) % 12
origin_file.close()
out_file.close()

View File

@ -0,0 +1 @@
create table t1 (a varchar(100) primary key);

View File

@ -0,0 +1 @@
create table t1 (a varchar(200) primary key);