patch 4.0

2022-10-24 10:34:53 +08:00
parent 4ad6e00ec3
commit 93a1074b0c
10533 changed files with 2588271 additions and 2299373 deletions
--- a/unittest/sql/optimizer/cost_model_utils/README
+++ b/unittest/sql/optimizer/cost_model_utils/README
@ -0,0 +1,6 @@
+1、cost_model_util：根据输入参数采集实验数据。需要将被测目标在这里建立一个参数可控的最小运行环境，提供基本的数据生成、schema控制等功能。
+2、benchmaster_xxx：生成参数组合并调用cost_model_util，需要根据被测目标的特点控制参数的种类、数量。例如对于join，需要控制左右表行数等。
+3、preprocess：对原始数据进行预处理。例如benchmaster对每组参数运行多次，在这里进行去极值、取平均等工作，将同参数的几组数据合为一组。
+4、fit_xx：进行拟合
+5、plot：绘制图像
+
--- a/unittest/sql/optimizer/cost_model_utils/init.py
+++ b/unittest/sql/optimizer/cost_model_utils/init.py
--- a/unittest/sql/optimizer/cost_model_utils/apply_array_model.py
+++ b/unittest/sql/optimizer/cost_model_utils/apply_array_model.py
@ -0,0 +1,88 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+import getopt
+import sys
+import math
+
+
+
+def array_model_form(args,
+                     params):
+    """Evaluate the array cost model for one element count.
+
+    args   -- the element count itself (a bare number, not a tuple).
+    params -- two coefficients (Telem_ence, Telem_copy): the first is charged
+              once per element, the second once per copied element.
+
+    The copied-element count is ELEM_PER_PAGE * (2**k - 1) where
+    k = ceil(log2(Nelem / ELEM_PER_PAGE)), clamped at zero.
+    NOTE(review): this looks like a capacity-doubling growth model; confirm
+    against the array implementation that is being measured.
+    """
+    ELEM_PER_PAGE = 1024
+    elem_cnt = args
+    Telem_ence, Telem_copy = params
+
+    # counts below one page give a negative logarithm, i.e. no growth at all
+    doublings = max(math.ceil(math.log(float(elem_cnt) / ELEM_PER_PAGE, 2)), 0)
+    copied = ELEM_PER_PAGE * (math.pow(2, doublings) - 1)
+
+    return Telem_ence * elem_cnt + Telem_copy * copied
+
+
+def extract_info_from_line(line):
+    """Parse one comma separated record into a list of floats.
+
+    Every field must be accepted by float(); a field that is not numeric
+    raises ValueError.
+    """
+    return [float(field) for field in line.split(",")]
+
+
+# Usage: apply_array_model.py [-i input] -o output -m model
+#   -i  data file to score (defaults to get_total.data.prep)
+#   -o  file that receives every record with the model cost appended
+#   -m  file whose first line holds the comma separated model parameters
+file_name = "get_total.data.prep"
+out_file_name = None
+model_file_name = None
+output_fit_res = False
+wrong_arg = False
+try:
+    opts, args = getopt.getopt(sys.argv[1:], "i:o:m:")
+except getopt.GetoptError:
+    # getopt raises on unknown flags instead of returning them, so this is
+    # the only place a bad option can be noticed
+    opts, args = [], []
+    wrong_arg = True
+for op, value in opts:
+    if "-i" == op:
+        file_name = value
+    elif "-o" == op:
+        output_fit_res = True
+        out_file_name = value
+    elif "-m" == op:
+        model_file_name = value
+
+# -o and -m have no default; without this check open() failed with NameError
+if wrong_arg or out_file_name is None or model_file_name is None:
+    print("wrong arg")
+    sys.exit(1)
+
+with open(model_file_name, "r") as model_file:
+    line = model_file.readline()
+model_params = [float(p) for p in line.split(",")]
+
+with open(file_name, "r") as input_file:
+    with open(out_file_name, "w") as out_file:
+        for line in input_file:
+            # comment lines are copied through unchanged
+            if line.startswith('#'):
+                out_file.write(line)
+                continue
+            case_param = extract_info_from_line(line)
+            # column 0 is the element count fed to the model; the remaining
+            # columns are only echoed back as part of the record
+            args = case_param[0]
+            cost_val = array_model_form(args, model_params)
+
+            new_line = ",".join([line.strip(), str(cost_val)])
+            new_line += "\n"
+            out_file.write(new_line)
+
+
--- a/unittest/sql/optimizer/cost_model_utils/apply_hg_model.py
+++ b/unittest/sql/optimizer/cost_model_utils/apply_hg_model.py
@ -0,0 +1,100 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+import getopt
+import sys
+import math
+
+
+def mg_model_form(args,
+                  params
+                  ):
+    """Evaluate the group-by cost formula used by this script.
+
+    args   -- (Nrow_input, Nrow_res, Ncol_input, Ncol_aggr, Ncol_group)
+    params -- (Tstartup, Trow_once, Tres_once, Taggr_prepare_result,
+               Taggr_process, Tgroup_cmp_col, Tcopy_col)
+
+    Returns a fixed startup term plus per-input-row and per-result-row terms.
+    NOTE(review): the function is called mg_model_form although it lives in
+    apply_hg_model.py -- presumably copied from the merge group-by script.
+    """
+    Nrow_input, Nrow_res, Ncol_input, Ncol_aggr, Ncol_group = args
+    (Tstartup, Trow_once, Tres_once, Taggr_prepare_result,
+     Taggr_process, Tgroup_cmp_col, Tcopy_col) = params
+
+    # fixed part plus the flat per-result and per-input charges
+    base = Tstartup + Nrow_res * Tres_once + Nrow_input * Trow_once
+    # group column comparison, charged for every input row
+    judge_group = Nrow_input * Ncol_group * Tgroup_cmp_col
+    # work done once per result row
+    copy_cols = Nrow_res * (Ncol_input * Tcopy_col)
+    prepare_aggr = Nrow_res * (Ncol_aggr * Taggr_prepare_result)
+    # aggregate processing, charged for every input row
+    process_aggr = Nrow_input * (Ncol_aggr * Taggr_process)
+
+    return base + judge_group + copy_cols + prepare_aggr + process_aggr
+
+
+
+
+def extract_info_from_line(line):
+    """Split a comma separated record and convert every field to float."""
+    fields = line.split(",")
+    return [float(text) for text in fields]
+
+
+
+# Usage: apply_hg_model.py [-i input] -o output -m model
+#   -i  data file to score (defaults to get_total.data.prep)
+#   -o  file that receives every record plus cost, time and error percentage
+#   -m  file whose first line holds the comma separated model parameters
+file_name = "get_total.data.prep"
+out_file_name = None
+model_file_name = None
+output_fit_res = False
+wrong_arg = False
+try:
+    opts, args = getopt.getopt(sys.argv[1:], "i:o:m:")
+except getopt.GetoptError:
+    # getopt raises on unknown flags instead of returning them, so this is
+    # the only place a bad option can be noticed
+    opts, args = [], []
+    wrong_arg = True
+for op, value in opts:
+    if "-i" == op:
+        file_name = value
+    elif "-o" == op:
+        output_fit_res = True
+        out_file_name = value
+    elif "-m" == op:
+        model_file_name = value
+
+# -o and -m have no default; without this check open() failed with NameError
+if wrong_arg or out_file_name is None or model_file_name is None:
+    print("wrong arg")
+    sys.exit(1)
+
+with open(model_file_name, "r") as model_file:
+    line = model_file.readline()
+model_params = [float(p) for p in line.split(",")]
+
+with open(file_name, "r") as input_file:
+    with open(out_file_name, "w") as out_file:
+        for line in input_file:
+            # comment lines cannot be parsed as numbers; copy them through
+            if line.startswith('#'):
+                out_file.write(line)
+                continue
+            case_param = extract_info_from_line(line)
+            # reorder the record columns into the model's argument order
+            args = (case_param[0],   # Nrow_input
+                    case_param[5],   # Nrow_res
+                    case_param[4],   # Ncol_input
+                    case_param[2],   # Ncol_aggr
+                    case_param[3])   # Ncol_group
+            time = case_param[6]
+            cost_val = mg_model_form(args, model_params)
+            # relative error of the predicted cost against column 6
+            percent = (cost_val - time) / time
+
+            new_line = ",".join([line.strip(), "\t", str(cost_val), "\t", str(time), "\t\t", str(percent * 100)])
+            new_line += "\n"
+            out_file.write(new_line)
+
+
+
--- a/unittest/sql/optimizer/cost_model_utils/apply_material_model.py
+++ b/unittest/sql/optimizer/cost_model_utils/apply_material_model.py
@ -0,0 +1,82 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+import getopt
+import sys
+import math
+
+
+
+def material_model_form(args,
+                        params):
+    """Evaluate the material cost model.
+
+    args   -- (Nrow, Ncol)
+    params -- (Trow_once, Trow_col)
+
+    Every row is charged Trow_once plus Trow_col for each of its Ncol
+    columns; the model has no startup term.
+    """
+    Nrow, Ncol = args
+    Trow_once, Trow_col = params
+
+    per_row = Trow_once + Ncol * Trow_col
+    total_cost = 0  # a startup term would be added here
+    total_cost += Nrow * per_row
+    return total_cost
+
+
+def extract_info_from_line(line):
+    """Return the comma separated fields of *line* converted to floats."""
+    return list(map(float, line.split(",")))
+
+
+# Usage: apply_material_model.py [-i input] -o output -m model
+#   -i  data file to score (defaults to get_total.data.prep)
+#   -o  file that receives every record plus cost, time and error percentage
+#   -m  file whose first line holds the comma separated model parameters
+file_name = "get_total.data.prep"
+out_file_name = None
+model_file_name = None
+output_fit_res = False
+wrong_arg = False
+try:
+    opts, args = getopt.getopt(sys.argv[1:], "i:o:m:")
+except getopt.GetoptError:
+    # getopt raises on unknown flags instead of returning them, so this is
+    # the only place a bad option can be noticed
+    opts, args = [], []
+    wrong_arg = True
+for op, value in opts:
+    if "-i" == op:
+        file_name = value
+    elif "-o" == op:
+        output_fit_res = True
+        out_file_name = value
+    elif "-m" == op:
+        model_file_name = value
+
+# -o and -m have no default; without this check open() failed with NameError
+if wrong_arg or out_file_name is None or model_file_name is None:
+    print("wrong arg")
+    sys.exit(1)
+
+with open(model_file_name, "r") as model_file:
+    line = model_file.readline()
+model_params = [float(p) for p in line.split(",")]
+
+with open(file_name, "r") as input_file:
+    with open(out_file_name, "w") as out_file:
+        for line in input_file:
+            # comment lines are copied through unchanged
+            if line.startswith('#'):
+                out_file.write(line)
+                continue
+            case_param = extract_info_from_line(line)
+            args = (case_param[0],   # Nrow
+                    case_param[1])   # Ncol
+            time = case_param[3]
+            cost_val = material_model_form(args, model_params)
+            # relative error of the predicted cost against column 3
+            percent = (cost_val - time) / time
+
+            new_line = ",".join([line.strip(), "\t", str(cost_val), "\t", str(time), "\t\t", str(percent * 100)])
+            new_line += "\n"
+            out_file.write(new_line)
+
+
--- a/unittest/sql/optimizer/cost_model_utils/apply_merge_model.py
+++ b/unittest/sql/optimizer/cost_model_utils/apply_merge_model.py
@ -0,0 +1,146 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+import getopt
+import sys
+import math
+
+
+
+def merge_model_form(args,
+                     params
+                     ):
+    """Evaluate the merge join cost formula.
+
+    args   -- (Nrow_res, Nrow_left, Nrow_right, Nright_cache_in,
+               Nright_cache_out, Nright_cache_clear, Nequal_cond)
+    params -- (Tstartup, Tres_right_op, Tres_right_cache, Tmatch_group,
+               Tequal_fail, Trow_left, Trow_right)
+
+    Returns Tstartup plus one weighted term per counter.
+    """
+    (Nrow_res, Nrow_left, Nrow_right, Nright_cache_in,
+     Nright_cache_out, Nright_cache_clear, Nequal_cond) = args
+    (Tstartup, Tres_right_op, Tres_right_cache, Tmatch_group,
+     Tequal_fail, Trow_left, Trow_right) = params
+
+    left_rows = Nrow_left * Trow_left
+    # right rows that went into the cache are charged separately below
+    right_rows = (Nrow_right - Nright_cache_in) * Trow_right
+    cache_in = Nright_cache_in * Tres_right_op
+    cache_out = Nright_cache_out * Tres_right_cache
+    cache_clear = Nright_cache_clear * Tmatch_group
+    # NOTE(review): "2 * Tmatch_group" subtracts a fitted coefficient from two
+    # counters; a count such as Nright_cache_clear may have been intended.
+    # Left as is because the fitted parameters depend on this exact form.
+    failed_cond = (Nequal_cond - Nrow_res - 2 * Tmatch_group) * Tequal_fail
+
+    return (Tstartup + left_rows + right_rows + cache_in + cache_out
+            + cache_clear + failed_cond)
+#
+# def merge_model_form(args,
+#                      params
+#                      ):
+#     (
+#         Nrow_res,
+#         Nrow_left,
+#         Nrow_right,
+#         Nright_cache_in,
+#         Nright_cache_out,
+#         Nright_cache_clear,
+#         Nequal_cond,
+#     ) = args
+#
+#     (
+#         Tstartup,
+#         Tright_cache_in,
+#         Tright_cache_out,
+#         Tright_cache_clear,
+#         Tassemble_row,
+#         Tequal_fail,
+#         Trow_left,
+#         #Trow_right
+#     ) = params
+#
+#     total_cost = Tstartup
+#     total_cost += Nright_cache_in * Tright_cache_in
+#     total_cost += (Nright_cache_out - Nright_cache_clear) * Tright_cache_out
+#     total_cost += Nright_cache_clear * Tright_cache_clear
+#     total_cost += Nrow_res * Tassemble_row
+#     total_cost += (Nequal_cond - Nrow_res - 2 * Tright_cache_clear) * Tequal_fail
+#     total_cost += Nrow_left * Trow_left
+#     #total_cost += (Nrow_right - Nright_cache_in) * Trow_right
+#
+#     return total_cost
+
+
+
+def extract_info_from_line(line):
+    """Parse one comma separated record into a list of floats."""
+    return [float(field) for field in line.split(",")]
+
+
+# Usage: apply_merge_model.py [-i input] -o output -m model
+#   -i  data file to score (defaults to get_total.data.prep)
+#   -o  file that receives every record plus cost, time and error percentage
+#   -m  file whose first line holds the comma separated model parameters
+file_name = "get_total.data.prep"
+out_file_name = None
+model_file_name = None
+output_fit_res = False
+wrong_arg = False
+try:
+    opts, args = getopt.getopt(sys.argv[1:], "i:o:m:")
+except getopt.GetoptError:
+    # getopt raises on unknown flags instead of returning them, so this is
+    # the only place a bad option can be noticed
+    opts, args = [], []
+    wrong_arg = True
+for op, value in opts:
+    if "-i" == op:
+        file_name = value
+    elif "-o" == op:
+        output_fit_res = True
+        out_file_name = value
+    elif "-m" == op:
+        model_file_name = value
+
+# -o and -m have no default; without this check open() failed with NameError
+if wrong_arg or out_file_name is None or model_file_name is None:
+    print("wrong arg")
+    sys.exit(1)
+
+with open(model_file_name, "r") as model_file:
+    line = model_file.readline()
+model_params = [float(p) for p in line.split(",")]
+
+with open(file_name, "r") as input_file:
+    with open(out_file_name, "w") as out_file:
+        for line in input_file:
+            # comment lines cannot be parsed as numbers; copy them through
+            if line.startswith('#'):
+                out_file.write(line)
+                continue
+            case_param = extract_info_from_line(line)
+            # the three cache counters are the last three columns of a record
+            args = (case_param[6],     # Nrow_res
+                    case_param[0],     # Nrow_left
+                    case_param[1],     # Nrow_right
+                    case_param[-3],    # Nright_cache_in
+                    case_param[-2],    # Nright_cache_out
+                    case_param[-1],    # Nright_cache_clear
+                    case_param[8])     # Nequal_cond
+            time = case_param[7]
+            cost_val = merge_model_form(args, model_params)
+            # relative error of the predicted cost against column 7
+            percent = (cost_val - time) / time
+
+            new_line = ",".join([line.strip(), "\t", str(cost_val), "\t", str(time), "\t\t", str(percent * 100)])
+            new_line += "\n"
+            out_file.write(new_line)
+
+
+
--- a/unittest/sql/optimizer/cost_model_utils/apply_mg_model.py
+++ b/unittest/sql/optimizer/cost_model_utils/apply_mg_model.py
@ -0,0 +1,101 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+import getopt
+import sys
+import math
+
+
+def mg_model_form(args,
+                  params
+                  ):
+    """Evaluate the merge group-by cost formula.
+
+    args   -- (Nrow_input, Nrow_res, Ncol_input, Ncol_aggr, Ncol_group)
+    params -- (Trow_once, Tres_once, Taggr_prepare_result, Taggr_process,
+               Tgroup_cmp_col, Tcopy_col); there is no startup coefficient.
+
+    The group comparison is charged once per result row and Ncol_group times
+    for each of the remaining (Nrow_input - Nrow_res) input rows.
+    """
+    Nrow_input, Nrow_res, Ncol_input, Ncol_aggr, Ncol_group = args
+    (Trow_once, Tres_once, Taggr_prepare_result,
+     Taggr_process, Tgroup_cmp_col, Tcopy_col) = params
+
+    # flat per-result and per-input charges
+    base = Nrow_res * Tres_once + Nrow_input * Trow_once
+    # group boundary detection
+    cmp_boundary = Nrow_res * Tgroup_cmp_col
+    cmp_same_group = (Nrow_input - Nrow_res) * Ncol_group * Tgroup_cmp_col
+    # work done once per result row
+    copy_cols = Nrow_res * (Ncol_input * Tcopy_col)
+    prepare_aggr = Nrow_res * (Ncol_aggr * Taggr_prepare_result)
+    # aggregate processing, charged for every input row
+    process_aggr = Nrow_input * (Ncol_aggr * Taggr_process)
+
+    return (base + cmp_boundary + cmp_same_group + copy_cols
+            + prepare_aggr + process_aggr)
+
+
+
+
+def extract_info_from_line(line):
+    """Split a comma separated record and convert every field to float."""
+    fields = line.split(",")
+    return [float(text) for text in fields]
+
+
+
+# Usage: apply_mg_model.py [-i input] -o output -m model
+#   -i  data file to score (defaults to get_total.data.prep)
+#   -o  file that receives every record plus cost, time and error percentage
+#   -m  file whose first line holds the comma separated model parameters
+file_name = "get_total.data.prep"
+out_file_name = None
+model_file_name = None
+output_fit_res = False
+wrong_arg = False
+try:
+    opts, args = getopt.getopt(sys.argv[1:], "i:o:m:")
+except getopt.GetoptError:
+    # getopt raises on unknown flags instead of returning them, so this is
+    # the only place a bad option can be noticed
+    opts, args = [], []
+    wrong_arg = True
+for op, value in opts:
+    if "-i" == op:
+        file_name = value
+    elif "-o" == op:
+        output_fit_res = True
+        out_file_name = value
+    elif "-m" == op:
+        model_file_name = value
+
+# -o and -m have no default; without this check open() failed with NameError
+if wrong_arg or out_file_name is None or model_file_name is None:
+    print("wrong arg")
+    sys.exit(1)
+
+with open(model_file_name, "r") as model_file:
+    line = model_file.readline()
+model_params = [float(p) for p in line.split(",")]
+
+with open(file_name, "r") as input_file:
+    with open(out_file_name, "w") as out_file:
+        for line in input_file:
+            # comment lines cannot be parsed as numbers; copy them through
+            if line.startswith('#'):
+                out_file.write(line)
+                continue
+            case_param = extract_info_from_line(line)
+            # reorder the record columns into the model's argument order
+            args = (case_param[0],   # Nrow_input
+                    case_param[5],   # Nrow_res
+                    case_param[4],   # Ncol_input
+                    case_param[2],   # Ncol_aggr
+                    case_param[3])   # Ncol_group
+            time = case_param[6]
+            cost_val = mg_model_form(args, model_params)
+            # relative error of the predicted cost against column 6
+            percent = (cost_val - time) / time
+
+            new_line = ",".join([line.strip(), "\t", str(cost_val), "\t", str(time), "\t\t", str(percent * 100)])
+            new_line += "\n"
+            out_file.write(new_line)
+
+
+
--- a/unittest/sql/optimizer/cost_model_utils/apply_nl_model.py
+++ b/unittest/sql/optimizer/cost_model_utils/apply_nl_model.py
@ -0,0 +1,97 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+import getopt
+import sys
+import math
+
+
+
+def nl_model_form(args,
+                  params
+                  ):
+    """Evaluate the nested loop join cost formula.
+
+    args   -- (Nrow_res, Nrow_left, Nrow_right, Nright_cache_in,
+               Nright_cache_out, Nright_cache_clear, Nequal_cond); the three
+               cache counters are unpacked but do not enter the formula.
+    params -- (Tstartup, Tres, Tfail, Tleft_row, Tright_row)
+    """
+    (Nrow_res, Nrow_left, Nrow_right, Nright_cache_in,
+     Nright_cache_out, Nright_cache_clear, Nequal_cond) = args
+    Tstartup, Tres, Tfail, Tleft_row, Tright_row = params
+
+    produced = Nrow_res * Tres
+    # condition evaluations that did not produce a result row
+    rejected = (Nequal_cond - Nrow_res) * Tfail
+    left_scan = Nrow_left * Tleft_row
+    right_scan = Nrow_right * Tright_row
+
+    return Tstartup + produced + rejected + left_scan + right_scan
+
+
+def extract_info_from_line(line):
+    """Return the comma separated fields of *line* converted to floats."""
+    return list(map(float, line.split(",")))
+
+
+# Usage: apply_nl_model.py [-i input] -o output -m model
+#   -i  data file to score (defaults to get_total.data.prep)
+#   -o  file that receives every record plus cost, time and error percentage
+#   -m  file whose first line holds the comma separated model parameters
+file_name = "get_total.data.prep"
+out_file_name = None
+model_file_name = None
+output_fit_res = False
+wrong_arg = False
+try:
+    opts, args = getopt.getopt(sys.argv[1:], "i:o:m:")
+except getopt.GetoptError:
+    # getopt raises on unknown flags instead of returning them, so this is
+    # the only place a bad option can be noticed
+    opts, args = [], []
+    wrong_arg = True
+for op, value in opts:
+    if "-i" == op:
+        file_name = value
+    elif "-o" == op:
+        output_fit_res = True
+        out_file_name = value
+    elif "-m" == op:
+        model_file_name = value
+
+# -o and -m have no default; without this check open() failed with NameError
+if wrong_arg or out_file_name is None or model_file_name is None:
+    print("wrong arg")
+    sys.exit(1)
+
+with open(model_file_name, "r") as model_file:
+    line = model_file.readline()
+model_params = [float(p) for p in line.split(",")]
+
+with open(file_name, "r") as input_file:
+    with open(out_file_name, "w") as out_file:
+        for line in input_file:
+            # comment lines cannot be parsed as numbers; copy them through
+            if line.startswith('#'):
+                out_file.write(line)
+                continue
+            case_param = extract_info_from_line(line)
+            # the three cache counters are the last three columns of a record
+            args = (case_param[6],     # Nrow_res
+                    case_param[0],     # Nrow_left
+                    case_param[1],     # Nrow_right
+                    case_param[-3],    # Nright_cache_in
+                    case_param[-2],    # Nright_cache_out
+                    case_param[-1],    # Nright_cache_clear
+                    case_param[8])     # Nequal_cond
+            time = case_param[7]
+            cost_val = nl_model_form(args, model_params)
+            # relative error of the predicted cost against column 7
+            percent = (cost_val - time) / time
+
+            new_line = ",".join([line.strip(), "\t", str(cost_val), "\t", str(time), "\t\t", str(percent * 100)])
+            new_line += "\n"
+            out_file.write(new_line)
+
+
+
--- a/unittest/sql/optimizer/cost_model_utils/apply_sort_model.py
+++ b/unittest/sql/optimizer/cost_model_utils/apply_sort_model.py
@ -0,0 +1,213 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+import getopt
+import sys
+import math
+
+
+
+def material_model_form(args):
+    """Evaluate the material cost model with hard-coded coefficients.
+
+    args -- (Nrow, Ncol)
+
+    Every row is charged Trow_once plus Trow_col for each of its Ncol
+    columns; there is no startup term.
+    """
+    Trow_once = 0.07931677
+    Trow_col = 0.02674675
+    Nrow, Ncol = args
+
+    per_row = Trow_once + Ncol * Trow_col
+    total_cost = 0  # a startup term would be added here
+    total_cost += Nrow * per_row
+    return total_cost
+
+def array_model_form(args):
+    """Evaluate the array cost model with hard-coded coefficients.
+
+    args -- the element count itself (a bare number, not a tuple).
+
+    The copied-element count is ELEM_PER_PAGE * (2**k - 1) where
+    k = ceil(log2(Nelem / ELEM_PER_PAGE)), clamped at zero.
+    """
+    ELEM_PER_PAGE = 1024
+    Telem_ence = 0.00898860
+    Telem_copy = 0.00631888
+    elem_cnt = args
+
+    # counts below one page give a negative logarithm, i.e. no growth at all
+    doublings = max(math.ceil(math.log(float(elem_cnt) / ELEM_PER_PAGE, 2)), 0)
+    copied = ELEM_PER_PAGE * (math.pow(2, doublings) - 1)
+
+    return Telem_ence * elem_cnt + Telem_copy * copied
+
+def get_row_size(reserve, col):
+    """Estimate the size of one row.
+
+    The result is 16 + 16 * reserve + (col / 8) * 263 + col / 8, where 263 is
+    the sum of the per-type sizes spelled out below.
+    NOTE(review): presumably bytes, with col / 8 being the number of columns
+    of each of eight types -- confirm against the benchmark schema.
+    """
+    col_units = col / 8
+    size = 16 + reserve * 16
+    size += col_units * (3 + 8 + 4 + 8 + 16 + 32 + 64 + 128)
+    return size + col_units
+
+def get_miss_prob(Nrow, Ncol, Nord, Turn):
+    """Return the miss probability 1 - hit for the given data volume.
+
+    The hit ratio is Turn / total_size, capped at 0.9 once Turn reaches 90%
+    of the total size; total_size is Nrow * get_row_size(Nord, Ncol).
+    NOTE(review): Turn is used as the covered size (presumably TLB coverage);
+    confirm its unit against the fitted model.
+    """
+    covered = Turn
+    total_size = Nrow * get_row_size(Nord, Ncol)
+    hit = 0.9 if covered >= 0.9 * total_size else covered / total_size
+    return 1 - hit
+
+
+def sort_model_form(args,
+                    params
+                    ):
+    """Evaluate the sort cost model: Nrow * log2(Nrow) comparisons.
+
+    args   -- (Nrow, Ncol, Nordering)
+    params -- (Tcompare, Tmiss_K1, Turn)
+
+    One comparison is charged Tcompare (scaled by Nordering capped at 1) plus
+    Tmiss_K1 weighted by get_miss_prob(). Row-store and array-push costs are
+    not part of this formula.
+    """
+    Nrow, Ncol, Nordering = args
+    Tcompare, Tmiss_K1, Turn = params
+
+    # ordering columns beyond the first are not charged
+    Nordering_cmp = 1 if Nordering >= 1 else Nordering
+    compare_cost = Tcompare * Nordering_cmp + Tmiss_K1 * get_miss_prob(Nrow, Ncol, Nordering, Turn)
+
+    total_cost = 0  # a startup term would be added here
+    total_cost += Nrow * compare_cost * math.log(Nrow, 2)
+    return total_cost
+
+#
+# def sort_model_form(args,
+#                     params):
+#     (
+#         Nrow,
+#         Nordering,
+#         Ncol,
+#     ) = args
+#
+#     (
+#         Tstartup,
+#         Trowstore_once,
+#         Trowstore_col,
+#         # Tarray_once,
+#         # Tarray_elem_copy,
+#         Treserve_cell,
+#         Tcompare
+#     ) = params
+#
+#
+#     total_cost = Tstartup
+#
+#     #cost for row store
+#     total_cost += Nrow * (Trowstore_once + Ncol * Trowstore_col)
+#     total_cost += Treserve_cell * Nrow * Ncol * Nordering
+#
+#     #cost for array
+#     # ELEM_PER_PAGE = 1024
+#     # extend_cnt = math.ceil(math.log(float(Nrow)/ELEM_PER_PAGE, 2))
+#     # copy_cnt = ELEM_PER_PAGE * (math.pow(2, extend_cnt) - 1)
+#     #total_cost += Tarray_once * Nrow + Tarray_elem_copy * copy_cnt
+#
+#     #cost for sorting
+#     if Nordering > 2:
+#         Nordering_cmp = 2
+#     else:
+#         Nordering_cmp = Nordering
+#     compare_cost = Tcompare * Nordering_cmp
+#     total_cost += Nrow * compare_cost * math.log(Nrow, 2)
+#
+#     return total_cost
+
+
+def extract_info_from_line(line):
+    """Parse one comma separated record into a list of floats."""
+    return [float(field) for field in line.split(",")]
+
+
+# sys.argv.extend('-i sort.prep.double -o sort.fit.double -m sort.model.double'.split())
+
+# Usage: apply_sort_model.py [-i input] -o output -m model
+#   -i  data file to score (defaults to get_total.data.prep)
+#   -o  file that receives every record plus cost and error percentage
+#   -m  file whose first line holds the comma separated model parameters
+file_name = "get_total.data.prep"
+out_file_name = None
+model_file_name = None
+output_fit_res = False
+wrong_arg = False
+try:
+    opts, args = getopt.getopt(sys.argv[1:], "i:o:m:")
+except getopt.GetoptError:
+    # getopt raises on unknown flags instead of returning them, so this is
+    # the only place a bad option can be noticed
+    opts, args = [], []
+    wrong_arg = True
+for op, value in opts:
+    if "-i" == op:
+        file_name = value
+    elif "-o" == op:
+        output_fit_res = True
+        out_file_name = value
+    elif "-m" == op:
+        model_file_name = value
+
+# -o and -m have no default; without this check open() failed with NameError
+if wrong_arg or out_file_name is None or model_file_name is None:
+    print("wrong arg")
+    sys.exit(1)
+
+with open(model_file_name, "r") as model_file:
+    line = model_file.readline()
+model_params = [float(p) for p in line.split(",")]
+
+with open(file_name, "r") as input_file:
+    with open(out_file_name, "w") as out_file:
+        for line in input_file:
+            # comment lines are copied through unchanged
+            if line.startswith('#'):
+                out_file.write(line)
+                continue
+            case_param = extract_info_from_line(line)
+            args = (case_param[0],   # Nrow
+                    case_param[1],   # Ncol
+                    case_param[2])   # Nordering
+            time = case_param[4]
+            cost_val = sort_model_form(args, model_params)
+            # relative error of the predicted cost against column 4
+            percent = (cost_val - time) / time
+
+            new_line = ",".join([line.strip(), str(cost_val), str(percent * 100)])
+            new_line += "\n"
+            out_file.write(new_line)
+
+
+
--- a/unittest/sql/optimizer/cost_model_utils/benchmaster_array.py
+++ b/unittest/sql/optimizer/cost_model_utils/benchmaster_array.py
@ -0,0 +1,33 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+import subprocess as sp
+import os
+
+# Runs cost_model_util once per (row count, repetition) and appends the raw
+# output to array_result; the file is removed first so a run starts clean.
+if os.path.exists("array_result"):
+    os.remove("array_result")
+
+#cmd_form = 'LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -GB -s c10k1.schema -t array -r 1000000'
+cmd_form = 'LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -G -s c10k1.schema -t array -r 1000000'
+cmd_elements = cmd_form.split(" ")
+
+minrc = 1
+maxrc = 1100001
+step = 1000
+case_run_time = 5
+
+row_counts = xrange(minrc, maxrc + 1, step)
+# Count what the loop really visits: (maxrc - minrc) / step left out the final
+# row count, so the progress line used to end at "1101 / 1100".
+total_case_count = len(row_counts)
+case_count = 0
+
+print "Total case count %s ..." % (total_case_count)
+for row_count in row_counts:
+    # the last command element is the value passed to -r
+    cmd_elements[-1] = str(row_count)
+
+    case_count += 1
+    prompt = "Running case %s / %s ... : %s " % (case_count, total_case_count, " ".join(cmd_elements))
+    print prompt
+    sp.check_call('echo "### %s" >> array_result' % prompt, shell=True)
+    for run_idx in xrange(case_run_time):
+        # echo -n writes the row count without a newline, so the tool output
+        # appended next continues the same line
+        sp.check_call("echo -n '%s,' >> array_result"%(row_count), shell=True)
+        sp.check_call(" ".join(cmd_elements) + ' >> array_result', shell=True)
+
--- a/unittest/sql/optimizer/cost_model_utils/benchmaster_material.py
+++ b/unittest/sql/optimizer/cost_model_utils/benchmaster_material.py
@ -0,0 +1,36 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+import subprocess as sp
+import os
+
+# Runs cost_model_util for every (row count, column count) pair, several times
+# each, appending "<rows>,<cols>,<tool output>" records to material_result.
+for stale in ("material_result", "material_final_result"):
+    if os.path.exists(stale):
+        os.remove(stale)
+
+# cmd_form = "./cost_model_util -B -t material -s c10k1.schema -r 1000 -p 1 >> material_result"
+cmd_form = "./cost_model_util -G -t material -s c10k1.schema -r 1000 -p 1 >> material_result"
+cmd_elements = cmd_form.split(" ")
+
+row_count_max = 10001
+row_count_step = 100
+
+column_counts = [3, 5, 8]
+
+case_run_time = 7
+
+total_case_count = (row_count_max / row_count_step + 1) * len(column_counts) * case_run_time
+case_count = 0
+
+print "Total case count %s ..." % (total_case_count)
+for row_count in xrange(1, row_count_max + 1, row_count_step):
+    # element 7 is the value of -r, element 9 the value of -p
+    cmd_elements[7] = str(row_count)
+    for column_count in column_counts:
+        cmd_elements[9] = str(column_count)
+        command = " ".join(cmd_elements)
+        for run_idx in xrange(case_run_time):
+            case_count += 1
+            # the two parameters are written first, without a newline
+            sp.check_call("echo -n '%s,' >> material_result" % (row_count), shell=True)
+            sp.check_call("echo -n '%s,' >> material_result" % (column_count), shell=True)
+            print "Running case %s / %s ... : %s " % (case_count, total_case_count, command)
+            sp.check_call(command, shell=True)
--- a/unittest/sql/optimizer/cost_model_utils/benchmaster_merge.py
+++ b/unittest/sql/optimizer/cost_model_utils/benchmaster_merge.py
@ -0,0 +1,102 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+import subprocess as sp
+import os
+import sys
+import getopt
+import time
+
+# Sweeps six merge join parameters, runs cost_model_util case_run_time times
+# for every combination and appends the results to the file given by -o
+# (merge_result by default).
+ISOTIMEFORMAT='%Y-%m-%d %X'
+
+#cmd_form = "./cost_model_util -B -s c10k1x2.schema -t merge -r 900 -r 900 -Z1 -Z1 -C 2 -C 2 -V 3 -V 3 >> res"
+cmd_form = "./cost_model_util -G -s c10k1x2.schema -t merge -r 900 -r 900 -Z1 -Z1 -C 2 -C 2 -V 3 -V 3 >> res"
+cmd_elements = cmd_form.split(" ")
+
+# not referenced by the sweep below
+row_count_max = 10001
+row_count_step = 100
+
+
+left_row_counts = [5000, 10000, 50000, 100000]
+right_row_counts = [5000, 10000, 50000, 100000]
+
+left_steps = [1, 3, 4, 5, 7, 10]
+right_steps = [1, 3, 4, 5, 7, 10]
+
+left_step_lengths = [1, 2, 4, 5, 10]
+right_step_lengths = [1, 2, 4, 5, 10]
+
+case_run_time = 7
+
+# every combination of the six swept lists is repeated case_run_time times
+total_case_count = case_run_time
+for swept in (left_row_counts, right_row_counts, left_steps, right_steps,
+              left_step_lengths, right_step_lengths):
+    total_case_count *= len(swept)
+
+
+out_file_name = "merge_result"
+wrong_arg = False
+opts,args = getopt.getopt(sys.argv[1:],"o:")
+for op, value in opts:
+    if op != "-o":
+        wrong_arg = True
+        continue
+    out_file_name = value
+
+if wrong_arg:
+    print "wrong arg"
+    sys.exit(1)
+
+
+case_count = 0
+# the final command element is the redirect target after ">>"
+cmd_elements[-1] = out_file_name
+if os.path.exists(out_file_name):
+    os.remove(out_file_name)
+
+print "Total case count %s ..." % (total_case_count)
+# command elements 7/9 are the -r values, 13/15 the -C values, 17/19 the -V values
+for left_row_count in left_row_counts:
+    cmd_elements[7] = str(left_row_count)
+    for right_row_count in right_row_counts:
+        cmd_elements[9] = str(right_row_count)
+        for left_step in left_steps:
+            cmd_elements[13] = str(left_step)
+            for right_step in right_steps:
+                cmd_elements[15] = str(right_step)
+                for left_step_length in left_step_lengths:
+                    cmd_elements[17] = str(left_step_length)
+                    for right_step_length in right_step_lengths:
+                        cmd_elements[19] = str(right_step_length)
+
+                        params = [str(p) for p in [left_row_count, right_row_count, left_step, right_step, left_step_length, right_step_length]]
+                        record_prefix = ",".join(params)
+                        command = " ".join(cmd_elements)
+
+                        for i in xrange(case_run_time):
+                            case_count += 1
+                            prompt = "%s Running case %s / %s ... : %s " % (time.strftime( ISOTIMEFORMAT, time.localtime()), case_count, total_case_count, command)
+                            print prompt
+
+                            # a '#' comment line, then the parameters without a newline
+                            sp.check_call("echo '#%s' >> %s"%(prompt, out_file_name), shell=True)
+                            sp.check_call("echo -n '%s,' >> %s"%(record_prefix, out_file_name), shell=True)
+                            sp.check_call(command, shell=True)
+
+
+
+
+#
+# total_case_count = (row_count_max / row_count_step + 1) * len(column_counts) * case_run_time
+# case_count = 0
+#
+# print "Total case count %s ..." % (total_case_count)
+# for row_count in xrange(1, row_count_max + 1, row_count_step):
+#     for column_count in column_counts:
+#         for time in xrange(case_run_time):
+#             case_count += 1
+#             cmd_elements[7] = str(row_count)
+#             cmd_elements[9] = str(column_count)
+#             sp.check_call("echo -n '%s,' >> material_result"%(row_count), shell=True)
+#             sp.check_call("echo -n '%s,' >> material_result"%(column_count), shell=True)
+#             print "Running case %s / %s ... : %s " % (case_count, total_case_count, " ".join(cmd_elements))
+#             sp.check_call(" ".join(cmd_elements), shell=True)
+#
--- a/unittest/sql/optimizer/cost_model_utils/benchmaster_mergegroupby.py
+++ b/unittest/sql/optimizer/cost_model_utils/benchmaster_mergegroupby.py
@ -0,0 +1,87 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+import subprocess as sp
+import os
+import sys
+import getopt
+import time
+
+# Sweeps the merge group-by parameters, runs cost_model_util case_run_time
+# times for every combination and appends the results to the file given by -o
+# (mergegroupby_result by default).
+ISOTIMEFORMAT = '%Y-%m-%d %X'
+
+# cmd_form = "./cost_model_util -t mg -B -s c10k1.schema -r 10000 -Z 1 -V 10 -e 1 -o 10 -p 1 >> out_file"
+cmd_form = "./cost_model_util -t mg -G -s c10k1.schema -r 10000 -Z 1 -V 10 -e 1 -o 10 -p 1 >> mergegroupby_result"
+cmd_elements = cmd_form.split(" ")
+
+row_counts = [10, 30, 50, 70, 100, 1000, 5000, 10000]
+steps = [1, 3, 5, 10, 20]
+aggr_funcs = [1, 4, 7, 10]
+group_cols = [1, 4, 7, 10]
+non_group_cols = [10]
+
+case_run_time = 7
+
+total_case_count = len(row_counts)
+total_case_count *= len(steps)
+total_case_count *= len(aggr_funcs)
+total_case_count *= len(group_cols)
+total_case_count *= len(non_group_cols)
+total_case_count *= case_run_time
+
+print total_case_count
+wrong_arg = False
+
+# The output file is removed only after -o has been parsed: deleting the
+# default name up front wiped an unrelated result file when -o was given.
+out_file_name = "mergegroupby_result"
+opts, args = getopt.getopt(sys.argv[1:], "o:")
+for op, value in opts:
+    if "-o" == op:
+        out_file_name = value
+    else:
+        wrong_arg = True
+
+if wrong_arg:
+    print "wrong arg"
+    sys.exit(1)
+
+case_count = 0
+# the final command element (index 19) is the redirect target after ">>"
+cmd_elements[-1] = out_file_name
+if os.path.exists(out_file_name):
+    os.remove(out_file_name)
+
+print "Total case count %s ..." % (total_case_count)
+
+for row_count in row_counts:
+    for step in steps:
+        for aggr_func in aggr_funcs:
+            for group_col in group_cols:
+                for non_group_col in non_group_cols:
+                    for run_time in xrange(case_run_time):
+
+                        # values of -r, -V, -e, -o and -p respectively
+                        cmd_elements[7] = str(row_count)
+                        cmd_elements[11] = str(step)
+                        cmd_elements[13] = str(aggr_func)
+                        cmd_elements[15] = str(group_col)
+                        cmd_elements[17] = str(non_group_col)
+
+                        param = ",".join([cmd_elements[7],
+                                          cmd_elements[11],
+                                          cmd_elements[13],
+                                          cmd_elements[15],
+                                          cmd_elements[17]]) + ","
+
+                        # count first so the progress line runs 1..total
+                        # instead of starting at "case 0"
+                        case_count += 1
+
+                        prompt = "%s Running case %s / %s ... : %s " % (
+                        time.strftime(ISOTIMEFORMAT, time.localtime()), case_count, total_case_count,
+                        " ".join(cmd_elements))
+                        print prompt
+
+                        sp.check_call("echo '#%s' >> %s" % (prompt, out_file_name), shell=True)
+
+                        # only combinations with group_col <= non_group_col are
+                        # run; the others are recorded as a skipped comment
+                        if group_col <= non_group_col:
+                            sp.check_call("echo -n '%s' >> %s" % (param, out_file_name), shell=True)
+                            sp.check_call(" ".join(cmd_elements), shell=True)
+                        else:
+                            sp.check_call("echo '#%s skipped' >> %s" % (param, out_file_name), shell=True)
--- a/unittest/sql/optimizer/cost_model_utils/benchmaster_miss.py
+++ b/unittest/sql/optimizer/cost_model_utils/benchmaster_miss.py
@ -0,0 +1,101 @@
+#!/bin/env python
+# -*- coding: utf-8 -*-
+__author__ = 'dongyun.zdy'
+
+import datetime
+import multiprocessing
+import MySQLdb
+import Queue
+import signal
+import re
+import argparse
+import time
+import sys
+import subprocess as sp
+import os
+
+outfile = 'miss.result'  # result file; the script below appends to it with shell ">>"
+schema_file = 'miss.schema'  # schema file removed and rewritten by make_schema()
+if os.path.exists(outfile):  # drop the result file of a previous run, if any
+    os.remove(outfile)
+
+
+def remove_schema():
+    """Delete the schema file if it currently exists; do nothing otherwise."""
+    if not os.path.exists(schema_file):
+        return
+    os.remove(schema_file)
+
+
+def write_schema(s):
+    """Write the schema statement ``s`` to the schema file, replacing its content.
+
+    The file is opened in a ``with`` block so the handle is closed even when
+    the write itself raises.
+    """
+    with open(schema_file, 'w') as of:
+        of.write(s)
+
+
+def make_seq(t, cnt):
+    """Return a new list holding ``cnt`` copies of ``t``."""
+    return [t] * cnt
+
+
+def make_schema(types):
+    """Build the ``create table t1`` statement for ``types`` and store it.
+
+    Columns are named c1, c2, ... in the order given and c1 is declared the
+    primary key.  The statement is appended to the result file as a ``#``
+    line and then written to the schema file.
+    """
+    remove_schema()
+    col_defs = ["c%d %s, " % (idx, col_type)
+                for idx, col_type in enumerate(types, 1)]
+    # Slice off the ", " that trails the last column definition.
+    stmt = ("create table t1 (" + "".join(col_defs))[:-2]
+    stmt += ', primary key (c1))'
+    run_cmd('echo "# %s" >> ' % stmt + outfile)
+    write_schema(stmt)
+
+
+def run_cmd(cmd):
+    """Run ``cmd`` through the shell and return everything it printed.
+
+    stderr is merged into stdout.  The call blocks until the child exits.
+    """
+    proc = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.STDOUT)
+    pieces = []
+    while True:
+        chunk = proc.stdout.readline()
+        if not chunk:
+            # readline() returns an empty string once the pipe is at EOF.
+            break
+        pieces.append(chunk)
+        sys.stdout.flush()
+    proc.wait()
+    return ''.join(pieces)
+
+
+#cmd_form1 = 'LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -BGK -t material -s miss.schema -r 500000'.split()
+# Command template; the last element (the -r value) is replaced for every case.
+cmd_form1 = 'LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -GK -t material -s miss.schema -r 500000'.split()
+
+types_to_test = {
+    'bigint': 'bigint',
+    'double': 'double',
+    'float': 'float',
+    'timestamp': 'timestamp',
+    'number': 'number(20,3)',
+    'v32': 'varchar(32)',
+    'v64': 'varchar(64)',
+    'v128': 'varchar(128)',
+}
+row_counts = [1000, 2000, 4000, 7000, 8000, 10000, 20000, 50000]
+input_col_cnts = [1, 2, 3, 6]
+case_run_time = 7
+
+total_case_count = len(row_counts) * len(input_col_cnts)
+case_count = 0
+
+print "Total case count %s ..." % (total_case_count)
+for col_count in input_col_cnts:
+    # The schema repeats the sorted type list col_count times.
+    make_schema(sorted(types_to_test.values()) * col_count)
+    for row_count in row_counts:
+        cmd_form1[-1] = str(row_count)
+        case_count += 1
+        base_cmd = " ".join(cmd_form1)
+        prompt = "Running case %s / %s ... : %s " % (case_count, total_case_count, base_cmd)
+        print prompt
+        sp.check_call('echo "### %s" >> ' % prompt + outfile, shell=True)
+        caseinfo = '%d,%d,' % (row_count, col_count)
+        # All -i3 repetitions run first, then all -i4 repetitions.
+        for info_opt in (" -i3", " -i4"):
+            for t in xrange(case_run_time):
+                print t
+                res = caseinfo + run_cmd(base_cmd + info_opt).strip()
+                run_cmd('echo "%s" >> ' % (res) + outfile)
--- a/unittest/sql/optimizer/cost_model_utils/benchmaster_nl.py
+++ b/unittest/sql/optimizer/cost_model_utils/benchmaster_nl.py
@ -0,0 +1,103 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+import subprocess as sp
+import os
+import sys
+import getopt
+import time
+
+ISOTIMEFORMAT = '%Y-%m-%d %X'
+
+# Command template.  After splitting on single spaces the per-case values sit
+# at indexes 7/9 (-r), 13/15 (-C) and 17/19 (-V); the last element is the
+# redirect target.
+# cmd_form = "./cost_model_util -B -s c10k1x2.schema -t nestloop -r 900 -r 900 -Z1 -Z1 -C 2 -C 2 -V 3 -V 3 >> res"
+cmd_form = "./cost_model_util -G -s c10k1x2.schema -t nestloop -r 900 -r 900 -Z1 -Z1 -C 2 -C 2 -V 3 -V 3 >> nl_result"
+cmd_elements = cmd_form.split(" ")
+
+row_count_max = 10001
+row_count_step = 100
+
+left_row_counts = [10, 100, 500, 1000]
+right_row_counts = [10, 100, 500, 1000]
+
+left_steps = [1, 3, 4, 5, 7, 10]
+right_steps = [1, 3, 4, 5, 7, 10]
+
+left_step_lengths = [1, 2, 4, 5, 10]
+right_step_lengths = [1, 2, 4, 5, 10]
+
+case_run_time = 7
+
+# Every combination of the six value lists is run case_run_time times.
+total_case_count = case_run_time
+for value_list in (left_row_counts, right_row_counts,
+                   left_steps, right_steps,
+                   left_step_lengths, right_step_lengths):
+    total_case_count *= len(value_list)
+
+#out_file_name = "nestloop_result"
+out_file_name = "nl_result"
+wrong_arg = False
+opts, args = getopt.getopt(sys.argv[1:], "o:")
+for op, value in opts:
+    if "-o" == op:
+        out_file_name = value
+    else:
+        wrong_arg = True
+
+if wrong_arg:
+    print "wrong arg"
+    sys.exit(1)
+
+case_count = 0
+cmd_elements[-1] = out_file_name
+if os.path.exists(out_file_name):
+    os.remove(out_file_name)
+
+print "Total case count %s ..." % (total_case_count)
+for left_row_count in left_row_counts:
+    for right_row_count in right_row_counts:
+        for left_step in left_steps:
+            for right_step in right_steps:
+                for left_step_length in left_step_lengths:
+                    for right_step_length in right_step_lengths:
+                        for run_idx in xrange(case_run_time):
+                            case_count += 1
+                            case_values = [str(v) for v in (left_row_count, right_row_count,
+                                                            left_step, right_step,
+                                                            left_step_length, right_step_length)]
+                            for slot, text in zip((7, 9, 13, 15, 17, 19), case_values):
+                                cmd_elements[slot] = text
+
+                            prompt = "%s Running case %s / %s ... : %s " % (
+                                time.strftime(ISOTIMEFORMAT, time.localtime()),
+                                case_count, total_case_count, " ".join(cmd_elements))
+                            print prompt
+
+                            # A '#' line with the prompt, then the case values as a CSV
+                            # prefix (no newline), then the benchmark command itself.
+                            sp.check_call("echo '#%s' >> %s" % (prompt, out_file_name), shell=True)
+                            sp.check_call("echo -n '%s,' >> %s" % (",".join(case_values), out_file_name), shell=True)
+                            sp.check_call(" ".join(cmd_elements), shell=True)
+
+
+
+
+#
+# total_case_count = (row_count_max / row_count_step + 1) * len(column_counts) * case_run_time
+# case_count = 0
+#
+# print "Total case count %s ..." % (total_case_count)
+# for row_count in xrange(1, row_count_max + 1, row_count_step):
+#     for column_count in column_counts:
+#         for time in xrange(case_run_time):
+#             case_count += 1
+#             cmd_elements[7] = str(row_count)
+#             cmd_elements[9] = str(column_count)
+#             sp.check_call("echo -n '%s,' >> material_result"%(row_count), shell=True)
+#             sp.check_call("echo -n '%s,' >> material_result"%(column_count), shell=True)
+#             print "Running case %s / %s ... : %s " % (case_count, total_case_count, " ".join(cmd_elements))
+#             sp.check_call(" ".join(cmd_elements), shell=True)
+#
--- a/unittest/sql/optimizer/cost_model_utils/benchmaster_rowstore.py
+++ b/unittest/sql/optimizer/cost_model_utils/benchmaster_rowstore.py
@ -0,0 +1,76 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+import subprocess as sp
+import os
+
+schema_file = 'rowstore.schema'  # schema file removed and rewritten by make_schema()
+outfile = 'rowstore.result'  # result file; removed further down if it already exists
+
+
+def remove_schema():
+    """Delete the schema file if it currently exists; do nothing otherwise."""
+    if not os.path.exists(schema_file):
+        return
+    os.remove(schema_file)
+
+def write_schema(s):
+    """Write the schema statement ``s`` to the schema file, replacing its content.
+
+    The file is opened in a ``with`` block so the handle is closed even when
+    the write itself raises.
+    """
+    with open(schema_file, 'w') as of:
+        of.write(s)
+
+
+def make_seq(t, cnt):
+    """Return a new list holding ``cnt`` copies of ``t``."""
+    return [t] * cnt
+
+
+def make_schema(types):
+    """Build the ``create table t1`` statement for ``types`` and store it.
+
+    Columns are named c1, c2, ... in the order given and c1 is declared the
+    primary key.  The statement is printed and then written to the schema
+    file.
+    """
+    remove_schema()
+    col_defs = ["c%d %s, " % (idx, col_type)
+                for idx, col_type in enumerate(types, 1)]
+    # Slice off the ", " that trails the last column definition.
+    stmt = ("create table t1 (" + "".join(col_defs))[:-2]
+    stmt += ', primary key (c1))'
+    print stmt
+    write_schema(stmt)
+
+
+# Command template, split into argv-style tokens.
+cmdform = 'LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -RBGK -t material -s rowstore.schema -r 10 -i1'.split()
+
+
+# Short type label -> column type used in the generated schema.
+types_to_test = {
+    'bigint': 'bigint',
+    'double': 'double',
+    'float': 'float',
+    'timestamp': 'timestamp',
+    'number': 'number(20,3)',
+    'v32': 'varchar(32)',
+    'v64': 'varchar(64)',
+    'v128': 'varchar(128)',
+}
+row_counts = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 20000, 50000]
+col_nums = [1, 3, 20, 50]
+case_run_time = 7
+
+case_count = 0
+total_case_count = len(types_to_test) * len(row_counts) * len(col_nums)
+
+# Start from an empty result file.
+if os.path.exists(outfile):
+    os.remove(outfile)
+
+# for t in types_to_test:
+#     outfile = 'rowstore.result.' + t
+#     if os.path.exists(outfile):
+#         os.remove(outfile)
+#     for n in col_nums:
+#         make_schema(make_seq(types_to_test[t], n))
+#         for rc in row_counts:
+#             cmdform[8] = str(rc)
+#             case_count += 1
+#             prompt = "# %d / %d %s col_cnt = %d rc = %d \n# %s" % (case_count, total_case_count, t, n, rc, ' '.join(cmdform))
+#             print prompt
+#             sp.check_call('echo "%s" >> ' % prompt + outfile, shell=True)
+#             for times in xrange(0, case_run_time):
+#                 print times
+#                 sp.check_call("echo -n '%s,' >> " % str(rc) + outfile, shell=True)
+#                 sp.check_call("echo -n '%s,' >> " % str(n) + outfile, shell=True)
+#                 sp.check_call(" ".join(cmdform) + ' >> ' + outfile, shell=True)
+
+make_schema(make_seq('bigint', 50))  # writes a schema with 50 bigint columns; the benchmark loop above is commented out
+
+
--- a/unittest/sql/optimizer/cost_model_utils/benchmaster_sort.py
+++ b/unittest/sql/optimizer/cost_model_utils/benchmaster_sort.py
@ -0,0 +1,44 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+import subprocess as sp
+import os
+
+# Start from an empty result file.
+if os.path.exists("sort_result"):
+    os.remove("sort_result")
+
+# Command template.  After splitting on single spaces the per-case values sit
+# at indexes 8 (-r), 10 (-c) and 12 (-p).
+#cmd_form = "LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -GBR -t sort -s c20.schema -r 1000 -c 10 -p 10 >> sort_result"
+cmd_form = "LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -GR -t sort -s sort.schema -r 1000 -c 10 -p 10 >> sort_result"
+cmd_elements = cmd_form.split(" ")
+
+row_counts = [1, 100, 500, 800, 1000, 3000, 5000, 8000, 9000, 10000, 20000, 40000, 60000, 70000, 100000, 300000]
+column_counts = [1, 2, 3, 4, 5]
+#input_col_cnts = [15, 30, 45]
+input_col_cnts = [3, 5, 9] #schema file related, col counts should not be less than projector count
+case_run_time = 7
+
+total_case_count = len(row_counts) * len(column_counts) * len(input_col_cnts)
+case_count = 0
+
+print "Total case count %s ..." % (total_case_count)
+for row_count in row_counts:
+    cmd_elements[8] = str(row_count)
+    for column_count in column_counts:
+        cmd_elements[10] = str(column_count)
+        for input_col in input_col_cnts:
+            cmd_elements[12] = str(input_col)
+
+            case_count += 1
+            prompt = "Running case %s / %s ... : %s " % (case_count, total_case_count, " ".join(cmd_elements))
+            print prompt
+            sp.check_call('echo "### %s" >> sort_result' % prompt, shell=True)
+            if column_count > input_col:
+                # Skipped combination: the -c value exceeds the -p value.
+                print "### PASS"
+                sp.check_call('echo "### PASS" >> sort_result', shell=True)
+            else:
+                for run_no in xrange(case_run_time):
+                    print "running the %d time" % run_no
+                    # CSV prefix (no newline) in front of whatever the tool appends.
+                    for field in (row_count, column_count, input_col):
+                        sp.check_call("echo -n '%s,' >> sort_result" % (field), shell=True)
+                    sp.check_call(" ".join(cmd_elements), shell=True)
+
--- a/unittest/sql/optimizer/cost_model_utils/benchmaster_sort_add.py
+++ b/unittest/sql/optimizer/cost_model_utils/benchmaster_sort_add.py
@ -0,0 +1,97 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+import subprocess as sp
+import os
+from cost_test_conf import Config
+
+schema_file = 'sort.schema'  # schema file rewritten by make_schema(); also put into cmd_form[6] below
+outfile = 'sort.result'  # initial value; reassigned to 'sort_add_<type>_result' further down
+
+
+def remove_schema():
+    """Delete the schema file if it currently exists; do nothing otherwise."""
+    if not os.path.exists(schema_file):
+        return
+    os.remove(schema_file)
+
+def write_schema(s):
+    """Write the schema statement ``s`` to the schema file, replacing its content.
+
+    The file is opened in a ``with`` block so the handle is closed even when
+    the write itself raises.
+    """
+    with open(schema_file, 'w') as of:
+        of.write(s)
+
+
+def make_seq(t, cnt):
+    """Return a new list holding ``cnt`` copies of ``t``."""
+    return [t] * cnt
+
+
+def make_schema(types):
+    """Build the ``create table t1`` statement for ``types`` and store it.
+
+    Columns are named c1, c2, ... in the order given and c1 is declared the
+    primary key.  The statement is printed and then written to the schema
+    file.
+    """
+    remove_schema()
+    col_defs = ["c%d %s, " % (idx, col_type)
+                for idx, col_type in enumerate(types, 1)]
+    # Slice off the ", " that trails the last column definition.
+    stmt = ("create table t1 (" + "".join(col_defs))[:-2]
+    stmt += ', primary key (c1))'
+    print stmt
+    write_schema(stmt)
+
+# Remove a stale "sort_result" file if one exists.
+if os.path.exists("sort_result"):
+    os.remove("sort_result")
+
+# Command template as a token list: index 6 is the schema file, index 8 the
+# -r value and index -2 the -c value.
+#cmd_form = "LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -GBR -t sort -s c20.schema -r 1000 -c 10  -i4".split()
+cmd_form = "LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -GR -t sort -s c20.schema -r 1000 -c 10  -i4".split()
+cmd_form[6] = schema_file
+
+
+# Short type label -> column type used in the generated schema.
+types_to_test = {
+    'bigint': 'bigint',
+    'double': 'double',
+    'float': 'float',
+    'timestamp': 'timestamp',
+    'number': 'number(20,3)',
+    'v32': 'varchar(32)',
+    'v64': 'varchar(64)',
+    'v128': 'varchar(128)',
+}
+row_counts = [1000, 2000, 4000, 8000, 10000, 20000, 50000]
+sort_column_counts = [1, 2, 3, 5]
+input_col_cnts = [1, 2, 6]
+case_run_time = 7
+
+keys = sorted(types_to_test)
+
+case_count = 0
+total_case_count = len(row_counts) * len(sort_column_counts) * len(input_col_cnts) * len(keys)
+
+
+
+
+def make_headed_seq(head, arr):
+    """Return the column types for ``arr`` with ``head`` moved to the front.
+
+    ``arr`` is a list of type labels containing ``head``; the remaining
+    labels keep their relative order.  Each label is translated through
+    ``types_to_test``.
+    """
+    pos = arr.index(head)
+    ordered = [head] + arr[:pos] + arr[pos + 1:]
+    return [types_to_test[label] for label in ordered]
+
+#for t in keys:
+# The Config class in cost_test_conf.py declares no class attribute named
+# u_to_test_type, so a plain ``Config.u_to_test_type`` raises AttributeError.
+# Read it with getattr so the script also works when nobody has set it.
+t = getattr(Config, 'u_to_test_type', None)
+if t is not None:
+    #outfile = 'sort.result.' + t
+    outfile = 'sort_add_' + t + '_' + 'result'
+    if os.path.exists(outfile):
+        os.remove(outfile)
+    for n in input_col_cnts:
+        # Schema: the type under test first, then the other types, repeated n times.
+        make_schema(make_headed_seq(t, keys) * n)
+        for rc in row_counts:
+            cmd_form[8] = str(rc)
+            for order_count in sort_column_counts:
+                cmd_form[-2] = str(order_count)
+                case_count += 1
+                prompt = "# %d / %d  %s col_cnt = %d rc = %d order_cnt = %d\n# %s" % (case_count, total_case_count, t, n * len(keys), rc, order_count, ' '.join(cmd_form))
+
+                print prompt
+                sp.check_call('echo "%s" >> ' % prompt + outfile, shell=True)
+                if order_count > n * len(keys):
+                    # More sort columns requested than the schema has columns.
+                    print 'PASS'
+                    sp.check_call('echo "# PASS" >> '  + outfile, shell=True)
+                    continue
+                for times in xrange(0, case_run_time):
+                    print times
+                    # CSV prefix (no newline) in front of the tool's own output.
+                    for field in (rc, n, order_count):
+                        sp.check_call("echo -n '%s,' >> " % str(field) + outfile, shell=True)
+                    sp.check_call(" ".join(cmd_form) + ' >> ' + outfile, shell=True)
+else:
+    print "Config.u_to_test_type is not set, nothing to run"
--- a/unittest/sql/optimizer/cost_model_utils/benchmaster_sort_with_type.py
+++ b/unittest/sql/optimizer/cost_model_utils/benchmaster_sort_with_type.py
@ -0,0 +1,94 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+import subprocess as sp
+import os
+
+schema_file = 'sort.schema'  # schema file rewritten by make_schema(); also put into cmd_form[6] below
+outfile = 'sort.result'  # initial value; reassigned to 'sort.result.<type>' in the main loop
+
+
+def remove_schema():
+    """Delete the schema file if it currently exists; do nothing otherwise."""
+    if not os.path.exists(schema_file):
+        return
+    os.remove(schema_file)
+
+
+def write_schema(s):
+    """Write the schema statement ``s`` to the schema file, replacing its content.
+
+    The file is opened in a ``with`` block so the handle is closed even when
+    the write itself raises.
+    """
+    with open(schema_file, 'w') as of:
+        of.write(s)
+
+
+def make_seq(t, cnt):
+    """Return a new list holding ``cnt`` copies of ``t``."""
+    return [t] * cnt
+
+
+def make_schema(types):
+    """Build the ``create table t1`` statement for ``types`` and store it.
+
+    Columns are named c1, c2, ... in the order given and c1 is declared the
+    primary key.  The statement is printed and then written to the schema
+    file.
+    """
+    remove_schema()
+    col_defs = ["c%d %s, " % (idx, col_type)
+                for idx, col_type in enumerate(types, 1)]
+    # Slice off the ", " that trails the last column definition.
+    stmt = ("create table t1 (" + "".join(col_defs))[:-2]
+    stmt += ', primary key (c1))'
+    print stmt
+    write_schema(stmt)
+
+
+# Remove a stale "sort_result" file if one exists.
+if os.path.exists("sort_result"):
+    os.remove("sort_result")
+
+# Command template as a token list: index 6 is the schema file, index 8 the
+# -r value and index -2 the -c value.
+cmd_form = "LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./cost_model_util -GBR -t sort -s c20.schema -r 1000 -c 10  -i1".split()
+cmd_form[6] = schema_file
+
+# Short type label -> column type used in the generated schema.
+types_to_test = {
+    'bigint': 'bigint',
+    'double': 'double',
+    'float': 'float',
+    'timestamp': 'timestamp',
+    'number': 'number(20,3)',
+    'v32': 'varchar(32)',
+    'v64': 'varchar(64)',
+    'v128': 'varchar(128)',
+}
+row_counts = [1000, 2000, 4000, 7000, 8000, 10000, 20000, 50000]
+sort_column_counts = [1, 2, 3]
+input_col_cnts = [1, 2, 3, 6]
+case_run_time = 7
+
+keys = sorted(types_to_test)
+
+case_count = 0
+total_case_count = len(row_counts) * len(sort_column_counts) * len(input_col_cnts) * len(keys)
+
+
+def make_headed_seq(head, arr):
+    """Return the column types for ``arr`` with ``head`` moved to the front.
+
+    ``arr`` is a list of type labels containing ``head``; the remaining
+    labels keep their relative order.  Each label is translated through
+    ``types_to_test``.
+    """
+    pos = arr.index(head)
+    ordered = [head] + arr[:pos] + arr[pos + 1:]
+    return [types_to_test[label] for label in ordered]
+
+
+# One result file per type label: 'sort.result.<label>'.
+for t in keys:
+    outfile = 'sort.result.' + t
+    if os.path.exists(outfile):
+        os.remove(outfile)
+    for n in input_col_cnts:
+        # Schema: the type under test first, then the other types, repeated n times.
+        make_schema(make_headed_seq(t, keys) * n)
+        for rc in row_counts:
+            cmd_form[8] = str(rc)
+            for order_count in sort_column_counts:
+                cmd_form[-2] = str(order_count)
+                case_count += 1
+                schema_cols = n * len(keys)
+                prompt = "# %d / %d  %s col_cnt = %d rc = %d order_cnt = %d\n# %s" % (
+                    case_count, total_case_count, t, schema_cols, rc, order_count, ' '.join(cmd_form))
+
+                print prompt
+                sp.check_call('echo "%s" >> ' % prompt + outfile, shell=True)
+                if order_count > schema_cols:
+                    # More sort columns requested than the schema has columns.
+                    print 'PASS'
+                    sp.check_call('echo "# PASS" >> ' + outfile, shell=True)
+                else:
+                    for times in xrange(0, case_run_time):
+                        print times
+                        # CSV prefix (no newline) in front of the tool's own output.
+                        for field in (rc, n, order_count):
+                            sp.check_call("echo -n '%s,' >> " % str(field) + outfile, shell=True)
+                        sp.check_call(" ".join(cmd_form) + ' >> ' + outfile, shell=True)
--- a/unittest/sql/optimizer/cost_model_utils/c10k1.schema
+++ b/unittest/sql/optimizer/cost_model_utils/c10k1.schema
@ -0,0 +1 @@
+create table t1 (c1 bigint, c2 bigint, c3 bigint, c4 bigint, c5 bigint, c6 bigint, c7 bigint, c8 bigint, c9 bigint, c10 bigint, primary key(c1))
--- a/unittest/sql/optimizer/cost_model_utils/c10k1x2.schema
+++ b/unittest/sql/optimizer/cost_model_utils/c10k1x2.schema
@ -0,0 +1,2 @@
+create table t1(c1 bigint,c2 bigint,c3 bigint,c4 bigint,c5 bigint,c6 bigint,c7 bigint,c8 bigint,c9 bigint,c10 bigint,primary key(c1))
+create table t2(c1 bigint,c2 bigint,c3 bigint,c4 bigint,c5 bigint,c6 bigint,c7 bigint,c8 bigint,c9 bigint,c10 bigint,primary key(c1))
--- a/unittest/sql/optimizer/cost_model_utils/cost_main.py
+++ b/unittest/sql/optimizer/cost_model_utils/cost_main.py
@ -0,0 +1,42 @@
+from mylog.mylog import MyLogger
+import subprocess as sp
+'''
+class Tester(object):
+    bench_script = "python benchmaster_{0}.py"
+    data_process_script = 'python preprocess.py -i {0} -o {1} -d'
+    fit_script = 'python fit_{0}.py'
+
+    def __init__(self, conf):
+        self.conf = conf
+
+    def do_all(self):
+        # MyLogger.log('try to do all test fit plot')
+        pass
+
+    def do_bench(self):
+        # MyLogger.log('try to do bench')
+        sp.check_call(Tester.bench_script.format(self.conf.u_to_test_op_c), shell=True)
+
+    def do_fit(self):
+        # MyLogger.log('try to do fit')
+        sp.check_call(Tester.fit_script.format(self.conf.u_to_test_op_c), shell=True)
+
+    def do_plot(self):
+        # MyLogger.log('try to do plot')
+        pass
+
+    def do_data_process(self):
+        if self.conf.u_to_test_type_c is None:
+            sp.check_call(Tester.data_process_script.format(self.conf.u_to_test_op_c + '_result',
+                                                            self.conf.u_to_test_op_c + '_result_final'), shell=True)
+        else:
+            sp.check_call(
+                Tester.data_process_script.format(self.conf.u_to_test_op + '_' + self.conf.u_to_test_type + '_result',
+                                                  self.conf.u_to_test_op + '_' + self.conf.u_to_test_type + '_result_final'
+                                                  ), shell=True)
+'''
+
+
+if __name__ == '__main__':
+    # Log the start, then run material.py in a child interpreter; a non-zero
+    # exit status makes check_call raise.
+    MyLogger.info("start to do cost model unittest")
+    bench_cmd = 'python %s' % ('material.py')
+    sp.check_call(bench_cmd, shell=True)
--- a/unittest/sql/optimizer/cost_model_utils/cost_test_conf.py
+++ b/unittest/sql/optimizer/cost_model_utils/cost_test_conf.py
@ -0,0 +1,118 @@
+class Config(object):
+    '''
+    User input for one cost_model_util run.
+
+    Every instance attribute whose name ends in ``_c`` is an option that
+    gen_params() can emit; config_map_dict gives the command-line switch for
+    each of them.
+    '''
+    ################
+    operators = {
+        'array': 'array',
+        'material': 'material',
+        'mergegroupby': 'mergegroupby',
+        'merge': 'merge',
+        'hash': 'hash',
+        'miss': 'miss',
+        'nl': 'nl',
+        'rowstore': 'rowstore',
+        'sort_add': 'sort_add',
+        'sort': 'sort',
+        'sort_with_type': 'sort_with_type'
+    }
+    types_to_test = {'bigint': 'bigint', 'double': 'double', 'float': 'float', 'timestamp': 'timestamp',
+                     'number': 'number(20,3)', 'v32': 'varchar(32)', 'v64': 'varchar(64)', 'v128': 'varchar(128)'}
+    # attribute name -> switch text appended by gen_params()
+    config_map_dict = {
+        'is_printing_help_c': ' -h ',
+        'schema_file_c': ' -s ',
+        'row_count_c': ' -r ',
+        'left_row_count_c': ' -r ',
+        'right_row_count_c': ' -r ',
+        'sort_col_count_c': ' -c ',
+        'input_projector_count_c': ' -p ',
+        'left_pj_c': ' -p ',
+        'right_pj_c': ' -p ',
+        'is_printing_output_c': ' -O ',
+        'equal_cond_count_c': ' -e ',
+        'other_cond_count_c': ' -o ',
+        'u_to_test_op_c': ' -t ',
+        'u_to_test_type_c': '',
+        'is_binding_cpu_c': ' -B ',
+        'seed_min_c': ' -Z ',
+        'left_min_c': ' -Z ',
+        'right_min_c': ' -Z ',
+        'seed_max_c': ' -X ',
+        'left_max_c': '-X',
+        'right_max_c': '-X',
+        'seed_step_c': ' -C ',
+        'left_seed_step_c': ' -C ',
+        'right_seed_step_c': ' -C ',
+        'seed_step_len_c': ' -V ',
+        'left_seed_step_len_c': ' -V ',
+        'right_seed_step_len_c': ' -V ',
+        'limit_c': ' -L ',
+        'is_random_c': ' -R ',
+        'is_experimental_c': ' -K ',
+        'sleep_before_test_c': ' -S ',
+        'add_sort_column_c': ' -T ',
+        'info_type_c': ' -i ',
+        'common_prefix_len_c': ' -l ',
+        'is_not_running_as_unittest_c': ' -G '
+    }
+
+    def __init__(self):
+        """Create a configuration with every option unset (None / False)."""
+        # config info based on cost_model_util.cpp
+        self.is_printing_help_c = False
+        self.schema_file_c = None
+        self.row_count_c = None
+        self.left_row_count_c = None
+        self.right_row_count_c = None
+        self.sort_col_count_c = None
+        self.input_projector_count_c = None
+        # Named with the "_c" suffix so they match the config_map_dict keys
+        # and are picked up by gen_params() (previously left_pj / right_pj).
+        self.left_pj_c = None
+        self.right_pj_c = None
+        self.is_printing_output_c = False
+        self.equal_cond_count_c = None
+        self.other_cond_count_c = None
+        self.u_to_test_op_c = None
+        self.u_to_test_type_c = None  # special
+        self.is_binding_cpu_c = False
+        self.seed_min_c = None
+        self.left_min_c = None
+        # Previously misspelled "rigt_min_c", which has no config_map_dict entry.
+        self.right_min_c = None
+        self.seed_max_c = None
+        self.left_max_c = None
+        self.right_max_c = None
+        self.seed_step_c = None
+        self.left_seed_step_c = None
+        self.right_seed_step_c = None
+        self.seed_step_len_c = None
+        self.left_seed_step_len_c = None
+        self.right_seed_step_len_c = None
+        self.limit_c = None
+        self.is_random_c = False
+        self.is_experimental_c = False
+        self.sleep_before_test_c = None
+        self.add_sort_column_c = None
+        self.info_type_c = None
+        self.common_prefix_len_c = None
+        self.is_not_running_as_unittest_c = False
+
+    def gen_params(self):
+        """Return the command-line argument string for the options that are set.
+
+        When help is requested only " -h " is returned.  Otherwise the
+        ``_c`` attributes are visited in the (alphabetical) order of
+        ``dir(self)``: attributes whose name starts with "is" contribute
+        their switch when they are exactly True, all others contribute the
+        switch followed by the value when the value is not None.
+
+        Raises KeyError when a set ``_c`` attribute has no entry in
+        config_map_dict.
+        """
+        if self.is_printing_help_c:
+            return " -h "
+        args = " "
+        for key in dir(self):
+            if not key.endswith('_c') or key == 'is_printing_help_c':
+                continue
+            val = getattr(self, key)
+            # MyLogger.info("config object %s %s", key, val)
+            if key.startswith('is'):
+                if val is True:
+                    args += Config.config_map_dict[key]
+            elif val is not None:
+                args += Config.config_map_dict[key] + " " + str(val) + " "
+        return args
+
+
+if __name__ == '__main__':
+    # Smoke check: with help requested gen_params() yields only the help switch.
+    demo_conf = Config()
+    demo_conf.is_printing_help_c = True
+    print demo_conf.gen_params()
--- a/unittest/sql/optimizer/cost_model_utils/fit_array.py
+++ b/unittest/sql/optimizer/cost_model_utils/fit_array.py
@ -0,0 +1,148 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+
+
+import math
+import numpy as np
+from scipy.optimize import leastsq
+from scipy.optimize import curve_fit
+import sys
+from lmfit import Model
+import getopt
+import os
+#
+# def array_model_form(args):
+#     # (
+#     #     Nelem,
+#     # ) = args
+#
+#     Telem_ence = 0.00898860
+#     Telem_copy = 0.00631888
+#
+#     Nelem = args
+#
+#     ELEM_PER_PAGE = 1024
+#     extend_cnt = math.ceil(math.log(float(Nelem)/ELEM_PER_PAGE, 2))
+#     if extend_cnt < 0:
+#         extend_cnt = 0
+#     copy_cnt = ELEM_PER_PAGE * (math.pow(2, extend_cnt) - 1)
+#
+#     total_cost = Telem_ence * Nelem
+#     #total_cost += Tmem_alloc * extend_cnt
+#     total_cost += Telem_copy * copy_cnt
+#
+#     return total_cost
+
+def array_model_form(args,
+                     #Tstartup,
+                     Telem_ence,
+                     Telem_copy,
+                     #Tmem_alloc
+                     ):
+    """Return the modelled cost of an array holding ``args`` elements.
+
+    args       -- element count (a single number, not a tuple)
+    Telem_ence -- cost factor applied once per element
+    Telem_copy -- cost factor applied once per copied element
+
+    The model takes the number of extensions as ceil(log2(Nelem / 1024)),
+    never below zero, and the number of copied elements as
+    1024 * (2 ** extensions - 1).
+    """
+    Nelem = args
+
+    ELEM_PER_PAGE = 1024
+    if Nelem > ELEM_PER_PAGE:
+        extend_cnt = math.ceil(math.log(float(Nelem) / ELEM_PER_PAGE, 2))
+    else:
+        # At most one page: nothing is extended.  Checking this first also
+        # avoids log(0), which raises a math domain error for Nelem == 0.
+        extend_cnt = 0
+    copy_cnt = ELEM_PER_PAGE * (math.pow(2, extend_cnt) - 1)
+
+    total_cost = Telem_ence * Nelem
+    #total_cost += Tmem_alloc * extend_cnt
+    total_cost += Telem_copy * copy_cnt
+
+    return total_cost
+
+def material_model_arr(arg_sets,
+                       # Tstartup,
+                       Telem_ence,
+                       Telem_copy,
+                       #Tmem_alloc
+                       ):
+    """Evaluate array_model_form for every entry of ``arg_sets``.
+
+    Returns a numpy array with one cost per entry, in input order.
+    """
+    costs = [array_model_form(elem_count, Telem_ence, Telem_copy)
+             for elem_count in arg_sets]
+    return np.array(costs)
+
+# lmfit model around material_model_arr; both parameters are bounded below by 0.
+# (The Tstartup and Tmem_alloc parameters are currently disabled.)
+material_model = Model(material_model_arr)
+for hinted_param in ("Telem_ence", "Telem_copy"):
+    material_model.set_param_hint(hinted_param, min=0.0)
+
+def extract_info_from_line(line):
+    """Split a comma-separated line and return its fields as a list of floats.
+
+    Raises ValueError when a field cannot be converted by float().
+    """
+    return [float(field) for field in line.split(",")]
+
+
+if __name__ == '__main__':
+    # Fit Telem_ence / Telem_copy to measured data.
+    #   -i FILE  input data, default "array_result_final"
+    #   -o FILE  output file for the fitted parameters, default "array_model"
+    #file_name = "scan_model.res.formal.prep"
+    #out_file_name = "scan_model.fit"
+    file_name = "array_result_final"
+    out_file_name = "array_model"
+    if os.path.exists(out_file_name):
+        os.remove(out_file_name)
+    #sys.argv.extend("-i arr.prep -o arr.model".split(" "))
+
+    output_fit_res = True
+    wrong_arg = False
+    opts, args = getopt.getopt(sys.argv[1:], "i:o:")
+    for op, value in opts:
+        if "-i" == op:
+            file_name = value
+        elif "-o" == op:
+            output_fit_res = True
+            out_file_name = value
+        else:
+            wrong_arg = True
+
+    if wrong_arg:
+        print "wrong arg"
+        sys.exit(1)
+
+    arg_sets = []
+    times = []
+    case_params = []
+    # "with" closes the input even when a line fails to parse.
+    with open(file_name, "r") as data_file:
+        for line in data_file:
+            # Skip comment lines and blank lines (float('') would raise).
+            if line.startswith('#') or not line.strip():
+                continue
+            case_param = extract_info_from_line(line)
+            case_params.append(case_param)
+            # Column 0 is the element count, column 1 the measured time.
+            arg_sets.append((case_param[0]))
+            times.append(case_param[1])
+    arg_sets_np = np.array(arg_sets)
+    times_np = np.array(times)
+    #10, 0.20406430879623488, 0.016618100054245379, 14.0, 4.5, 37.0, -0.005, 0.5, -7.0
+    result = material_model.fit(times_np, arg_sets=arg_sets_np,
+                                # Tstartup=10.0,
+                                Telem_ence=1.0,
+                                Telem_copy=1.0,
+                                #Tmem_alloc=1.0
+                                )
+
+    # Output format: "<Telem_ence>,<Telem_copy>"
+    res_line = ",".join(str(result.best_values[name])
+                        for name in ("Telem_ence", "Telem_copy"))
+
+    print result.fit_report()
+
+    if output_fit_res:
+        with open(out_file_name, "w") as out_file:
+            out_file.write(res_line)
--- a/unittest/sql/optimizer/cost_model_utils/fit_hg.py
+++ b/unittest/sql/optimizer/cost_model_utils/fit_hg.py
@ -0,0 +1,162 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+
+
+import math
+import numpy as np
+from scipy.optimize import leastsq
+from scipy.optimize import curve_fit
+import sys
+from lmfit import Model
+import getopt
+
+
+
+def mg_model_form(args,
+                  Tstartup,
+                  Trow_once,
+                  Tres_once,
+                  Taggr_prepare_result,
+                  Taggr_process,
+                  Tgroup_hash_col,
+                  Tcopy_col
+                  ):
+    """Return the modelled group-by cost for one case.
+
+    ``args`` is the 5-tuple (Nrow_input, Nrow_res, Ncol_input, Ncol_aggr,
+    Ncol_group); the remaining parameters are the cost factors being fitted.
+    """
+    Nrow_input, Nrow_res, Ncol_input, Ncol_aggr, Ncol_group = args
+
+    # Fixed part plus the per-result-row and per-input-row terms.
+    base_cost = Tstartup + Nrow_res * Tres_once + Nrow_input * Trow_once
+    # cost for judge group
+    group_judge_cost = Nrow_input * Ncol_group * Tgroup_hash_col
+    # cost for group related operation
+    copy_cost = Nrow_res * (Ncol_input * Tcopy_col)
+    prepare_cost = Nrow_res * (Ncol_aggr * Taggr_prepare_result)
+    # cost for input row process
+    process_cost = Nrow_input * (Ncol_aggr * Taggr_process)
+
+    # Summed left to right, i.e. in the same order the terms are listed.
+    return base_cost + group_judge_cost + copy_cost + prepare_cost + process_cost
+
+
+
+
+# Number of completed mg_model_arr evaluations.
+eval_count = 0
+
+
+def mg_model_arr(arg_sets,
+                 Tstartup,
+                 Trow_once,
+                 Tres_once,
+                 Taggr_prepare_result,
+                 Taggr_process,
+                 Tgroup_hash_col,
+                 Tcopy_col
+                 ) :
+    """Evaluate mg_model_form for every entry of ``arg_sets``.
+
+    Returns a numpy array with one cost per entry, in input order.  Each
+    completed call increments the global ``eval_count`` and prints it.
+    """
+    global eval_count
+    costs = []
+    for single_arg_set in arg_sets:
+        costs.append(mg_model_form(single_arg_set,
+                                   Tstartup,
+                                   Trow_once,
+                                   Tres_once,
+                                   Taggr_prepare_result,
+                                   Taggr_process,
+                                   Tgroup_hash_col,
+                                   Tcopy_col))
+    eval_count += 1
+    print "eval " + str(eval_count)
+    return np.array(costs)
+
+# lmfit model around mg_model_arr; every cost factor is bounded below by 0.
+mg_model = Model(mg_model_arr)
+for hinted_param in ("Tstartup",
+                     "Trow_once",
+                     "Tres_once",
+                     "Taggr_prepare_result",
+                     "Taggr_process",
+                     "Tgroup_hash_col",
+                     "Tcopy_col"):
+    mg_model.set_param_hint(hinted_param, min=0.0)
+def extract_info_from_line(line):
+    """Split a comma-separated line and return its fields as a list of floats.
+
+    Raises ValueError when a field cannot be converted by float().
+    """
+    return [float(field) for field in line.split(",")]
+
+
+if __name__ == '__main__':
+    # Fit the group-by cost factors to measured data.
+    #   -i FILE  input data, default "scan_model.res.formal.prep"
+    #   -o FILE  write the fitted parameters to FILE (nothing is written
+    #            without -o)
+    file_name = "scan_model.res.formal.prep"
+    out_file_name = "scan_model.fit"
+
+    output_fit_res = False
+    wrong_arg = False
+    opts, args = getopt.getopt(sys.argv[1:], "i:o:")
+    for op, value in opts:
+        if "-i" == op:
+            file_name = value
+        elif "-o" == op:
+            output_fit_res = True
+            out_file_name = value
+        else:
+            wrong_arg = True
+
+    if wrong_arg:
+        print "wrong arg"
+        sys.exit(1)
+
+    arg_sets = []
+    times = []
+    case_params = []
+    # "with" closes the input even when a line fails to parse.
+    with open(file_name, "r") as data_file:
+        for line in data_file:
+            # Skip comment lines and blank lines; float() cannot parse them.
+            if line.startswith('#') or not line.strip():
+                continue
+            case_param = extract_info_from_line(line)
+            case_params.append(case_param)
+
+            # Model argument order and the input column each one is read from:
+            #   Nrow_input <- column 0
+            #   Nrow_res   <- column 5
+            #   Ncol_input <- column 4
+            #   Ncol_aggr  <- column 2
+            #   Ncol_group <- column 3
+            arg_sets.append((case_param[0],
+                             case_param[5],
+                             case_param[4],
+                             case_param[2],
+                             case_param[3]
+                             ))
+            # Column 6 is the measured time.
+            times.append(case_param[6])
+    arg_sets_np = np.array(arg_sets)
+    times_np = np.array(times)
+    #10, 0.20406430879623488, 0.016618100054245379, 14.0, 4.5, 37.0, -0.005, 0.5, -7.0
+    result = mg_model.fit(times_np, arg_sets=arg_sets_np,
+                          Tstartup=0.1,
+                          Trow_once=0.1,
+                          Tres_once=0.1,
+                          Taggr_prepare_result=0.1,
+                          Taggr_process=0.1,
+                          Tgroup_hash_col=0.1,
+                          Tcopy_col=0.1
+                          )
+
+    # Output format: the fitted values, comma separated, in this order.
+    res_line = ",".join(str(result.best_values[name])
+                        for name in ("Tstartup",
+                                     "Trow_once",
+                                     "Tres_once",
+                                     "Taggr_prepare_result",
+                                     "Taggr_process",
+                                     "Tgroup_hash_col",
+                                     "Tcopy_col"))
+
+    print result.fit_report()
+
+    if output_fit_res:
+        with open(out_file_name, "w") as out_file:
+            out_file.write(res_line)
--- a/unittest/sql/optimizer/cost_model_utils/fit_material.py
+++ b/unittest/sql/optimizer/cost_model_utils/fit_material.py
@ -0,0 +1,111 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+
+import math
+import numpy as np
+from scipy.optimize import leastsq
+from scipy.optimize import curve_fit
+import sys
+from lmfit import Model
+import getopt
+import os
+
+
+def material_model_form(args, Trow_once, Trow_col):
+    """Return the modelled cost of one (Nrow, Ncol) case.
+
+    args is a 2-item sequence (Nrow, Ncol). Trow_once is the coefficient
+    charged once per row, Trow_col the coefficient charged per column of
+    every row.
+    """
+    Nrow, Ncol = args
+    # The Tstartup term is disabled, so the fixed part of the cost is zero.
+    startup_cost = 0
+    per_row_cost = Trow_once + Ncol * Trow_col
+    return startup_cost + Nrow * per_row_cost
+
+
+def material_model_arr(arg_sets, Trow_once, Trow_col):
+    """Evaluate material_model_form for every case in arg_sets.
+
+    Returns a numpy array holding one modelled cost per case, in input order.
+    """
+    costs = [material_model_form(one_case, Trow_once, Trow_col)
+             for one_case in arg_sets]
+    return np.array(costs)
+
+
+# Wrap the array-valued cost function in an lmfit Model so it can be fitted.
+material_model = Model(material_model_arr)
+# material_model.set_param_hint("Tstartup", min=0.0)
+# Both coefficients are constrained to be non-negative during the fit.
+material_model.set_param_hint("Trow_once", min=0.0)
+material_model.set_param_hint("Trow_col", min=0.0)
+
+
+def extract_info_from_line(line):
+    """Parse one comma-separated line into a list of floats."""
+    return [float(field) for field in line.split(",")]
+
+
+if __name__ == '__main__':
+    # Defaults used when -i / -o are not given on the command line.
+    file_name = "material_result_final"
+    out_file_name = "material_model"
+
+    # This script always writes the fitted coefficients.
+    output_fit_res = True
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], "i:o:")
+    except getopt.GetoptError:
+        # getopt raises for any option other than -i / -o.
+        print("wrong arg")
+        sys.exit(1)
+    for op, value in opts:
+        if "-i" == op:
+            file_name = value
+        elif "-o" == op:
+            out_file_name = value
+
+    # Remove a stale result only after the real output path is known, so that
+    # passing -o no longer deletes the unrelated default file.
+    if os.path.exists(out_file_name):
+        os.remove(out_file_name)
+
+    arg_sets = []
+    times = []
+    # The with-block closes the input even when a line fails to parse.
+    with open(file_name, "r") as data_file:
+        for line in data_file:
+            # Skip comment lines and blank lines (float('') would raise).
+            if line.startswith('#') or not line.strip():
+                continue
+            case_param = extract_info_from_line(line)
+            # Columns 0 and 1 feed the model as (Nrow, Ncol); column 3 is
+            # the value being fitted.
+            arg_sets.append((case_param[0], case_param[1]))
+            times.append(case_param[3])
+    arg_sets_np = np.array(arg_sets)
+    times_np = np.array(times)
+    result = material_model.fit(times_np, arg_sets=arg_sets_np,
+                                # Tstartup=10.0,
+                                Trow_once=10.0,
+                                Trow_col=1.0
+                                )
+
+    # Output format: "Trow_once,Trow_col" on a single line.
+    res_line = ",".join([str(result.best_values["Trow_once"]),
+                         str(result.best_values["Trow_col"])])
+
+    print(result.fit_report())
+
+    if output_fit_res:
+        with open(out_file_name, "w") as out_file:
+            out_file.write(res_line)
--- a/unittest/sql/optimizer/cost_model_utils/fit_merge.py
+++ b/unittest/sql/optimizer/cost_model_utils/fit_merge.py
@ -0,0 +1,167 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+
+
+import math
+import numpy as np
+from scipy.optimize import leastsq
+from scipy.optimize import curve_fit
+import sys
+from lmfit import Model
+import getopt
+
+def merge_model_form(args,
+                     Tstartup,
+                     Tres_right_op,
+                     Tres_right_cache,
+                     Tmatch_group,
+                     #Tassemble_row,
+                     Tequal_fail,
+                     Trow_left,
+                     Trow_right
+                     ):
+    """Return the modelled merge-join cost for one measured case.
+
+    args unpacks to seven counters (Nrow_res, Nrow_left, Nrow_right,
+    Nright_cache_in, Nright_cache_out, Nright_cache_clear, Nequal_cond); the
+    remaining arguments are the coefficients being fitted.
+    """
+    (Nrow_res, Nrow_left, Nrow_right,
+     Nright_cache_in, Nright_cache_out, Nright_cache_clear,
+     Nequal_cond) = args
+
+    left_rows = Nrow_left * Trow_left
+    # Right rows counted in Nright_cache_in are charged Tres_right_op instead
+    # of Trow_right.
+    right_rows = (Nrow_right - Nright_cache_in) * Trow_right
+    cache_in = Nright_cache_in * Tres_right_op
+    cache_out = Nright_cache_out * Tres_right_cache
+    cache_clear = Nright_cache_clear * Tmatch_group
+    # NOTE(review): this subtracts the coefficient Tmatch_group from row
+    # counts; a counter such as Nright_cache_clear may have been intended.
+    # Kept unchanged to preserve the fitted model - confirm before changing.
+    equal_fail = (Nequal_cond - Nrow_res - 2 * Tmatch_group) * Tequal_fail
+
+    return (Tstartup + left_rows + right_rows + cache_in + cache_out
+            + cache_clear + equal_fail)
+
+# Number of times merge_model_arr has been evaluated.
+eval_count = 0
+
+def merge_model_arr(arg_sets,
+                    Tstartup,
+                    Tres_right_op,
+                    Tres_right_cache,
+                    Tmatch_group,
+                    #Tassemble_row,
+                    Tequal_fail,
+                    Trow_left,
+                    Trow_right
+                    ):
+    """Evaluate merge_model_form for every case in arg_sets.
+
+    Returns a numpy array of modelled costs in input order and increments the
+    module-level eval_count once per call.
+    """
+    global eval_count
+    costs = []
+    for one_case in arg_sets:
+        costs.append(merge_model_form(one_case,
+                                      Tstartup,
+                                      Tres_right_op,
+                                      Tres_right_cache,
+                                      Tmatch_group,
+                                      Tequal_fail,
+                                      Trow_left,
+                                      Trow_right))
+    eval_count += 1
+    return np.array(costs)
+
+
+# Wrap the array-valued cost function in an lmfit Model so it can be fitted.
+merge_model = Model(merge_model_arr)
+# Every coefficient is constrained to be non-negative during the fit.
+merge_model.set_param_hint("Tstartup", min=0.0)
+merge_model.set_param_hint("Tres_right_op", min=0.0)
+merge_model.set_param_hint("Tres_right_cache", min=0.0)
+merge_model.set_param_hint("Tmatch_group", min=0.0)
+#merge_model.set_param_hint("Tassemble_row", min=0.0)
+merge_model.set_param_hint("Tequal_fail", min=0.0)
+merge_model.set_param_hint("Trow_left", min=0.0)
+merge_model.set_param_hint("Trow_right", min=0.0)
+
+def extract_info_from_line(line):
+    """Parse one comma-separated line into a list of floats."""
+    return [float(field) for field in line.split(",")]
+
+
+if __name__ == '__main__':
+    # Defaults. They used to be forced by appending
+    # "-i merge.prep.1 -o merge.model" to sys.argv, which made those values
+    # win over any -i / -o actually given on the command line.
+    file_name = "merge.prep.1"
+    out_file_name = "merge.model"
+    output_fit_res = True
+
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], "i:o:")
+    except getopt.GetoptError:
+        # getopt raises for any option other than -i / -o.
+        print("wrong arg")
+        sys.exit(1)
+    for op, value in opts:
+        if "-i" == op:
+            file_name = value
+        elif "-o" == op:
+            out_file_name = value
+
+    arg_sets = []
+    times = []
+    # The with-block closes the input even when a line fails to parse.
+    with open(file_name, "r") as data_file:
+        for line in data_file:
+            # Skip comment lines and blank lines (float('') would raise).
+            if line.startswith('#') or not line.strip():
+                continue
+            case_param = extract_info_from_line(line)
+            arg_sets.append((case_param[6],     #Nrow_res
+                             case_param[0],     #Nrow_left
+                             case_param[1],     #Nrow_right
+                             case_param[-3],    #Nright_cache_in
+                             case_param[-2],    #Nright_cache_out
+                             case_param[-1],    #Nright_cache_clear
+                             case_param[8]      #Nequal_cond
+                             ))
+            # Column 7 is the value being fitted.
+            times.append(case_param[7])
+    arg_sets_np = np.array(arg_sets)
+    times_np = np.array(times)
+    result = merge_model.fit(times_np, arg_sets=arg_sets_np,
+                             Tstartup=0.1,
+                             Tres_right_op=0.1,
+                             Tres_right_cache=0.1,
+                             Tmatch_group=1.0,
+                             #Tassemble_row=0.5,
+                             Tequal_fail=1.0,
+                             Trow_left=0.05,
+                             Trow_right=0.05
+                            )
+
+    # Output format: the fitted coefficients, comma-separated, in this order.
+    res_line = ",".join([str(result.best_values["Tstartup"]),
+                         str(result.best_values["Tres_right_op"]),
+                         str(result.best_values["Tres_right_cache"]),
+                         str(result.best_values["Tmatch_group"]),
+                         str(result.best_values["Tequal_fail"]),
+                         str(result.best_values["Trow_left"]),
+                         str(result.best_values["Trow_right"])])
+
+    print(result.fit_report())
+
+    if output_fit_res:
+        with open(out_file_name, "w") as out_file:
+            out_file.write(res_line)
--- a/unittest/sql/optimizer/cost_model_utils/fit_mergegroupby.py
+++ b/unittest/sql/optimizer/cost_model_utils/fit_mergegroupby.py
@ -0,0 +1,166 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+
+
+import math
+import numpy as np
+from scipy.optimize import leastsq
+from scipy.optimize import curve_fit
+import sys
+from lmfit import Model
+import getopt
+
+
+
+def mg_model_form(args,
+                  #Tstartup,
+                  Trow_once,
+                  Tres_once,
+                  Taggr_prepare_result,
+                  Taggr_process,
+                  Tgroup_cmp_col,
+                  Tcopy_col
+                  ):
+    """Return the modelled merge-group-by cost for one measured case.
+
+    args unpacks to (Nrow_input, Nrow_res, Ncol_input, Ncol_aggr, Ncol_group);
+    the remaining arguments are the coefficients being fitted.
+    """
+    Nrow_input, Nrow_res, Ncol_input, Ncol_aggr, Ncol_group = args
+
+    # Fixed charge per result row and per input row.
+    per_row = Nrow_res * Tres_once + Nrow_input * Trow_once
+    # Cost for judging the group: one comparison charge per result row, and
+    # Ncol_group charges for each of the remaining input rows.
+    judge_first = Nrow_res * Tgroup_cmp_col
+    judge_rest = (Nrow_input - Nrow_res) * Ncol_group * Tgroup_cmp_col
+    # Cost for group related operations.
+    copy_cols = Nrow_res * (Ncol_input * Tcopy_col)
+    prepare_result = Nrow_res * (Ncol_aggr * Taggr_prepare_result)
+    # Cost for processing each input row.
+    process_rows = Nrow_input * (Ncol_aggr * Taggr_process)
+
+    return (per_row + judge_first + judge_rest + copy_cols
+            + prepare_result + process_rows)
+
+
+
+
+# Number of times mg_model_arr has been evaluated.
+eval_count = 0
+
+
+def mg_model_arr(arg_sets,
+                 #Tstartup,
+                 Trow_once,
+                 Tres_once,
+                 Taggr_prepare_result,
+                 Taggr_process,
+                 Tgroup_cmp_col,
+                 Tcopy_col
+                 ):
+    """Evaluate mg_model_form for every case in arg_sets.
+
+    Returns a numpy array of modelled costs in input order. Each call also
+    increments the module-level eval_count and prints it.
+    """
+    global eval_count
+    costs = []
+    for one_case in arg_sets:
+        costs.append(mg_model_form(one_case,
+                                   Trow_once,
+                                   Tres_once,
+                                   Taggr_prepare_result,
+                                   Taggr_process,
+                                   Tgroup_cmp_col,
+                                   Tcopy_col))
+    eval_count += 1
+    print("eval "+ str(eval_count))
+    return np.array(costs)
+
+# Wrap the array-valued cost function in an lmfit Model so it can be fitted.
+mg_model = Model(mg_model_arr)
+#mg_model.set_param_hint("Tstartup", min=0.0)
+# Every coefficient is constrained to be non-negative during the fit.
+mg_model.set_param_hint("Trow_once", min=0.0)
+mg_model.set_param_hint("Tres_once", min=0.0)
+mg_model.set_param_hint("Taggr_prepare_result", min=0.0)
+mg_model.set_param_hint("Taggr_process", min=0.0)
+mg_model.set_param_hint("Tgroup_cmp_col", min=0.0)
+mg_model.set_param_hint("Tcopy_col", min=0.0)
+def extract_info_from_line(line):
+    """Parse one comma-separated line into a list of floats."""
+    return [float(field) for field in line.split(",")]
+
+
+if __name__ == '__main__':
+    # Defaults used when -i / -o are not given on the command line.
+    file_name = "mergegroupby_result_final"
+    out_file_name = "mergegroupby_model"
+
+    # This script always writes the fitted coefficients.
+    output_fit_res = True
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], "i:o:")
+    except getopt.GetoptError:
+        # getopt raises for any option other than -i / -o.
+        print("wrong arg")
+        sys.exit(1)
+    for op, value in opts:
+        if "-i" == op:
+            file_name = value
+        elif "-o" == op:
+            out_file_name = value
+
+    arg_sets = []
+    times = []
+    # The with-block closes the input even when a line fails to parse.
+    with open(file_name, "r") as data_file:
+        for line in data_file:
+            # Skip comment lines and blank lines (float('') would raise).
+            if line.startswith('#') or not line.strip():
+                continue
+            case_param = extract_info_from_line(line)
+            arg_sets.append((case_param[0],     # Nrow_input
+                             case_param[5],     # Nrow_res
+                             case_param[4],     # Ncol_input
+                             case_param[2],     # Ncol_aggr
+                             case_param[3]      # Ncol_group
+                             ))
+            # Column 6 is the value being fitted.
+            times.append(case_param[6])
+    arg_sets_np = np.array(arg_sets)
+    times_np = np.array(times)
+    result = mg_model.fit(times_np, arg_sets=arg_sets_np,
+                         #Tstartup = 0.1,
+                         Trow_once = 0.1,
+                         Tres_once = 0.1,
+                         Taggr_prepare_result = 0.1,
+                         Taggr_process = 0.1,
+                         Tgroup_cmp_col = 0.1,
+                         Tcopy_col = 0.1
+                         )
+
+    # Output format: the fitted coefficients, comma-separated, in this order.
+    res_line = ",".join([str(result.best_values["Trow_once"]),
+                         str(result.best_values["Tres_once"]),
+                         str(result.best_values["Taggr_prepare_result"]),
+                         str(result.best_values["Taggr_process"]),
+                         str(result.best_values["Tgroup_cmp_col"]),
+                         str(result.best_values["Tcopy_col"])])
+
+    print(result.fit_report())
+
+    if output_fit_res:
+        with open(out_file_name, "w") as out_file:
+            out_file.write(res_line)
--- a/unittest/sql/optimizer/cost_model_utils/fit_mg.py
+++ b/unittest/sql/optimizer/cost_model_utils/fit_mg.py
@ -0,0 +1,164 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+
+
+import math
+import numpy as np
+from scipy.optimize import leastsq
+from scipy.optimize import curve_fit
+import sys
+from lmfit import Model
+import getopt
+
+
+
+def mg_model_form(args,
+                  #Tstartup,
+                  Trow_once,
+                  Tres_once,
+                  Taggr_prepare_result,
+                  Taggr_process,
+                  Tgroup_cmp_col,
+                  Tcopy_col
+                  ):
+    """Return the modelled merge-group-by cost for one measured case.
+
+    args unpacks to (Nrow_input, Nrow_res, Ncol_input, Ncol_aggr, Ncol_group);
+    the remaining arguments are the coefficients being fitted.
+    """
+    Nrow_input, Nrow_res, Ncol_input, Ncol_aggr, Ncol_group = args
+
+    # Fixed charge per result row and per input row.
+    per_row = Nrow_res * Tres_once + Nrow_input * Trow_once
+    # Cost for judging the group: one comparison charge per result row, and
+    # Ncol_group charges for each of the remaining input rows.
+    judge_first = Nrow_res * Tgroup_cmp_col
+    judge_rest = (Nrow_input - Nrow_res) * Ncol_group * Tgroup_cmp_col
+    # Cost for group related operations.
+    copy_cols = Nrow_res * (Ncol_input * Tcopy_col)
+    prepare_result = Nrow_res * (Ncol_aggr * Taggr_prepare_result)
+    # Cost for processing each input row.
+    process_rows = Nrow_input * (Ncol_aggr * Taggr_process)
+
+    return (per_row + judge_first + judge_rest + copy_cols
+            + prepare_result + process_rows)
+
+
+
+
+# Number of times mg_model_arr has been evaluated.
+eval_count = 0
+
+
+def mg_model_arr(arg_sets,
+                 #Tstartup,
+                 Trow_once,
+                 Tres_once,
+                 Taggr_prepare_result,
+                 Taggr_process,
+                 Tgroup_cmp_col,
+                 Tcopy_col
+                 ):
+    """Evaluate mg_model_form for every case in arg_sets.
+
+    Returns a numpy array of modelled costs in input order. Each call also
+    increments the module-level eval_count and prints it.
+    """
+    global eval_count
+    costs = []
+    for one_case in arg_sets:
+        costs.append(mg_model_form(one_case,
+                                   Trow_once,
+                                   Tres_once,
+                                   Taggr_prepare_result,
+                                   Taggr_process,
+                                   Tgroup_cmp_col,
+                                   Tcopy_col))
+    eval_count += 1
+    print("eval "+ str(eval_count))
+    return np.array(costs)
+
+# Wrap the array-valued cost function in an lmfit Model so it can be fitted.
+mg_model = Model(mg_model_arr)
+#mg_model.set_param_hint("Tstartup", min=0.0)
+# Every coefficient is constrained to be non-negative during the fit.
+mg_model.set_param_hint("Trow_once", min=0.0)
+mg_model.set_param_hint("Tres_once", min=0.0)
+mg_model.set_param_hint("Taggr_prepare_result", min=0.0)
+mg_model.set_param_hint("Taggr_process", min=0.0)
+mg_model.set_param_hint("Tgroup_cmp_col", min=0.0)
+mg_model.set_param_hint("Tcopy_col", min=0.0)
+def extract_info_from_line(line):
+    """Parse one comma-separated line into a list of floats."""
+    return [float(field) for field in line.split(",")]
+
+
+if __name__ == '__main__':
+    # Defaults used when -i / -o are not given on the command line.
+    file_name = "mergegroupby_result_final"
+    out_file_name = "mergegroupby_model"
+
+    # This script always writes the fitted coefficients.
+    output_fit_res = True
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], "i:o:")
+    except getopt.GetoptError:
+        # getopt raises for any option other than -i / -o.
+        print("wrong arg")
+        sys.exit(1)
+    for op, value in opts:
+        if "-i" == op:
+            file_name = value
+        elif "-o" == op:
+            out_file_name = value
+
+    arg_sets = []
+    times = []
+    # The with-block closes the input even when a line fails to parse.
+    with open(file_name, "r") as data_file:
+        for line in data_file:
+            # Skip comment lines and blank lines; either would make the
+            # float conversion raise.
+            if line.startswith('#') or not line.strip():
+                continue
+            case_param = extract_info_from_line(line)
+            arg_sets.append((case_param[0],     # Nrow_input
+                             case_param[5],     # Nrow_res
+                             case_param[4],     # Ncol_input
+                             case_param[2],     # Ncol_aggr
+                             case_param[3]      # Ncol_group
+                             ))
+            # Column 6 is the value being fitted.
+            times.append(case_param[6])
+    arg_sets_np = np.array(arg_sets)
+    times_np = np.array(times)
+    result = mg_model.fit(times_np, arg_sets=arg_sets_np,
+                         #Tstartup = 0.1,
+                         Trow_once = 0.1,
+                         Tres_once = 0.1,
+                         Taggr_prepare_result = 0.1,
+                         Taggr_process = 0.1,
+                         Tgroup_cmp_col = 0.1,
+                         Tcopy_col = 0.1
+                         )
+
+    # Output format: the fitted coefficients, comma-separated, in this order.
+    res_line = ",".join([str(result.best_values["Trow_once"]),
+                         str(result.best_values["Tres_once"]),
+                         str(result.best_values["Taggr_prepare_result"]),
+                         str(result.best_values["Taggr_process"]),
+                         str(result.best_values["Tgroup_cmp_col"]),
+                         str(result.best_values["Tcopy_col"])])
+
+    print(result.fit_report())
+
+    if output_fit_res:
+        with open(out_file_name, "w") as out_file:
+            out_file.write(res_line)
--- a/unittest/sql/optimizer/cost_model_utils/fit_miss.py
+++ b/unittest/sql/optimizer/cost_model_utils/fit_miss.py
@ -0,0 +1,140 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+
+
+import math
+import numpy as np
+from scipy.optimize import leastsq
+from scipy.optimize import curve_fit
+import sys
+from lmfit import Model
+import getopt
+
+
+def get_row_size(col):
+    """Return the row-size estimate for a row of col columns.
+
+    The estimate is a fixed 16, plus col times the sum
+    3 + 8 + 4 + 8 + 16 + 32 + 64 + 128, plus col.
+    """
+    per_col = 3 + 8 + 4 + 8 + 16 + 32 + 64 + 128
+    return 16 + col * per_col + col
+
+def round_wasted_spave(rsize, psize):
+    """Return rsize plus an equal share of the space left over in psize.
+
+    floor(psize / rsize) units of rsize fit into psize; whatever remains is
+    divided evenly among those units and added to rsize.
+    NOTE(review): raises ZeroDivisionError when rsize is larger than psize.
+    """
+    fit_count = math.floor(float(psize / rsize))
+    leftover = psize - fit_count * rsize
+    return rsize + leftover / fit_count
+
+
+
+def get_miss_prob(Nrow, Ncol, Turn):
+    """Return the modelled miss probability, 1 - hit.
+
+    hit is Turn divided by the total size Nrow * get_row_size(Ncol), capped
+    at 0.9 once Turn reaches 90% of that total size.
+    """
+    total_size = Nrow * get_row_size(Ncol)
+    # The cap also covers total_size == 0, so the division is never by zero
+    # for a non-negative Turn.
+    hit = 0.9 if Turn >= 0.9 * total_size else Turn / total_size
+    return 1 - hit
+
+def sort_model_form(args, Tmiss, Turn):
+    """Return the modelled miss cost for one (Nrow, Ncol) case.
+
+    The cost is Nrow * Tmiss * Ncol scaled by get_miss_prob(Nrow, Ncol, Turn).
+    """
+    Nrow, Ncol = args
+    base_cost = 0
+    return base_cost + Nrow * Tmiss * Ncol * get_miss_prob(Nrow, Ncol, Turn)
+
+def sort_model_arr(arg_sets, Tmiss, Turn):
+    """Evaluate sort_model_form for every case in arg_sets.
+
+    Returns a numpy array holding one modelled cost per case, in input order.
+    """
+    costs = [sort_model_form(one_case, Tmiss, Turn) for one_case in arg_sets]
+    return np.array(costs)
+
+# Wrap the array-valued cost function in an lmfit Model so it can be fitted.
+sort_model = Model(sort_model_arr)
+sort_model.set_param_hint("Tmiss", min=0.0)
+# Turn is boxed into [2097152, 2097153] (2097152 = 2 * 1024 * 1024), which
+# leaves the fit almost no freedom to move it.
+sort_model.set_param_hint("Turn", min=2097152.0, max=2097153.0)
+
+# sort_model.set_param_hint("Tmiss_K2", min=0.0)
+
+def extract_info_from_line(line):
+    """Parse one comma-separated line into a list of floats."""
+    return [float(field) for field in line.split(",")]
+
+
+if __name__ == '__main__':
+    # Defaults used when -i / -o are not given on the command line.
+    file_name = "miss.prep.1"
+    out_file_name = "miss.model"
+
+    # The model file is only written when -o is given.
+    output_fit_res = False
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], "i:o:R:C:")
+    except getopt.GetoptError:
+        # getopt raises for any option other than -i / -o / -R / -C.
+        print("wrong arg")
+        sys.exit(1)
+    for op, value in opts:
+        if "-i" == op:
+            file_name = value
+        elif "-o" == op:
+            output_fit_res = True
+            out_file_name = value
+        elif "-R" == op:
+            # Accepted and stored, but nothing in this script reads it.
+            MATERIAL_ROW_ONCE = float(value)
+        elif "-C" == op:
+            # Accepted and stored, but nothing in this script reads it.
+            MATERIAL_ROW_COL = float(value)
+
+    arg_sets = []
+    times = []
+    # The with-block closes the input even when a line fails to parse.
+    with open(file_name, "r") as data_file:
+        for line in data_file:
+            # Skip comment lines and blank lines (float('') would raise).
+            if line.startswith('#') or not line.strip():
+                continue
+            case_param = extract_info_from_line(line)
+            # Columns 0 and 1 feed the model as (Nrow, Ncol); column 3 is
+            # the value being fitted.
+            arg_sets.append((case_param[0], case_param[1]))
+            times.append(case_param[3])
+    arg_sets_np = np.array(arg_sets)
+    times_np = np.array(times)
+    result = sort_model.fit(times_np, arg_sets=arg_sets_np,
+                            Tmiss=1.0,
+                            Turn=2097152,
+                            )
+
+    Tmiss = result.best_values["Tmiss"]
+    Turn = result.best_values["Turn"]
+    # Output format: "Tmiss,Turn" on a single line.
+    res_line = ",".join([str(Tmiss), str(Turn)])
+
+    print(result.fit_report())
+
+    if output_fit_res:
+        with open(out_file_name, "w") as out_file:
+            out_file.write(res_line)
+
+    # Print each case next to its measured value and the fitted model's value.
+    for case_args, measured in zip(arg_sets, times):
+        cost = sort_model_form(case_args, Tmiss, Turn)
+        print("\t".join([str(case_args), str(measured), str(cost)]))
--- a/unittest/sql/optimizer/cost_model_utils/fit_nl.py
+++ b/unittest/sql/optimizer/cost_model_utils/fit_nl.py
@ -0,0 +1,147 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+
+
+import math
+import numpy as np
+from scipy.optimize import leastsq
+from scipy.optimize import curve_fit
+import sys
+from lmfit import Model
+import getopt
+
+
+
+def nl_model_form(args,
+                  Tstartup,
+                  #Tqual,
+                  Tres,
+                  Tfail,
+                  Tleft_row,
+                  Tright_row
+                  ):
+    """Return the modelled nested-loop join cost for one measured case.
+
+    args unpacks to seven counters (Nrow_res, Nrow_left, Nrow_right,
+    Nright_cache_in, Nright_cache_out, Nright_cache_clear, Nequal_cond). The
+    three Nright_cache_* counters are unpacked but not used by this formula.
+    The remaining arguments are the coefficients being fitted.
+    """
+    (Nrow_res, Nrow_left, Nrow_right,
+     Nright_cache_in, Nright_cache_out, Nright_cache_clear,
+     Nequal_cond) = args
+
+    result_rows = Nrow_res * Tres
+    # Comparisons that did not produce a result row are charged Tfail.
+    failed_compares = (Nequal_cond - Nrow_res) * Tfail
+    left_rows = Nrow_left * Tleft_row
+    right_rows = Nrow_right * Tright_row
+
+    return Tstartup + result_rows + failed_compares + left_rows + right_rows
+
+# Number of times nl_model_arr has been evaluated.
+eval_count = 0
+
+def nl_model_arr(arg_sets,
+                 Tstartup,
+                 #Tqual,
+                 Tres,
+                 Tfail,
+                 Tleft_row,
+                 Tright_row
+                 ):
+    """Evaluate nl_model_form for every case in arg_sets.
+
+    Returns a numpy array of modelled costs in input order and increments the
+    module-level eval_count once per call.
+    """
+    global eval_count
+    costs = []
+    for one_case in arg_sets:
+        costs.append(nl_model_form(one_case,
+                                   Tstartup,
+                                   Tres,
+                                   Tfail,
+                                   Tleft_row,
+                                   Tright_row))
+    eval_count += 1
+    return np.array(costs)
+
+
+# Wrap the array-valued cost function in an lmfit Model so it can be fitted.
+nl_model = Model(nl_model_arr)
+# Tstartup is bounded to [0, 50]; the other coefficients only have a lower
+# bound of zero.
+nl_model.set_param_hint("Tstartup", min=0.0, max = 50)
+#nl_model.set_param_hint("Tqual", min=0.0)
+nl_model.set_param_hint("Tres", min=0.0)
+nl_model.set_param_hint("Tfail", min=0.0)
+nl_model.set_param_hint("Tleft_row", min=0.0)
+nl_model.set_param_hint("Tright_row", min=0.0)
+
+
+def extract_info_from_line(line):
+    """Parse one comma-separated line into a list of floats."""
+    return [float(field) for field in line.split(",")]
+
+
+if __name__ == '__main__':
+    # Defaults. They used to be forced by appending "-i nl.prep -o nl.model"
+    # to sys.argv, which made those values win over any -i / -o actually
+    # given on the command line.
+    file_name = "nl.prep"
+    out_file_name = "nl.model"
+    output_fit_res = True
+
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], "i:o:")
+    except getopt.GetoptError:
+        # getopt raises for any option other than -i / -o.
+        print("wrong arg")
+        sys.exit(1)
+    for op, value in opts:
+        if "-i" == op:
+            file_name = value
+        elif "-o" == op:
+            out_file_name = value
+
+    arg_sets = []
+    times = []
+    # The with-block closes the input even when a line fails to parse.
+    with open(file_name, "r") as data_file:
+        for line in data_file:
+            # Skip comment lines and blank lines (float('') would raise).
+            if line.startswith('#') or not line.strip():
+                continue
+            case_param = extract_info_from_line(line)
+            arg_sets.append((case_param[6],     #Nrow_res
+                             case_param[0],     #Nrow_left
+                             case_param[1],     #Nrow_right
+                             case_param[-3],    #Nright_cache_in
+                             case_param[-2],    #Nright_cache_out
+                             case_param[-1],    #Nright_cache_clear
+                             case_param[8]      #Nequal_cond
+                             ))
+            # Column 7 is the value being fitted.
+            times.append(case_param[7])
+    arg_sets_np = np.array(arg_sets)
+    times_np = np.array(times)
+    result = nl_model.fit(times_np, arg_sets=arg_sets_np,
+                             Tstartup=50.0,
+                             #Tqual=0.1,
+                             Tres=0.3,
+                             Tfail=0.3,
+                             Tleft_row=0.3,
+                             Tright_row=0.3
+                            )
+
+    # Output format: the fitted coefficients, comma-separated, in this order.
+    res_line = ",".join([str(result.best_values["Tstartup"]),
+                         str(result.best_values["Tres"]),
+                         str(result.best_values["Tfail"]),
+                         str(result.best_values["Tleft_row"]),
+                         str(result.best_values["Tright_row"])])
+
+    print(result.fit_report())
+
+    if output_fit_res:
+        with open(out_file_name, "w") as out_file:
+            out_file.write(res_line)
--- a/unittest/sql/optimizer/cost_model_utils/fit_rowstore_master.py
+++ b/unittest/sql/optimizer/cost_model_utils/fit_rowstore_master.py
@ -0,0 +1,60 @@
+import math
+import numpy as np
+from scipy.optimize import leastsq
+from scipy.optimize import curve_fit
+import sys
+from lmfit import Model
+import getopt
+import subprocess
+import os
+import re
+
+
+# Maps the short tag used in the rowstore.* file names to the label printed
+# with that type's fitted coefficients.
+types_to_test = {'bigint':'bigint', 'double':'double', 'float':'float', 'timestamp':'timestamp', 'number':'number(20,3)','v1':'varchar(1)','v32':'varchar(32)', 'v64':'varchar(64)', 'v128':'varchar(128)'}
+
+def run_cmd(cmd):
+    """Run cmd through the shell and return its output, stripped.
+
+    stderr is merged into stdout, the output is read line by line until the
+    stream ends, and the function waits for the process before returning.
+    """
+    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    collected = []
+    line = proc.stdout.readline()
+    # readline() returns an empty string once the stream is exhausted.
+    while line:
+        collected.append(line)
+        sys.stdout.flush()
+        line = proc.stdout.readline()
+    proc.wait()
+    return ''.join(collected).strip()
+
+def rm_if_exist(filename):
+    """Delete filename if it exists; do nothing otherwise."""
+    if not os.path.exists(filename):
+        return
+    os.remove(filename)
+
+def extract_kv(k, src):
+    """Return the first numeric value reported for key k in src.
+
+    Looks for "<k>:" followed by optional whitespace and a run of the
+    characters digits, '.', 'e', '+', '-', and returns that run as a float.
+
+    Raises IndexError if src contains no such entry.
+    """
+    # Escape k so regex metacharacters in a key match literally, and capture
+    # the number directly: splitting the match on whitespace failed whenever
+    # no space followed the colon, although the pattern allows that.
+    mat = re.compile(re.escape(k) + r':\s*([\d\.e\-\+]+)')
+    found = mat.search(src)
+    if found is None:
+        raise IndexError("no value found for %r" % (k,))
+    return float(found.group(1))
+
+# For every type tag, in sorted order: preprocess the raw results, fit the
+# material model, apply it, then print the two fitted coefficients that are
+# parsed back out of the fit script's output.
+for type_tag, type_label in sorted(types_to_test.items()):
+    result_file_name = 'rowstore.result.' + type_tag
+    prep_file_name = 'rowstore.prep.' + type_tag
+    model_file = 'rowstore.model.' + type_tag
+    fit_file = 'rowstore.fit.' + type_tag
+    rm_if_exist(prep_file_name)
+    run_cmd("./preprocess.py -i %s -o %s -t 7 -C 3 -d" % (result_file_name, prep_file_name))
+    fitres = run_cmd("./fit_material.py -i " + prep_file_name + " -o " + model_file)
+    run_cmd("./apply_material_model.py -i %s -o %s -m %s" % (prep_file_name, fit_file, model_file))
+    Trow_col = extract_kv('Trow_col', fitres)
+    Trow_once = extract_kv('Trow_once', fitres)
+    print(type_label + ":")
+    print("  " + str(Trow_col))
+    print("  " + str(Trow_once))
+
+
+
+
+
+
+
--- a/unittest/sql/optimizer/cost_model_utils/fit_sort.py
+++ b/unittest/sql/optimizer/cost_model_utils/fit_sort.py
@ -0,0 +1,268 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+
+
+import math
+import numpy as np
+from scipy.optimize import leastsq
+from scipy.optimize import curve_fit
+import sys
+from lmfit import Model
+import getopt
+import os
+
+
+# Default material-operator cost coefficients read by material_model_form().
+# The -C and -R command-line options (parsed under __main__) overwrite
+# MATERIAL_ROW_COL and MATERIAL_ROW_ONCE respectively.
+MATERIAL_ROW_COL = 0.02674675
+MATERIAL_ROW_ONCE = 0.07931677
+# NOTE(review): RESERVE_CELL is not referenced by the active code of this script.
+RESERVE_CELL = 0.0044
+
+def material_model_form(args):
+    """Cost of materializing Nrow rows of Ncol columns.
+
+    *args* is an (Nrow, Ncol) pair. The per-row and per-column coefficients
+    come from the module-level MATERIAL_ROW_ONCE and MATERIAL_ROW_COL.
+    """
+    Nrow, Ncol = args
+    per_row_cost = MATERIAL_ROW_ONCE + Ncol * MATERIAL_ROW_COL
+    total_cost = 0  # no startup term
+    total_cost += Nrow * per_row_cost
+    return total_cost
+
+def array_model_form(args):
+    """Estimate the cost of pushing *args* elements into a growing array.
+
+    The model charges a fixed cost per pushed element plus a copy cost of
+    ELEM_PER_PAGE * (2 ** extend_cnt - 1) elements, where extend_cnt is
+    ceil(log2(Nelem / ELEM_PER_PAGE)) clamped at zero.
+
+    *args* is the element count itself (a plain number, not a tuple).
+    A count of zero or less costs nothing; previously math.log() raised
+    ValueError for such input.
+    """
+    Telem_ence = 0.00898860   # cost charged per pushed element
+    Telem_copy = 0.00631888   # cost charged per copied element
+    ELEM_PER_PAGE = 1024
+
+    Nelem = args
+    if Nelem <= 0:
+        return 0.0
+
+    extend_cnt = max(0, math.ceil(math.log(float(Nelem) / ELEM_PER_PAGE, 2)))
+    copy_cnt = ELEM_PER_PAGE * (math.pow(2, extend_cnt) - 1)
+    #total_cost += Tmem_alloc * extend_cnt
+    return Telem_ence * Nelem + Telem_copy * copy_cnt
+
+def get_row_size(reserve, col):
+    """Estimate the size of one stored row (presumably in bytes).
+
+    Starts from a fixed 16, adds 16 per *reserve*, then adds col / 8 times
+    the group total (3 + 8 + 4 + 8 + 16 + 32 + 64 + 128) plus col / 8 itself.
+    """
+    groups = col / 8
+    per_group = 3 + 8 + 4 + 8 + 16 + 32 + 64 + 128
+    fixed_part = 16 + reserve * 16
+    return fixed_part + groups * per_group + groups
+
+def round_wasted_spave(rsize, psize):
+    """Return *rsize* plus an even share of the space left over in *psize*.
+
+    The number of whole rows of size *rsize* that fit into *psize* is
+    computed and the unused remainder is spread evenly over those rows.
+    """
+    rows_that_fit = math.floor(float(psize / rsize))
+    leftover = psize - rows_that_fit * rsize
+    return rsize + leftover / rows_that_fit
+
+
+
+def get_miss_prob(Nrow, Ncol, Nord, Turn):
+    """Return the modelled miss probability, 1 - hit.
+
+    The hit ratio is Turn divided by the total data size
+    (Nrow * get_row_size(Nord, Ncol)), fixed at 0.9 once Turn reaches at
+    least 90% of that size.
+    """
+    total_size = Nrow * get_row_size(Nord, Ncol)
+    if Turn >= 0.9 * total_size:
+        return 1 - 0.9
+    return 1 - Turn / total_size
+
+
+
+def sort_model_form(args, Tcompare, Tmiss_K1, Turn):
+    """Modelled sort cost for one (Nrow, Ncol, Nordering) case.
+
+    Only the comparison phase is part of the model: the cost is
+    Nrow * log2(Nrow) times a per-comparison cost made of Tcompare (charged
+    for at most one ordering column) plus Tmiss_K1 weighted by
+    get_miss_prob(). Row-store, array-push and row-fetch terms are not
+    included in this fit.
+    """
+    Nrow, Ncol, Nordering = args
+
+    # Charge the comparison for one ordering column at most.
+    charged_orderings = 1 if Nordering >= 1 else Nordering
+    miss_prob = get_miss_prob(Nrow, Ncol, Nordering, Turn)
+    compare_cost = Tcompare * charged_orderings + Tmiss_K1 * miss_prob
+
+    total_cost = 0  # no startup term
+    total_cost += Nrow * compare_cost * math.log(Nrow, 2)
+    return total_cost
+
+def sort_model_arr(arg_sets, Tcompare, Tmiss_K1, Turn):
+    """Evaluate sort_model_form() for every entry of *arg_sets*.
+
+    This is the function wrapped by lmfit's Model, so the names Tcompare,
+    Tmiss_K1 and Turn are the fit parameter names. Returns a numpy array
+    holding one cost per argument set.
+    """
+    return np.array([sort_model_form(case, Tcompare, Tmiss_K1, Turn)
+                     for case in arg_sets])
+
+# Wrap the vectorised model function for lmfit and bound the fitted parameters.
+sort_model = Model(sort_model_arr)
+# #sort_model.set_param_hint("Tstartup", min=0.0)
+# #sort_model.set_param_hint("Trow_startup", min=0.0)
+# sort_model.set_param_hint("Trow_col", min=0.0)
+# #sort_model.set_param_hint("Tcmp_startup", min=0.0)
+# sort_model.set_param_hint("Trow_once", min=0.0)
+# sort_model.set_param_hint("Tcompare", min=0.0)
+# sort_model.set_param_hint("Talloc", min=0.0)
+# sort_model.set_param_hint("Treserve_cell", min=0.0)
+
+# sort_model.set_param_hint("Tstartup", min=0)
+# sort_model.set_param_hint("Trowstore_once", min=0.0)
+# sort_model.set_param_hint("Trowstore_col", min=0.0)
+# sort_model.set_param_hint("Tarray_once", min=0.0)
+# sort_model.set_param_hint("Tarray_elem_copy", min=0.0)
+# sort_model.set_param_hint("Tordercol", min=0.0)
+# sort_model.set_param_hint("Treserve_cell", min=0.0)
+# Both active cost coefficients are constrained to be non-negative.
+sort_model.set_param_hint("Tcompare", min=0.0)
+# sort_model.set_param_hint("Trow_once", min=0.0)
+sort_model.set_param_hint("Tmiss_K1", min=0.0)
+# The one-unit-wide window effectively pins Turn at 2097152 (2 * 1024 * 1024).
+sort_model.set_param_hint("Turn", min=2097152.0, max=2097153.0)
+
+# sort_model.set_param_hint("Tmiss_K2", min=0.0)
+
+def extract_info_from_line(line):
+    """Parse a comma-separated line of numbers into a list of floats."""
+    return [float(field) for field in line.split(",")]
+
+
+if __name__ == '__main__':
+    #file_name = "scan_model.res.formal.prep"
+    #out_file_name = "scan_model.fit"
+    # Defaults used when -i / -o are not given on the command line.
+    file_name = "sort_result_final"
+    out_file_name = "sort_model"
+
+    # Drop a stale model file under the default name.
+    # NOTE(review): this runs before -o is parsed, so only the default name is cleared.
+    if os.path.exists(out_file_name):
+        os.remove(out_file_name)
+    # sys.argv.extend("-i sort.prep.bigint -o sort.model".split(" "))
+
+    output_fit_res = True
+    wrong_arg = False
+    # -i input data file, -o output model file,
+    # -R / -C override MATERIAL_ROW_ONCE / MATERIAL_ROW_COL.
+    opts,args = getopt.getopt(sys.argv[1:],"i:o:R:C:")
+    for op, value in opts:
+        if "-i" == op:
+            file_name = value
+        elif "-o" == op:
+            output_fit_res = True
+            out_file_name = value
+        elif "-R" == op:
+            MATERIAL_ROW_ONCE = float(value)
+        elif "-C" == op:
+            MATERIAL_ROW_COL = float(value)
+        else:
+            wrong_arg = True
+
+    if wrong_arg:
+        print "wrong arg"
+        sys.exit(1)
+
+    # Load the measurements; lines starting with '#' are skipped.
+    file = open(file_name, "r")
+    arg_sets = []
+    times = []
+    case_params = []
+    for line in file:
+        if line.startswith('#'):
+            continue
+        case_param = extract_info_from_line(line)
+        case_params.append(case_param)
+        # Columns 0-2 are the model arguments (unpacked by sort_model_form as
+        # Nrow, Ncol, Nordering); column 4 is the value being fitted.
+        arg_sets.append((case_param[0], case_param[1], case_param[2]))
+        times.append(case_param[4])
+    file.close()
+    arg_sets_np = np.array(arg_sets)
+    times_np = np.array(times)
+    #10, 0.20406430879623488, 0.016618100054245379, 14.0, 4.5, 37.0, -0.005, 0.5, -7.0
+    # Fit the model; the keyword values below are the initial guesses.
+    result = sort_model.fit(times_np, arg_sets=arg_sets_np,
+                            # Tstartup=25.0,
+                            # Trowstore_once=1.0,
+                            # Trowstore_col=1.0,
+                            # Tarray_once=1.0,
+                            # Tarray_elem_copy=1.0,
+                            # Tordercol=1.0,
+                            # Treserve_cell=1.0,
+                            Tcompare=1.0,
+                            # Trow_once=1.0,
+                            Tmiss_K1=1.0,
+                            Turn=2097152,
+                            # Tmiss_K2=1.0
+                            )
+
+    # Serialise the best-fit values as one line: Tcompare,Tmiss_K1,Turn
+    # res_line = str(result.best_values["Tstartup"]) + ","
+    # res_line += str(result.best_values["Trowstore_once"]) + ","
+    # res_line += str(result.best_values["Trowstore_col"]) + ","
+    # res_line += str(result.best_values["Tarray_once"]) + ","
+    # res_line += str(result.best_values["Tarray_elem_copy"]) + ","
+    # res_line = str(result.best_values["Tordercol"]) + ","
+    # res_line = str(result.best_values["Treserve_cell"]) + ","
+    res_line = str(result.best_values["Tcompare"]) + ","
+    # res_line += str(result.best_values["Trow_once"]) #+ ","
+    res_line += str(result.best_values["Tmiss_K1"]) + ","
+    res_line += str(result.best_values["Turn"])
+    # res_line += str(result.best_values["Tmiss_K2"])
+
+
+    print result.fit_report()
+
+    # output_fit_res defaults to True here, so the model line is always written.
+    if output_fit_res:
+        out_file = open(out_file_name, "w")
+        out_file.write(res_line)
+        out_file.close()
--- a/unittest/sql/optimizer/cost_model_utils/fit_sort_add.py
+++ b/unittest/sql/optimizer/cost_model_utils/fit_sort_add.py
@ -0,0 +1,271 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+
+
+import math
+import numpy as np
+from scipy.optimize import leastsq
+from scipy.optimize import curve_fit
+import sys
+from lmfit import Model
+import getopt
+import os
+from cost_test_conf import Config
+
+# Default material-operator cost coefficients read by material_model_form().
+# The -C and -R command-line options (parsed under __main__) overwrite
+# MATERIAL_ROW_COL and MATERIAL_ROW_ONCE respectively.
+MATERIAL_ROW_COL = 0.02674675
+MATERIAL_ROW_ONCE = 0.07931677
+# NOTE(review): RESERVE_CELL is not referenced by the active code of this script.
+RESERVE_CELL = 0.0044
+
+def material_model_form(args):
+    """Cost of materializing Nrow rows of Ncol columns.
+
+    *args* is an (Nrow, Ncol) pair. The per-row and per-column coefficients
+    come from the module-level MATERIAL_ROW_ONCE and MATERIAL_ROW_COL.
+    """
+    Nrow, Ncol = args
+    per_row_cost = MATERIAL_ROW_ONCE + Ncol * MATERIAL_ROW_COL
+    total_cost = 0  # no startup term
+    total_cost += Nrow * per_row_cost
+    return total_cost
+
+def array_model_form(args):
+    """Estimate the cost of pushing *args* elements into a growing array.
+
+    The model charges a fixed cost per pushed element plus a copy cost of
+    ELEM_PER_PAGE * (2 ** extend_cnt - 1) elements, where extend_cnt is
+    ceil(log2(Nelem / ELEM_PER_PAGE)) clamped at zero.
+
+    *args* is the element count itself (a plain number, not a tuple).
+    A count of zero or less costs nothing; previously math.log() raised
+    ValueError for such input.
+    """
+    Telem_ence = 0.00898860   # cost charged per pushed element
+    Telem_copy = 0.00631888   # cost charged per copied element
+    ELEM_PER_PAGE = 1024
+
+    Nelem = args
+    if Nelem <= 0:
+        return 0.0
+
+    extend_cnt = max(0, math.ceil(math.log(float(Nelem) / ELEM_PER_PAGE, 2)))
+    copy_cnt = ELEM_PER_PAGE * (math.pow(2, extend_cnt) - 1)
+    #total_cost += Tmem_alloc * extend_cnt
+    return Telem_ence * Nelem + Telem_copy * copy_cnt
+
+def get_row_size(reserve, col):
+    """Estimate the size of one stored row (presumably in bytes).
+
+    Starts from a fixed 16, adds 16 per *reserve*, then adds col / 8 times
+    the group total (3 + 8 + 4 + 8 + 16 + 32 + 64 + 128) plus col / 8 itself.
+    """
+    groups = col / 8
+    per_group = 3 + 8 + 4 + 8 + 16 + 32 + 64 + 128
+    fixed_part = 16 + reserve * 16
+    return fixed_part + groups * per_group + groups
+
+def round_wasted_spave(rsize, psize):
+    """Return *rsize* plus an even share of the space left over in *psize*.
+
+    The number of whole rows of size *rsize* that fit into *psize* is
+    computed and the unused remainder is spread evenly over those rows.
+    """
+    rows_that_fit = math.floor(float(psize / rsize))
+    leftover = psize - rows_that_fit * rsize
+    return rsize + leftover / rows_that_fit
+
+
+
+def get_miss_prob(Nrow, Ncol, Nord, Turn):
+    """Return the modelled miss probability, 1 - hit.
+
+    The hit ratio is Turn divided by the total data size
+    (Nrow * get_row_size(Nord, Ncol)), fixed at 0.9 once Turn reaches at
+    least 90% of that size.
+    """
+    total_size = Nrow * get_row_size(Nord, Ncol)
+    if Turn >= 0.9 * total_size:
+        return 1 - 0.9
+    return 1 - Turn / total_size
+
+
+
+def sort_model_form(args, Tcompare, Tmiss_K1, Turn):
+    """Modelled sort cost for one (Nrow, Ncol, Nordering) case.
+
+    Only the comparison phase is part of the model: the cost is
+    Nrow * log2(Nrow) times a per-comparison cost made of Tcompare (charged
+    for at most one ordering column) plus Tmiss_K1 weighted by
+    get_miss_prob(). Row-store, array-push and row-fetch terms are not
+    included in this fit.
+    """
+    Nrow, Ncol, Nordering = args
+
+    # Charge the comparison for one ordering column at most.
+    charged_orderings = 1 if Nordering >= 1 else Nordering
+    miss_prob = get_miss_prob(Nrow, Ncol, Nordering, Turn)
+    compare_cost = Tcompare * charged_orderings + Tmiss_K1 * miss_prob
+
+    total_cost = 0  # no startup term
+    total_cost += Nrow * compare_cost * math.log(Nrow, 2)
+    return total_cost
+
+def sort_model_arr(arg_sets, Tcompare, Tmiss_K1, Turn):
+    """Evaluate sort_model_form() for every entry of *arg_sets*.
+
+    This is the function wrapped by lmfit's Model, so the names Tcompare,
+    Tmiss_K1 and Turn are the fit parameter names. Returns a numpy array
+    holding one cost per argument set.
+    """
+    return np.array([sort_model_form(case, Tcompare, Tmiss_K1, Turn)
+                     for case in arg_sets])
+
+# Wrap the vectorised model function for lmfit and bound the fitted parameters.
+sort_model = Model(sort_model_arr)
+# #sort_model.set_param_hint("Tstartup", min=0.0)
+# #sort_model.set_param_hint("Trow_startup", min=0.0)
+# sort_model.set_param_hint("Trow_col", min=0.0)
+# #sort_model.set_param_hint("Tcmp_startup", min=0.0)
+# sort_model.set_param_hint("Trow_once", min=0.0)
+# sort_model.set_param_hint("Tcompare", min=0.0)
+# sort_model.set_param_hint("Talloc", min=0.0)
+# sort_model.set_param_hint("Treserve_cell", min=0.0)
+
+# sort_model.set_param_hint("Tstartup", min=0)
+# sort_model.set_param_hint("Trowstore_once", min=0.0)
+# sort_model.set_param_hint("Trowstore_col", min=0.0)
+# sort_model.set_param_hint("Tarray_once", min=0.0)
+# sort_model.set_param_hint("Tarray_elem_copy", min=0.0)
+# sort_model.set_param_hint("Tordercol", min=0.0)
+# sort_model.set_param_hint("Treserve_cell", min=0.0)
+# Both active cost coefficients are constrained to be non-negative.
+sort_model.set_param_hint("Tcompare", min=0.0)
+# sort_model.set_param_hint("Trow_once", min=0.0)
+sort_model.set_param_hint("Tmiss_K1", min=0.0)
+# The one-unit-wide window effectively pins Turn at 2097152 (2 * 1024 * 1024).
+sort_model.set_param_hint("Turn", min=2097152.0, max=2097153.0)
+
+# sort_model.set_param_hint("Tmiss_K2", min=0.0)
+
+def extract_info_from_line(line):
+    """Parse a comma-separated line of numbers into a list of floats."""
+    return [float(field) for field in line.split(",")]
+
+
+if __name__ == '__main__':
+    #file_name = "scan_model.res.formal.prep"
+    #out_file_name = "scan_model.fit"
+
+    #file_name = "scan_model.res.formal.prep"
+    #out_file_name = "scan_model.fit"
+
+    # Default file names are derived from the configured test type.
+    file_name = "sort_add_" + Config.u_to_test_type + "_result_final"
+    out_file_name = "sort_add_" + Config.u_to_test_type + "_model"
+    # sys.argv.extend("-i sort.prep.bigint -o sort.model".split(" "))
+    # Drop a stale model file under the default name.
+    # NOTE(review): this runs before -o is parsed, so only the default name is cleared.
+    if os.path.exists(out_file_name):
+        os.remove(out_file_name)
+
+    # The model file is only written when -o is supplied.
+    output_fit_res = False
+    wrong_arg = False
+    # -i input data file, -o output model file,
+    # -R / -C override MATERIAL_ROW_ONCE / MATERIAL_ROW_COL.
+    opts,args = getopt.getopt(sys.argv[1:],"i:o:R:C:")
+    for op, value in opts:
+        if "-i" == op:
+            file_name = value
+        elif "-o" == op:
+            output_fit_res = True
+            out_file_name = value
+        elif "-R" == op:
+            MATERIAL_ROW_ONCE = float(value)
+        elif "-C" == op:
+            MATERIAL_ROW_COL = float(value)
+        else:
+            wrong_arg = True
+
+    if wrong_arg:
+        print "wrong arg"
+        sys.exit(1)
+
+    # Load the measurements; lines starting with '#' are skipped.
+    file = open(file_name, "r")
+    arg_sets = []
+    times = []
+    case_params = []
+    for line in file:
+        if line.startswith('#'):
+            continue
+        case_param = extract_info_from_line(line)
+        case_params.append(case_param)
+        # Columns 0-2 are the model arguments (unpacked by sort_model_form as
+        # Nrow, Ncol, Nordering); column 4 is the value being fitted.
+        arg_sets.append((case_param[0], case_param[1], case_param[2]))
+        times.append(case_param[4])
+    file.close()
+    arg_sets_np = np.array(arg_sets)
+    times_np = np.array(times)
+    #10, 0.20406430879623488, 0.016618100054245379, 14.0, 4.5, 37.0, -0.005, 0.5, -7.0
+    # Fit the model; the keyword values below are the initial guesses.
+    result = sort_model.fit(times_np, arg_sets=arg_sets_np,
+                            # Tstartup=25.0,
+                            # Trowstore_once=1.0,
+                            # Trowstore_col=1.0,
+                            # Tarray_once=1.0,
+                            # Tarray_elem_copy=1.0,
+                            # Tordercol=1.0,
+                            # Treserve_cell=1.0,
+                            Tcompare=1.0,
+                            # Trow_once=1.0,
+                            Tmiss_K1=1.0,
+                            Turn=2097152,
+                            # Tmiss_K2=1.0
+                            )
+
+    # Serialise the best-fit values as one line: Tcompare,Tmiss_K1,Turn
+    # res_line = str(result.best_values["Tstartup"]) + ","
+    # res_line += str(result.best_values["Trowstore_once"]) + ","
+    # res_line += str(result.best_values["Trowstore_col"]) + ","
+    # res_line += str(result.best_values["Tarray_once"]) + ","
+    # res_line += str(result.best_values["Tarray_elem_copy"]) + ","
+    # res_line = str(result.best_values["Tordercol"]) + ","
+    # res_line = str(result.best_values["Treserve_cell"]) + ","
+    res_line = str(result.best_values["Tcompare"]) + ","
+    # res_line += str(result.best_values["Trow_once"]) #+ ","
+    res_line += str(result.best_values["Tmiss_K1"]) + ","
+    res_line += str(result.best_values["Turn"])
+    # res_line += str(result.best_values["Tmiss_K2"])
+
+
+    print result.fit_report()
+
+    if output_fit_res:
+        out_file = open(out_file_name, "w")
+        out_file.write(res_line)
+        out_file.close()
--- a/unittest/sql/optimizer/cost_model_utils/fit_sort_master.py
+++ b/unittest/sql/optimizer/cost_model_utils/fit_sort_master.py
@ -0,0 +1,75 @@
+import math
+import numpy as np
+from scipy.optimize import leastsq
+from scipy.optimize import curve_fit
+import sys
+from lmfit import Model
+import getopt
+import subprocess
+import os
+import re
+
+
+# Per type tag: [SQL column type, value passed to fit_sort.py as -C,
+# value passed to fit_sort.py as -R] (see the command built in the loop below).
+types_to_test = {'bigint':['bigint', 0.0266846, 0.07364082], 'double': ['double', 0.02970336, 0.07228732], 'float':['float', 0.02512819, 0.07295116], 'timestamp':['timestamp', 0.02998249, 0.07265038],
+                 'number':['number(20,3)', 0.08238981, 0.15730252], 'v32':['varchar(32)', 0.08476897, 0.07518651], 'v64':['varchar(64)', 0.13678196, 0.05033624], 'v128':['varchar(128)', 0.22601192, 2.2963e-08]}
+
+def run_cmd(cmd):
+    """Print *cmd*, run it through the shell and return its combined output.
+
+    stderr is merged into stdout. The collected text is returned with
+    leading and trailing whitespace stripped.
+    """
+    print cmd
+    child = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
+                             stderr=subprocess.STDOUT)
+    output = ''
+    # readline() yields an empty string only at end of stream.
+    for chunk in iter(child.stdout.readline, ''):
+        output += chunk
+        sys.stdout.flush()
+    child.wait()
+    return output.strip()
+
+def rm_if_exist(filename):
+    """Delete *filename* when it is present; do nothing otherwise."""
+    if not os.path.exists(filename):
+        return
+    os.remove(filename)
+
+def extract_kv(k, src):
+    """Return the first numeric value reported for key *k* in *src*.
+
+    Looks for ``<k>:`` followed by optional whitespace and a run of digits,
+    '.', 'e', '+' and '-', and returns that run converted to float.
+
+    Raises IndexError when *k* does not occur in *src* (the same exception
+    type the previous implementation raised for a missing key).
+    """
+    # Escape the key so regex metacharacters in it match literally, and
+    # capture the number directly: splitting the match on whitespace failed
+    # whenever nothing separated the colon from the value.
+    found = re.search(re.escape(k) + r':\s*([\d.e\-+]+)', src)
+    if found is None:
+        raise IndexError("key %r not found in fit output" % (k,))
+    return float(found.group(1))
+
+# For every type tag (in sorted order) that has a raw result file: preprocess
+# the data, fit the sort model and apply the fitted model, printing the output
+# of the apply step.
+for t in sorted(types_to_test.keys()):
+    result_file_name = 'sort.result.' + t
+    prep_file_name = 'sort.prep.' + t
+    model_file = 'sort.model.' + t
+    fit_file = 'sort.fit.' + t
+    # Types without benchmark data are skipped.
+    if not os.path.exists(result_file_name):
+        continue
+    rm_if_exist(prep_file_name)
+    rm_if_exist(model_file)
+    rm_if_exist(fit_file)
+    run_cmd("./preprocess.py -i %s -o %s -t 7 -C 4 -d" % (result_file_name, prep_file_name))
+    # -R receives element [2] and -C element [1] of the per-type list.
+    cmd = "./fit_sort.py -i %s -R %s -C %s -o %s" % (prep_file_name, str(types_to_test[t][2]), str(types_to_test[t][1]), model_file)
+    # run_cmd() prints the command as well, so it shows up twice in the output.
+    print cmd
+    fitres = run_cmd(cmd)
+    # print fitres
+    appres = run_cmd("./apply_sort_model.py -i %s -o %s -m %s" % (prep_file_name, fit_file, model_file))
+    print appres
+    #print fitres
+    # Treserve_cell = extract_kv('Treserve_cell', fitres)
+    # Tcompare = extract_kv('Tcompare', fitres)
+    # Tmiss_K1 = extract_kv('Tmiss_K1', fitres)
+    # Turn = extract_kv('Turn', fitres)
+    # # Trow_once = extract_kv('Trow_once', fitres)
+    # print types_to_test[t][0] + ":"
+    # # print "  Treserve_cell:\t" + str(Treserve_cell)
+    # print "  Tcompare:\t" + str(Tcompare)
+    # print "  Tmiss_K1:\t" + str(Tmiss_K1)
+    # print "  Turn:\t" + str(Turn)
+    # print "  Trow_once:\t" + str(Trow_once)
+
+
+
+
+
+
+
--- a/unittest/sql/optimizer/cost_model_utils/hash_join.py
+++ b/unittest/sql/optimizer/cost_model_utils/hash_join.py
@ -0,0 +1,155 @@
+from mylog.mylog import MyLogger
+from op_generator import op_generator
+from cost_test_conf import Config
+import subprocess as sp
+import os
+from lmfit import Model
+import numpy as np
+
+# step 1 gen op and conf
+hash_cls = op_generator.gen_operator("hash_join")
+conf = Config()
+conf.u_to_test_op_c = 'hash'
+conf.is_not_running_as_unittest_c = True
+conf.schema_file_c = 'c10k1x2.schema'
+conf.left_row_count_c = 1000
+conf.right_row_count_c = 1000
+conf.left_min_c = 1
+conf.right_min_c = 1
+conf.is_random_c = True
+conf.left_pj_c = 10
+conf.right_pj_c = 10
+hash_op = hash_cls(conf)
+# Start from an empty raw result file; every case below appends to it.
+result_file_name = "hash_join_result"
+if os.path.exists(result_file_name):
+    os.remove(result_file_name)
+
+# step 2 do bench and gen data
+
+# NOTE(review): case_run_time is assigned but not used by this script.
+case_run_time = 7
+case_count = 0
+row_count_max = 100000;
+row_count_step = 2000;
+# Number of (left, right) row-count combinations: (max / step) squared.
+total_case_count = row_count_max/row_count_step
+total_case_count *= total_case_count
+
+print "Total case count %s ..." % (total_case_count)
+for left_row_count in xrange(1000, row_count_max + 1, row_count_step):
+     for right_row_count in xrange(1000, row_count_max + 1, row_count_step):
+         case_count+=1
+         hash_op.conf.left_row_count_c = left_row_count
+         hash_op.conf.right_row_count_c = right_row_count
+         # Both sides share the same upper bound: three times the larger row count.
+         hash_op.conf.left_max_c = max(left_row_count, right_row_count) * 3
+         hash_op.conf.right_max_c = hash_op.conf.left_max_c
+         # Prefix the result line with the two row counts (no trailing newline);
+         # the benchmark command then appends its own output to the same file.
+         sp.check_call("echo -n '%s,%s,' >> %s" % (left_row_count, right_row_count, result_file_name), shell=True)
+         print "Running case %s / %s ... : %s " % (case_count, total_case_count, hash_op.get_bench_cmd())
+         print "%s >> %s" % (hash_op.get_bench_cmd(), result_file_name)
+         sp.check_call("%s >> %s" % (hash_op.get_bench_cmd(), result_file_name), shell=True)
+
+# step 3 process data
+final_file_name = "hash_join_result_final"
+if os.path.exists(final_file_name):
+    os.remove(final_file_name)
+
+data_cmd = hash_op.get_data_preprocess_cmd()
+sp.check_call(data_cmd, shell=True)
+
+# step 4 fit and output
+
+out_model_file_name = "hash_model"
+if os.path.exists(out_model_file_name):
+    os.remove(out_model_file_name)
+
+
+def hash_model_form(args, Tstart_up, Tbuild_htable, Tright_row_once,
+                    Tconvert_tuple, Tjoin_row):
+    """Linear hash-join cost for one case.
+
+    *args* is (Nres_row, Nleft_row, Nright_row, Nequal_cond). The cost is the
+    startup term plus each count multiplied by its coefficient: left rows by
+    Tbuild_htable, right rows by Tright_row_once, equal conditions by
+    Tconvert_tuple and result rows by Tjoin_row.
+    """
+    Nres_row, Nleft_row, Nright_row, Nequal_cond = args
+    return (Tstart_up
+            + Nleft_row * Tbuild_htable
+            + Nright_row * Tright_row_once
+            + Nequal_cond * Tconvert_tuple
+            + Nres_row * Tjoin_row)
+
+
+def hash_model_arr(arg_sets, Tstart_up, Tbuild_htable, Tright_row_once,
+                   Tconvert_tuple, Tjoin_row):
+    """Evaluate hash_model_form() for every entry of *arg_sets*.
+
+    This is the function wrapped by lmfit's Model, so its coefficient names
+    are the fit parameter names. Returns a numpy array with one cost per
+    argument set.
+    """
+    return np.array([hash_model_form(case, Tstart_up, Tbuild_htable,
+                                     Tright_row_once, Tconvert_tuple,
+                                     Tjoin_row)
+                     for case in arg_sets])
+
+
+def extract_info_from_line(line):
+    """Parse a comma-separated line of numbers into a list of floats."""
+    return [float(field) for field in line.split(",")]
+
+# Build the lmfit model; every coefficient is constrained to be non-negative.
+hash_model = Model(hash_model_arr)
+hash_model.set_param_hint("Tstart_up", min=0.0)
+hash_model.set_param_hint("Tbuild_htable", min=0.0)
+hash_model.set_param_hint("Tright_row_once", min=0.0)
+hash_model.set_param_hint("Tconvert_tuple", min=0.0)
+hash_model.set_param_hint("Tjoin_row", min=0.0)
+
+# Load the preprocessed measurements; lines starting with '#' are skipped.
+arg_sets = []
+times = []
+case_params = []
+with open(final_file_name, "r") as data_file:
+    for line in data_file:
+        if line.startswith('#'):
+            continue
+        case_param = extract_info_from_line(line)
+        case_params.append(case_param)
+        # Reorder the columns into (Nres_row, Nleft_row, Nright_row,
+        # Nequal_cond), the order hash_model_form() unpacks; column 4 is the
+        # value being fitted.
+        arg_sets.append((case_param[2], case_param[0], case_param[1], case_param[3]))
+        times.append(case_param[4])
+arg_sets_np = np.array(arg_sets)
+times_np = np.array(times)
+
+# The keyword names must match the parameters of hash_model_arr exactly. The
+# startup term was previously passed as "Tstartup", which is not a model
+# parameter, so Tstart_up was left without an initial value.
+result = hash_model.fit(times_np, arg_sets=arg_sets_np,
+                        Tstart_up=0.0,
+                        Tbuild_htable=0.0,
+                        Tright_row_once=0.0,
+                        Tconvert_tuple=0.0,
+                        #Tequal_cond=0.0,
+                        #Tfilter_cond=0.0,
+                        Tjoin_row=0.0)
+
+# Serialise the best-fit values as one comma-separated line.
+res_line = str(result.best_values["Tstart_up"]) + ","
+res_line += str(result.best_values["Tbuild_htable"]) + ","
+res_line += str(result.best_values["Tright_row_once"]) + ","
+res_line += str(result.best_values["Tconvert_tuple"]) + ","
+#res_line += str(result.best_values["Tequal_cond"]) + ","
+#res_line += str(result.best_values["Tfilter_cond"]) + ","
+res_line += str(result.best_values["Tjoin_row"])
+print result.fit_report()
+
+if out_model_file_name:
+    with open(out_model_file_name, "w") as out_file:
+        out_file.write(res_line)
--- a/unittest/sql/optimizer/cost_model_utils/material.py
+++ b/unittest/sql/optimizer/cost_model_utils/material.py
@ -0,0 +1,132 @@
+from mylog.mylog import MyLogger
+from op_generator import op_generator
+from cost_test_conf import Config
+import subprocess as sp
+import os
+from lmfit import Model
+import numpy as np
+
+# step 1 gen op and conf
+material_cls = op_generator.gen_operator("material")
+conf = Config()
+conf.u_to_test_op_c = 'material'
+conf.is_not_running_as_unittest_c = True
+conf.schema_file_c = 'c10k1.schema'
+conf.row_count_c = 1000
+conf.input_projector_count_c = 1
+
+material_op = material_cls(conf)
+# Start from an empty raw result file; every case below appends to it.
+result_file_name = 'material_result'
+if os.path.exists(result_file_name):
+    os.remove(result_file_name)
+
+# step 2 do_bench and gen data
+row_count_max = 1001
+row_count_step = 100
+
+column_counts = [3, 5, 8]
+
+# Each (row count, column count) combination is benchmarked this many times.
+case_run_time = 7
+
+total_case_count = (row_count_max / row_count_step + 1) * len(column_counts) * case_run_time
+case_count = 0
+
+print "Total case count %s ..." % (total_case_count)
+for row_count in xrange(1, row_count_max + 1, row_count_step):
+    for column_count in column_counts:
+        for time in xrange(case_run_time):
+            case_count += 1
+            material_op.conf.row_count_c = row_count
+            material_op.conf.input_projector_count_c = column_count
+            # Prefix the result line with row count and column count (no
+            # trailing newline); the benchmark command appends its own output.
+            sp.check_call("echo -n '%s,' >> %s" % (row_count, result_file_name), shell=True)
+            sp.check_call("echo -n '%s,' >> %s" % (column_count, result_file_name), shell=True)
+            print "Running case %s / %s ... : %s " % (case_count, total_case_count, material_op.get_bench_cmd())
+            print "%s >> %s" % (material_op.get_bench_cmd(), result_file_name)
+            sp.check_call("%s >> %s" % (material_op.get_bench_cmd(), result_file_name), shell=True)
+
+# step 3 preprocess data
+final_file_name = "material_result_final"
+# Clear the stale preprocessed file before regenerating it. The check used to
+# test the transposed name "material_final_result", so the file that is read
+# back for fitting was never removed.
+if os.path.exists(final_file_name):
+    os.remove(final_file_name)
+data_cmd = material_op.get_data_preprocess_cmd()
+sp.check_call(data_cmd, shell=True)
+
+# step 4 fit and output
+# given model form, do fit using previous result data
+# case param should be considered with cost_model_util.cpp output format
+# eg: material_test() in cost_model_util.cpp
+#     output row_count, cost_time
+out_model_file_name = "material_model"
+if os.path.exists(out_model_file_name):
+    os.remove(out_model_file_name)
+
+
+def material_model_form(args, Trow_once, Trow_col):
+    """Cost of materializing Nrow rows of Ncol columns.
+
+    *args* is an (Nrow, Ncol) pair; every row costs Trow_once plus Trow_col
+    for each of its columns. There is no startup term.
+    """
+    Nrow, Ncol = args
+    per_row_cost = Trow_once + Ncol * Trow_col
+    total_cost = 0
+    total_cost += Nrow * per_row_cost
+    return total_cost
+
+
+def material_model_arr(arg_sets, Trow_once, Trow_col):
+    """Evaluate material_model_form() for every entry of *arg_sets*.
+
+    This is the function wrapped by lmfit's Model, so Trow_once and Trow_col
+    are the fit parameter names. Returns a numpy array with one cost per
+    argument set.
+    """
+    return np.array([material_model_form(case, Trow_once, Trow_col)
+                     for case in arg_sets])
+
+
+def extract_info_from_line(line):
+    """Parse a comma-separated line of numbers into a list of floats."""
+    return [float(field) for field in line.split(",")]
+
+
+# Build the lmfit model; both coefficients are constrained to be non-negative.
+material_model = Model(material_model_arr)
+material_model.set_param_hint("Trow_once", min=0.0)
+material_model.set_param_hint("Trow_col", min=0.0)
+# Load the preprocessed measurements; lines starting with '#' are skipped.
+file = open(final_file_name, "r")
+arg_sets = []
+times = []
+case_params = []
+for line in file:
+    if line.startswith('#'):
+        continue
+    case_param = extract_info_from_line(line)
+    case_params.append(case_param)
+    # Columns 0 and 1 are the model arguments (Nrow, Ncol); column 3 is the
+    # value being fitted.
+    arg_sets.append((case_param[0], case_param[1]))
+    times.append(case_param[3])
+file.close()
+arg_sets_np = np.array(arg_sets)
+times_np = np.array(times)
+# result is the fitting result model
+# The keyword values below are the initial guesses for the fit.
+result = material_model.fit(times_np, arg_sets=arg_sets_np,
+                            # Tstartup=10.0,
+                            Trow_once=10.0,
+                            Trow_col=1.0
+                            )
+
+# Serialise the best-fit values as one line: Trow_once,Trow_col
+# res_line = str(result.best_values["Tstartup"]) + ","
+res_line = str(result.best_values["Trow_once"]) + ","
+res_line += str(result.best_values["Trow_col"])
+
+print result.fit_report()
+
+# out_model_file_name is a non-empty literal, so the model is always written.
+if out_model_file_name:
+    out_file = open(out_model_file_name, "w")
+    out_file.write(res_line)
+    out_file.close()
--- a/unittest/sql/optimizer/cost_model_utils/mylog/init.py
+++ b/unittest/sql/optimizer/cost_model_utils/mylog/init.py
--- a/unittest/sql/optimizer/cost_model_utils/mylog/mylog.py
+++ b/unittest/sql/optimizer/cost_model_utils/mylog/mylog.py
@ -0,0 +1,43 @@
+import logging
+import sys
+
+class Singleton(object):
+    """Base class that hands out a single shared instance per class.
+
+    The first construction of a class creates and caches its instance; every
+    later construction returns that same object. Constructor arguments are
+    accepted but not forwarded to object.__new__; __init__ still receives
+    them on every call.
+    """
+    def __new__(cls, *args, **kw):
+        # Look in the class's own namespace (not via hasattr, which also sees
+        # inherited attributes) so a subclass never reuses an instance that
+        # was created for one of its bases.
+        if '_instance' not in cls.__dict__:
+            # object.__new__ takes no extra arguments; forwarding them raises
+            # TypeError on Python 3.
+            cls._instance = super(Singleton, cls).__new__(cls)
+        return cls._instance
+
+
+class MyLogger(Singleton):
+    """Shared logger that writes records of level INFO and above to stdout.
+
+    The logger and its stream handler are configured once, when the class
+    body is executed; the static helpers forward to that shared logger.
+    """
+    log = logging.getLogger(__name__)
+    # Send formatted records to stdout.
+    fmt = '%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(name)s - %(message)s'
+    formatter = logging.Formatter(fmt)
+    out_hdlr = logging.StreamHandler(sys.stdout)
+    #handler = logging.handlers.RotatingFileHandler(LOG_FILE, maxBytes=1024 * 1024, backupCount=5)
+    out_hdlr.setFormatter(formatter)
+    out_hdlr.setLevel(logging.INFO)
+    log.addHandler(out_hdlr)
+    log.setLevel(logging.INFO)
+
+    @staticmethod
+    def get_logger():
+        """Return the shared logging.Logger instance."""
+        return MyLogger.log
+
+    @staticmethod
+    def info(str, *args, **kargs):
+        """Log *str* (with %-style *args*) at INFO level."""
+        MyLogger.log.info(str, *args, **kargs)
+
+    @staticmethod
+    def warn(str, *args, **kargs):
+        """Log *str* (with %-style *args*) at WARNING level."""
+        # Logger.warn is a deprecated alias; Logger.warning is the supported name.
+        MyLogger.log.warning(str, *args, **kargs)
+
+    @staticmethod
+    def error(str, *args, **kargs):
+        """Log *str* (with %-style *args*) at ERROR level."""
+        MyLogger.log.error(str, *args, **kargs)
+
+# Smoke test: emit one record at each level through both access paths.
+if __name__ == '__main__':
+    MyLogger.get_logger().info("test")
+    # Logger.warning replaces the deprecated Logger.warn alias.
+    MyLogger.get_logger().warning("test warn %s", 'test')
+    MyLogger.error("test error")
--- a/unittest/sql/optimizer/cost_model_utils/op_generator.py
+++ b/unittest/sql/optimizer/cost_model_utils/op_generator.py
@ -0,0 +1,59 @@
+from cost_test_conf import Config
+from mylog.mylog import MyLogger
+import subprocess as sp
+
+
+def init_func(self, conf):
+    """Store ``conf`` on the instance; used as ``__init__`` of generated operator classes."""
+    self.conf = conf
+
+
+def get_bench_cmd(self):
+    """Return the shell command that runs cost_model_util with this operator's parameters."""
+    return './cost_model_util ' + self.conf.gen_params()
+
+def get_data_preprocess_cmd(self):
+    """Return the preprocess.py command for this operator's result file.
+
+    Input and output file names are derived from the instance's class name
+    (``<ClassName>_result`` and ``<ClassName>_result_final``).
+    """
+    op_name = self.__class__.__name__
+    raw_file = op_name + '_result'
+    final_file = op_name + '_result_final'
+    return 'python preprocess.py -i {0} -o {1} -d'.format(raw_file, final_file)
+
+def do_bench(self):
+    """Run the benchmark command, then the preprocessing command, for this operator.
+
+    Both commands run through the shell via ``check_call``, so a non-zero
+    exit status raises ``subprocess.CalledProcessError`` and, for the first
+    command, prevents the preprocessing step from running.
+    """
+    MyLogger.info(self.conf)
+    cmd = self.get_bench_cmd()
+    MyLogger.info(cmd)
+    # NOTE(review): the command string is handed to a shell unescaped;
+    # presumably the configuration values are trusted - confirm.
+    sp.check_call(cmd, shell=True)
+    data_cmd = self.get_data_preprocess_cmd()
+    sp.check_call(data_cmd, shell=True)
+
+
+class op_generator(object):
+    """Factory that builds, and caches, one benchmark operator class per name.
+
+    When a test type is used, ``name`` is the operator name followed by the
+    test type name.
+    """
+    # Cache of generated classes, keyed by operator name.
+    op_dict = {}
+
+    @staticmethod
+    def gen_operator(name):
+        """Return the operator class for ``name``, creating it on first use.
+
+        The generated class gets ``init_func`` as its constructor plus the
+        module-level ``do_bench``, ``get_bench_cmd`` and
+        ``get_data_preprocess_cmd`` functions as methods.
+        """
+        # ``in`` replaces dict.has_key(), which newer Pythons no longer have.
+        if name in op_generator.op_dict:
+            return op_generator.op_dict[name]
+        cls = type(name, (object,), {'__init__': init_func, 'do_bench': do_bench,
+                                     'get_bench_cmd': get_bench_cmd,
+                                     'get_data_preprocess_cmd': get_data_preprocess_cmd})
+        op_generator.op_dict[name] = cls
+        return cls
+
+
+if __name__ == '__main__':
+    # Example run: build the 'material' operator class, configure it and
+    # execute its benchmark followed by preprocessing.
+    ##mat related conf
+    material_cls = op_generator.gen_operator('material')
+    conf = Config()
+    conf.u_to_test_op_c = 'material'
+    conf.is_not_running_as_unittest_c = True
+    conf.schema_file_c = 'c10k1.schema'
+    conf.row_count_c = 1000
+    conf.input_projector_count_c = 1
+
+    material_op = material_cls(conf)
+    material_op.do_bench()
--- a/unittest/sql/optimizer/cost_model_utils/plot_demension.py
+++ b/unittest/sql/optimizer/cost_model_utils/plot_demension.py
@ -0,0 +1,81 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+
+
+import sys
+import numpy as np
+import matplotlib as mpl
+from matplotlib import cm
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D
+import math
+import getopt
+
+def extract_int_info_from_line(line):
+    """Parse a comma-separated line into a list of ints (fractions truncated)."""
+    return [int(float(field)) for field in line.split(",")]
+
+def case_cmp(a,b,c):
+    """Three-way compare rows ``a`` and ``b`` on column ``c``: -1, 1 or 0."""
+    left, right = a[c], b[c]
+    if left < right:
+        return -1
+    if left > right:
+        return 1
+    return 0
+
+# One comparator per column index 0-9. The default argument ``z = count``
+# binds the column when each lambda is created, so every comparator keeps
+# its own column instead of the last loop value.
+cmp_n = [lambda x, y, z = count: case_cmp(x, y, z) for count in range(10)]
+#cmp_n = [lambda x, y: cmp(x[count], y[count]) for count in range(10)]
+
+# Line colors; do_plot picks one by the position of the plotted file.
+colors = ["red", "green", "blue", "yellow", "purple", "black", "pink" , "brown", "cyan" ,"orange"]
+
+def do_plot(file_cases):
+    """Draw one line per input file on a shared axis and show the figure.
+
+    file_cases: sequence of ``[x_values, y_values]`` pairs, one per file.
+    """
+    fig = plt.figure()
+    fig.set_size_inches((20,10))
+    ax1 = fig.add_subplot(111)
+    for i, series in enumerate(file_cases):
+        # Cycle through the palette so that more files than colors no longer
+        # raises IndexError.
+        ax1.plot(series[0], series[1], color=colors[i % len(colors)])
+    plt.show()
+
+if __name__ == '__main__':
+    # Usage: -f FILE (repeatable) -h X_COLUMN -d Y_COLUMN
+    file_names = []
+    horizen = 0
+    demension = 0
+    wrong_arg = False
+    opts,args = getopt.getopt(sys.argv[1:],"f:h:d:")
+
+    for op, value in opts:
+        if "-f" == op:
+            file_names.append(value)
+        elif "-h" == op:
+            horizen = int(value)
+        elif "-d" == op:
+            demension = int(value)
+        else:
+            wrong_arg = True
+
+    if horizen == demension or len(file_names) == 0 or wrong_arg:
+        print "wrong arg"
+        # Report the usage error through a non-zero exit status.
+        sys.exit(1)
+
+    file_cases = []
+    for name in file_names:
+        cases = []
+        # ``with`` closes the file even if a line fails to parse.
+        with open(name) as data_file:
+            for line in data_file:
+                # Skip blank lines, '[...]' lines and '#' comments.
+                if not line.strip() or line[0] == '[' or line.startswith('#'):
+                    continue
+                cases.append(extract_int_info_from_line(line))
+        # A key sort is stable like the former cmp sort, and is not limited
+        # to the ten column indexes covered by cmp_n.
+        cases.sort(key=lambda case: case[horizen])
+        horizens = [case[horizen] for case in cases]
+        demensions = [case[demension] for case in cases]
+        file_cases.append([np.array(horizens), np.array(demensions)])
+
+    do_plot(file_cases)
--- a/unittest/sql/optimizer/cost_model_utils/plot_multi.py
+++ b/unittest/sql/optimizer/cost_model_utils/plot_multi.py
@ -0,0 +1,81 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+
+
+import sys
+import numpy as np
+import matplotlib as mpl
+from matplotlib import cm
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D
+import math
+
+def extract_int_info_from_line(line):
+    """Turn a comma-separated line into a list of ints, truncating fractions."""
+    fields = line.split(",")
+    return [int(float(field)) for field in fields]
+
+def case_cmp(a,b,c):
+    """Three-way compare rows ``a`` and ``b`` on column ``c``.
+
+    Returns -1 if ``a[c] < b[c]``, 1 if ``a[c] > b[c]``, otherwise 0.
+    The leftover debugging print for ``c > 1251`` has been removed.
+    """
+    if a[c] < b[c] :
+        return -1
+    elif a[c] > b[c] :
+        return 1
+    else :
+        return 0
+
+# One comparator per column index 0-9. The default argument ``z = count``
+# binds the column when each lambda is created.
+cmp_n = [lambda x, y, z = count: case_cmp(x, y, z) for count in range(10)]
+#cmp_n = [lambda x, y: cmp(x[count], y[count]) for count in range(10)]
+
+# Line colors; do_plot takes them in order, one per plotted column.
+colors = ["red", "green", "blue", "yellow", "purple", "black", "pink", "cyan", "brown", "gray"]
+
+def do_plot(arg, horizen, need_columns_id,label):
+    """Plot selected columns of ``arg`` against column ``horizen`` and show the figure.
+
+    arg: non-empty list of rows; the first row fixes the number of columns.
+    horizen: index of the column used for the x axis.
+    need_columns_id: indexes of the columns to draw.
+    label: value passed to the axes' ``set_label``.
+    """
+    # Transpose the rows into one list per column.
+    arrs = [[] for _ in arg[0]]
+    for case in arg:
+        for i, value in enumerate(case):
+            arrs[i].append(value)
+
+    np_arrs = [np.array(a) for a in arrs]
+    fig = plt.figure()
+
+    fig.set_size_inches((20,10))
+    ax1 = fig.add_subplot(111)
+    ax1.set_label(label)
+    color_id = 0
+
+    for i, column in enumerate(np_arrs):
+        if i == horizen or i not in need_columns_id:
+            continue
+        # Cycle through the palette so that drawing more columns than there
+        # are colors no longer raises IndexError.
+        ax1.plot(np_arrs[horizen], column, color=colors[color_id % len(colors)])
+        color_id = color_id + 1
+    plt.show()
+
+if __name__ == '__main__':
+    # Usage: plot_multi.py FILE X_COLUMN COLUMNS
+    # COLUMNS is "all" or a comma-separated list of column indexes.
+    if len(sys.argv) < 4:
+        print "wrong arg"
+    else:
+        file_name = sys.argv[1]
+        horizen = int(sys.argv[2])
+        need_columns = sys.argv[3]
+        if need_columns == "all":
+            need_columns_id = list(range(100))
+        else:
+            need_columns_id = [int(i) for i in need_columns.split(",")]
+
+        cases = []
+        # ``with`` closes the input file, which was previously left open.
+        with open(file_name, "r") as data_file:
+            for line in data_file:
+                if line[0] == '[' or line.startswith('#'):
+                    continue
+                cases.append(extract_int_info_from_line(line))
+        # A key sort is stable like the former cmp sort, and is not limited
+        # to the ten column indexes covered by cmp_n.
+        cases.sort(key=lambda case: case[horizen])
+        # Label the plot with the file name rather than the file object.
+        do_plot(cases, horizen, need_columns_id, file_name)
--- a/unittest/sql/optimizer/cost_model_utils/preprocess.py
+++ b/unittest/sql/optimizer/cost_model_utils/preprocess.py
@ -0,0 +1,175 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+
+import sys
+import os
+import numpy as np
+import getopt
+
+
+# Defaults. The first command-line argument is taken as the input file name
+# before option parsing; a later -i option replaces it.
+file_name = "scan_model.res.formal"
+if len(sys.argv) >= 2:
+    file_name = sys.argv[1]
+out_file_name = file_name + ".prep"
+# Number of consecutive data rows averaged into one output row.
+time_per_case = 2
+use_delete_min_max = False
+# Each filter is a [column, operator, value] list of strings (see -f).
+filters = []
+out_columns = [c for c in xrange(100)]
+cols_supplied = False
+wrong_arg = False
+# Column whose largest values select the rows dropped by -d.
+target_column_id = 0
+
+#sys.argv.extend("-i sort_result -o sort.prep -t 5 -C 4 -f 0,g,1 -f 0,le,100000".split(" "))
+
+# Options:
+#   -i FILE   input file            -o FILE  output file
+#   -t N, -a N  rows per case (both set time_per_case)
+#   -f COL,OP,VALUE  keep only rows matching the filter (repeatable)
+#   -d        drop the rows with the largest target-column values
+#   -C COL    target column for -d
+#   -c COLS   comma-separated output columns (repeatable, accumulates)
+opts,args = getopt.getopt(sys.argv[1:],"i:o:t:f:a:dc:C:")
+for op, value in opts:
+    if "-i" == op:
+        file_name = value
+    elif "-o" == op:
+        out_file_name = value
+    elif "-t" == op:
+        time_per_case = int(value)
+    elif "-f" == op:
+        filter_str = value
+        filter_elements = filter_str.split(",")
+        if not filter_elements[1] in ["g","l","ge","le","e","ne"]:
+            print "invalid filter type"
+            sys.exit(1)
+        filters.append(filter_str.split(","))
+    elif "-a" == op:
+        time_per_case = int(value)
+    elif "-d" == op:
+        use_delete_min_max = True
+    elif "-C" == op:
+        target_column_id = int(value)
+    elif "-c" == op:
+        # The first -c replaces the default "all columns" list.
+        if not cols_supplied:
+            cols_supplied = True
+            out_columns = []
+        out_columns.extend([int(c) for c in value.split(",")])
+    else:
+        wrong_arg = True
+
+if wrong_arg:
+    print "wrong arg"
+    sys.exit(1)
+
+# -d is silently ignored when fewer than 5 rows make up a case.
+if time_per_case < 5:
+    use_delete_min_max = False
+
+if os.path.exists(out_file_name):
+    os.remove(out_file_name)
+
+origin_file = open(file_name, "r")
+out_file = open(out_file_name,"w")
+
+# i counts the rows collected for the current case (0 .. time_per_case - 1).
+i = 0
+# column_nums[n] holds the values of column n for the current case.
+column_nums = []
+avgs = []
+avg_strs = []
+
+def delete(li, index):
+    """Return a copy of ``li`` without the element at ``index``; ``li`` is not modified."""
+    return li[:index] + li[index+1:]
+
+def find_max_index(l):
+    """Return the index of the first largest element of ``l``, or -1 if ``l`` is empty."""
+    max_i = -1
+    for i, value in enumerate(l):
+        # Compare with the best element so far instead of a magic sentinel,
+        # so arbitrarily small values are handled too.
+        if max_i < 0 or value > l[max_i]:
+            max_i = i
+    return max_i
+
+def find_min_index(l):
+    """Return the index of the first smallest element of ``l``, or -1 if ``l`` is empty."""
+    min_i = -1
+    for i, value in enumerate(l):
+        # Compare with the best element so far instead of a magic sentinel,
+        # so arbitrarily large values are handled too.
+        if min_i < 0 or value < l[min_i]:
+            min_i = i
+    return min_i
+
+def delete_max_min_case(column_nums, column_id):
+    """Drop, in place, the two rows with the largest value in column ``column_id``.
+
+    ``column_nums`` is a list of per-column value lists; the same row index
+    is removed from every column. Despite the name, no minimum is removed.
+    """
+    for _ in (0, 1):
+        drop_i = find_max_index(column_nums[column_id])
+        column_nums[:] = [delete(column, drop_i) for column in column_nums]
+
+
+def do_filter(column_strs):
+    """Return True if the row must be dropped because it fails a filter.
+
+    column_strs: the row's column values as strings.
+    Each entry of the module-level ``filters`` is ``[column, op, value]``;
+    ``op`` names the condition a row must satisfy to be kept
+    (g, l, ge, le, e, ne). Thresholds are parsed as floats, so fractional
+    values such as ``0.5`` are accepted as well as integers.
+    """
+    import operator
+    # For every filter kind, the comparison that makes a row fail it.
+    rejects = {
+        "g": operator.le,
+        "l": operator.ge,
+        "ge": operator.lt,
+        "le": operator.gt,
+        "e": operator.ne,
+        "ne": operator.eq,
+    }
+    for f in filters:
+        reject = rejects.get(f[1])
+        # Unknown filter kinds never drop a row.
+        if reject is None:
+            continue
+        if reject(float(column_strs[int(f[0])]), float(f[2])):
+            return True
+    return False
+
+
+# Average every group of time_per_case consecutive data rows into one
+# output row. A trailing group with fewer rows is never written, because
+# i does not reach time_per_case - 1 for it.
+for line in origin_file:
+    if line.startswith("#"):
+        # Comment lines are copied to the output unchanged.
+        out_file.write(line)
+        continue #skip comment
+    column_strs_raw = line.split(",")
+    # Filtered rows do not count towards the current group.
+    if do_filter(column_strs_raw):
+        continue
+    column_count = len(column_strs_raw)
+    if i == 0:
+        # First row of a group: reset the accumulators, sized by this row.
+        avg_strs = []
+        avgs = []
+        column_nums = []
+        for n in xrange(column_count):
+            column_nums.append([])
+    #split line and cast to float
+    for n in xrange(column_count):
+        column_nums[n].append(float(column_strs_raw[n]))
+    if i == time_per_case - 1:
+        # Last row of the group: optionally drop outliers, then emit averages.
+        if use_delete_min_max:
+            delete_max_min_case(column_nums, target_column_id)
+        #calc avg per column
+        for n in xrange(column_count):
+            avgs.append(np.mean(column_nums[n]))
+        #cast to str
+        avg_strs = [str(a) for a in avgs]
+        real_avg_strs = []
+        #out_columns filter
+        for cid in xrange(len(avg_strs)):
+            if cid in out_columns:
+                real_avg_strs.append(avg_strs[cid])
+
+        out_file.write(",".join(real_avg_strs) + "\n")
+    i = (i + 1) % time_per_case
+
+origin_file.close()
+out_file.close()
+
+
+
+
+
--- a/unittest/sql/optimizer/cost_model_utils/pro_hash.py
+++ b/unittest/sql/optimizer/cost_model_utils/pro_hash.py
@ -0,0 +1,114 @@
+__author__ = 'canfang.scf'
+from op_generator import op_generator
+from cost_test_conf import Config
+import subprocess as sp
+import os
+from lmfit import Model
+import numpy as np
+
+# Build the hash-join operator object. Only its preprocessing command is
+# used below; the benchmark itself (do_bench) is not run by this script.
+hash_cls = op_generator.gen_operator("hash_join")
+conf = Config()
+conf.u_to_test_op_c = 'hash'
+conf.is_not_running_as_unittest_c = True
+conf.schema_file_c = 'c10k1x2.schema'
+conf.left_row_count_c = 1000
+conf.right_row_count_c = 1000
+conf.left_min_c = 1
+conf.right_min_c = 1
+conf.is_random_c = True
+hash_op = hash_cls(conf)
+# step 3 process data
+# Remove a stale preprocessed file before regenerating it.
+final_file_name = "hash_join_result_final"
+if os.path.exists(final_file_name):
+    os.remove(final_file_name)
+
+data_cmd = hash_op.get_data_preprocess_cmd()
+sp.check_call(data_cmd, shell=True)
+
+# step 4 fit and output
+
+# Remove a stale model file; the fitted parameters are written to it later.
+out_model_file_name = "hash_model"
+if os.path.exists(out_model_file_name):
+    os.remove(out_model_file_name)
+
+
+def hash_model_form(args,
+                    Tstart_up,
+                    Tright_outer_once,
+                    Tleft_outer_once,
+                    #Tjoin_row
+                    ):
+    """Return the modelled hash-join cost for one parameter set.
+
+    args: 6-element sequence (Nres_row, Nleft_row, Nright_row, Nequal_cond,
+          Nno_matched_right, Nno_matched_left).
+    Tstart_up, Tright_outer_once, Tleft_outer_once: parameters being fitted;
+    the per-row coefficients below are fixed constants.
+    """
+    (Nres_row, Nleft_row, Nright_row,
+     Nequal_cond, Nno_matched_right, Nno_matched_left) = args
+    # Terms are summed left to right in the original order, so the
+    # floating-point result is unchanged.
+    return (Tstart_up
+            + Nleft_row * 0.74497774
+            + Nright_row * 0.26678144
+            + Nequal_cond * 0.86340381
+            + Nres_row * 0.28939532
+            + Nno_matched_left * Tright_outer_once
+            + Nno_matched_right * Tleft_outer_once)
+
+
+def hash_model_arr(arg_sets,
+                   Tstart_up,
+                   Tright_outer_once,
+                   Tleft_outer_once):
+    """Evaluate hash_model_form for every parameter set and return a numpy array."""
+    costs = [hash_model_form(one_set, Tstart_up, Tright_outer_once, Tleft_outer_once)
+             for one_set in arg_sets]
+    return np.array(costs)
+
+
+def extract_info_from_line(line):
+    """Parse a comma-separated line into a list of floats."""
+    return [float(field) for field in line.split(",")]
+
+# Fit Tstart_up, Tright_outer_once and Tleft_outer_once, all constrained to
+# be non-negative, against the measured times.
+hash_model = Model(hash_model_arr)
+hash_model.set_param_hint("Tstart_up", min=0.0)
+hash_model.set_param_hint("Tright_outer_once", min=0.0)
+hash_model.set_param_hint("Tleft_outer_once", min=0.0)
+arg_sets = []
+times = []
+case_params = []
+with open(final_file_name, "r") as data_file:
+    for line in data_file:
+        if line.startswith('#'):
+            continue
+        case_param = extract_info_from_line(line)
+        case_params.append(case_param)
+        # Reorder columns 2,0,1,3,4,5 into the argument order expected by
+        # hash_model_form; column 6 is the measured time.
+        arg_sets.append((case_param[2], case_param[0], case_param[1], case_param[3], case_param[4], case_param[5]))
+        times.append(case_param[6])
+arg_sets_np = np.array(arg_sets)
+times_np = np.array(times)
+
+# The initial-value keyword must match the model parameter name exactly:
+# it is ``Tstart_up``, not ``Tstartup``.
+result = hash_model.fit(times_np, arg_sets=arg_sets_np,
+                        Tstart_up=0.0,
+                        Tright_outer_once=0.0,
+                        Tleft_outer_once=0.0)
+res_line = str(result.best_values["Tstart_up"]) + ","
+res_line += str(result.best_values["Tright_outer_once"]) + ","
+res_line += str(result.best_values["Tleft_outer_once"])
+print result.fit_report()
+
+if out_model_file_name:
+    with open(out_model_file_name, "w") as out_file:
+        out_file.write(res_line)
--- a/unittest/sql/optimizer/cost_model_utils/process_nestloop.py
+++ b/unittest/sql/optimizer/cost_model_utils/process_nestloop.py
@ -0,0 +1,172 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+
+import sys
+import os
+import numpy as np
+import getopt
+
+
+# Defaults. A leading positional argument is taken as the input file name
+# and its output goes to "<input>.prep".
+file_name = "scan_model.res.formal"
+if len(sys.argv) >= 2:
+    file_name = sys.argv[1]
+out_file_name = file_name + ".prep"
+time_per_case = 5
+use_delete_min_max = False
+# Each filter is a [column, operator, value] list of strings (see -f).
+filters = []
+out_columns = [c for c in xrange(100)]
+cols_supplied = False
+wrong_arg = False
+target_column_id = 0
+
+opts,args = getopt.getopt(sys.argv[1:],"i:o:t:f:a:dc:C:")
+if not args:
+    # Without positional arguments the script reads nestloop_result and
+    # writes nl_result. These used to be appended to sys.argv, which made
+    # them override any -i/-o given by the user; as plain defaults they can
+    # now be replaced by the options handled below.
+    file_name = "nestloop_result"
+    out_file_name = "nl_result"
+for op, value in opts:
+    if "-i" == op:
+        file_name = value
+    elif "-o" == op:
+        out_file_name = value
+    elif "-t" == op:
+        time_per_case = int(value)
+    elif "-f" == op:
+        filter_str = value
+        filter_elements = filter_str.split(",")
+        if not filter_elements[1] in ["g","l","ge","le","e","ne"]:
+            print "invalid filter type"
+            sys.exit(1)
+        filters.append(filter_str.split(","))
+    elif "-a" == op:
+        time_per_case = int(value)
+    elif "-d" == op:
+        use_delete_min_max = True
+    elif "-C" == op:
+        target_column_id = int(value)
+    elif "-c" == op:
+        # The first -c replaces the default "all columns" list.
+        if not cols_supplied:
+            cols_supplied = True
+            out_columns = []
+        out_columns.extend([int(c) for c in value.split(",")])
+    else:
+        wrong_arg = True
+
+if wrong_arg:
+    print "wrong arg"
+    sys.exit(1)
+
+if time_per_case < 5:
+    use_delete_min_max = False
+
+if os.path.exists(out_file_name):
+    os.remove(out_file_name)
+
+origin_file = open(file_name, "r")
+out_file = open(out_file_name,"w")
+
+i = 0
+column_nums = []
+avgs = []
+avg_strs = []
+
+def delete(li, index):
+    """Return a new list equal to ``li`` with the element at ``index`` removed."""
+    head, tail = li[:index], li[index+1:]
+    return head + tail
+
+def find_max_index(l):
+    """Return the index of the first largest element of ``l``, or -1 if ``l`` is empty."""
+    max_i = -1
+    for i, value in enumerate(l):
+        # Compare with the best element so far instead of a magic sentinel,
+        # so arbitrarily small values are handled too.
+        if max_i < 0 or value > l[max_i]:
+            max_i = i
+    return max_i
+
+def find_min_index(l):
+    """Return the index of the first smallest element of ``l``, or -1 if ``l`` is empty."""
+    min_i = -1
+    for i, value in enumerate(l):
+        # Compare with the best element so far instead of a magic sentinel,
+        # so arbitrarily large values are handled too.
+        if min_i < 0 or value < l[min_i]:
+            min_i = i
+    return min_i
+
+def delete_max_min_case(column_nums, column_id):
+    """Remove, in place, the two rows holding the largest values of column ``column_id``.
+
+    ``column_nums`` is a list of per-column value lists; the chosen row
+    index is deleted from every column. No minimum is removed.
+    """
+    for _ in (0, 1):
+        drop_i = find_max_index(column_nums[column_id])
+        column_nums[:] = [delete(column, drop_i) for column in column_nums]
+
+
+def do_filter(column_strs):
+    """Return True if the row must be dropped because it fails a filter.
+
+    column_strs: the row's column values as strings.
+    Each entry of the module-level ``filters`` is ``[column, op, value]``;
+    ``op`` names the condition a row must satisfy to be kept
+    (g, l, ge, le, e, ne). Thresholds are parsed as floats, so fractional
+    values such as ``0.5`` are accepted as well as integers.
+    """
+    import operator
+    # For every filter kind, the comparison that makes a row fail it.
+    rejects = {
+        "g": operator.le,
+        "l": operator.ge,
+        "ge": operator.lt,
+        "le": operator.gt,
+        "e": operator.ne,
+        "ne": operator.eq,
+    }
+    for f in filters:
+        reject = rejects.get(f[1])
+        # Unknown filter kinds never drop a row.
+        if reject is None:
+            continue
+        if reject(float(column_strs[int(f[0])]), float(f[2])):
+            return True
+    return False
+
+
+
+# The input is read as repeating records of 12 lines. Line 0 of a record is
+# copied through, line 1 is split on ",row_count : ", and the lines listed
+# below contribute the text that follows their marker. All other lines of a
+# record are ignored.
+value_markers = {
+    4: "join_time except conds : ",
+    5: "equal_eval : ",
+    7: "other_eval : ",
+    9: "right_cache_put : ",
+    10: "right_cache_acc : ",
+    11: "match_group_count : ",
+}
+
+state = 0 #comment line
+
+elements = []
+
+
+for line in origin_file:
+    line = line.strip()
+    if state == 0:
+        out_file.write(line + "\n")
+    elif state == 1:
+        elements = line.split(",row_count : ")
+    elif state in value_markers:
+        elements.append(line.split(value_markers[state])[1])
+        if state == 11:
+            # Last line of the record: emit the collected values.
+            out_file.write(",".join(elements) + "\n")
+    state = (state + 1) % 12
+
+origin_file.close()
+out_file.close()
+
+
+
+
+
--- a/unittest/sql/optimizer/cost_model_utils/varchar1_100.schema
+++ b/unittest/sql/optimizer/cost_model_utils/varchar1_100.schema
@ -0,0 +1 @@
+create table t1 (a varchar(100) primary key);
--- a/unittest/sql/optimizer/cost_model_utils/varchar1_200.schema
+++ b/unittest/sql/optimizer/cost_model_utils/varchar1_200.schema
@ -0,0 +1 @@
+create table t1 (a varchar(200) primary key);
				`@ -0,0 +1 @@`
				`create table t1 (c1 bigint, c2 bigint, c3 bigint, c4 bigint, c5 bigint, c6 bigint, c7 bigint, c8 bigint, c9 bigint, c10 bigint, primary key(c1))`
				`@ -0,0 +1 @@`
				`create table t1 (a varchar(100) primary key);`
				`@ -0,0 +1 @@`
				`create table t1 (a varchar(200) primary key);`