patch 4.0
This commit is contained in:
175
unittest/sql/optimizer/cost_model_utils/preprocess.py
Executable file
175
unittest/sql/optimizer/cost_model_utils/preprocess.py
Executable file
@ -0,0 +1,175 @@
|
||||
#!/bin/env python
|
||||
__author__ = 'dongyun.zdy'
|
||||
|
||||
import sys
|
||||
import os
|
||||
import numpy as np
|
||||
import getopt
|
||||
|
||||
|
||||
file_name = "scan_model.res.formal"
|
||||
if len(sys.argv) >= 2:
|
||||
file_name = sys.argv[1]
|
||||
out_file_name = file_name + ".prep"
|
||||
time_per_case = 2
|
||||
use_delete_min_max = False
|
||||
filters = []
|
||||
out_columns = [c for c in xrange(100)]
|
||||
cols_supplied = False
|
||||
wrong_arg = False
|
||||
target_column_id = 0
|
||||
|
||||
#sys.argv.extend("-i sort_result -o sort.prep -t 5 -C 4 -f 0,g,1 -f 0,le,100000".split(" "))
|
||||
|
||||
opts,args = getopt.getopt(sys.argv[1:],"i:o:t:f:a:dc:C:")
|
||||
for op, value in opts:
|
||||
if "-i" == op:
|
||||
file_name = value
|
||||
elif "-o" == op:
|
||||
out_file_name = value
|
||||
elif "-t" == op:
|
||||
time_per_case = int(value)
|
||||
elif "-f" == op:
|
||||
filter_str = value
|
||||
filter_elements = filter_str.split(",")
|
||||
if not filter_elements[1] in ["g","l","ge","le","e","ne"]:
|
||||
print "invalid filter type"
|
||||
sys.exit(1)
|
||||
filters.append(filter_str.split(","))
|
||||
elif "-a" == op:
|
||||
time_per_case = int(value)
|
||||
elif "-d" == op:
|
||||
use_delete_min_max = True
|
||||
elif "-C" == op:
|
||||
target_column_id = int(value)
|
||||
elif "-c" == op:
|
||||
if not cols_supplied:
|
||||
cols_supplied = True
|
||||
out_columns = []
|
||||
out_columns.extend([int(c) for c in value.split(",")])
|
||||
else:
|
||||
wrong_arg = True
|
||||
|
||||
if wrong_arg:
|
||||
print "wrong arg"
|
||||
sys.exit(1)
|
||||
|
||||
if time_per_case < 5:
|
||||
use_delete_min_max = False
|
||||
|
||||
if os.path.exists(out_file_name):
|
||||
os.remove(out_file_name)
|
||||
|
||||
origin_file = open(file_name, "r")
|
||||
out_file = open(out_file_name,"w")
|
||||
|
||||
i = 0
|
||||
column_nums = []
|
||||
avgs = []
|
||||
avg_strs = []
|
||||
|
||||
def delete(li, index):
|
||||
li = li[:index] + li[index+1:]
|
||||
return li
|
||||
|
||||
def find_max_index(l):
|
||||
max = -9999999999999999999999
|
||||
max_i = -1
|
||||
for i in xrange(len(l)):
|
||||
if l[i] > max:
|
||||
max = l[i]
|
||||
max_i = i
|
||||
return max_i
|
||||
|
||||
def find_min_index(l):
|
||||
min = 999999999999999999999999
|
||||
min_i = -1
|
||||
for i in xrange(len(l)):
|
||||
if l[i] < min:
|
||||
min = l[i]
|
||||
min_i = i
|
||||
return min_i
|
||||
|
||||
def delete_max_min_case(column_nums, column_id):
|
||||
# min_i = find_min_index(column_nums[len(column_nums) - 1])
|
||||
# for j in xrange(len(column_nums)):
|
||||
# column_nums[j] = delete(column_nums[j], min_i)
|
||||
max_i = find_max_index(column_nums[column_id])
|
||||
for j in xrange(len(column_nums)):
|
||||
column_nums[j] = delete(column_nums[j], max_i)
|
||||
max_i = find_max_index(column_nums[column_id])
|
||||
for j in xrange(len(column_nums)):
|
||||
column_nums[j] = delete(column_nums[j], max_i)
|
||||
# max_i = find_max_index(column_nums[column_id])
|
||||
# for j in xrange(len(column_nums)):
|
||||
# column_nums[j] = delete(column_nums[j], max_i)
|
||||
# max_i = find_max_index(column_nums[column_id])
|
||||
# for j in xrange(len(column_nums)):
|
||||
# column_nums[j] = delete(column_nums[j], max_i)
|
||||
|
||||
|
||||
def do_filter(column_strs):
|
||||
filtered = False
|
||||
for f in filters:
|
||||
if f[1] == "g" and float(column_strs[int(f[0])]) <= int(f[2]) :
|
||||
filtered = True
|
||||
break
|
||||
elif f[1] == "l" and float(column_strs[int(f[0])]) >= int(f[2]) :
|
||||
filtered = True
|
||||
break
|
||||
elif f[1] == "ge" and float(column_strs[int(f[0])]) < int(f[2]) :
|
||||
filtered = True
|
||||
break
|
||||
elif f[1] == "le" and float(column_strs[int(f[0])]) > int(f[2]) :
|
||||
filtered = True
|
||||
break
|
||||
elif f[1] == "e" and float(column_strs[int(f[0])]) != int(f[2]) :
|
||||
filtered = True
|
||||
break
|
||||
elif f[1] == "ne" and float(column_strs[int(f[0])]) == int(f[2]) :
|
||||
filtered = True
|
||||
break
|
||||
return filtered
|
||||
|
||||
|
||||
for line in origin_file:
|
||||
if line.startswith("#"):
|
||||
out_file.write(line)
|
||||
continue #skip comment
|
||||
column_strs_raw = line.split(",")
|
||||
if do_filter(column_strs_raw):
|
||||
continue
|
||||
column_count = len(column_strs_raw)
|
||||
if i == 0:
|
||||
avg_strs = []
|
||||
avgs = []
|
||||
column_nums = []
|
||||
for n in xrange(column_count):
|
||||
column_nums.append([])
|
||||
#split line and cast to float
|
||||
for n in xrange(column_count):
|
||||
column_nums[n].append(float(column_strs_raw[n]))
|
||||
if i == time_per_case - 1:
|
||||
if use_delete_min_max:
|
||||
delete_max_min_case(column_nums, target_column_id)
|
||||
#calc avg per column
|
||||
for n in xrange(column_count):
|
||||
avgs.append(np.mean(column_nums[n]))
|
||||
#cast to str
|
||||
avg_strs = [str(a) for a in avgs]
|
||||
real_avg_strs = []
|
||||
#out_columns filter
|
||||
for cid in xrange(len(avg_strs)):
|
||||
if cid in out_columns:
|
||||
real_avg_strs.append(avg_strs[cid])
|
||||
|
||||
out_file.write(",".join(real_avg_strs) + "\n")
|
||||
i = (i + 1) % time_per_case
|
||||
|
||||
origin_file.close()
|
||||
out_file.close()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user