# oceanbase/unittest/sql/optimizer/cost_model_utils/preprocess.py

#!/bin/env python
__author__ = 'dongyun.zdy'

import sys
import os
import numpy as np
import getopt


# ---------------------------------------------------------------------------
# Command-line configuration.
#
# The input file may be given as the first positional argument or with -i.
#   -i FILE    input file (default: scan_model.res.formal)
#   -o FILE    output file (default: <input file>.prep)
#   -t N       number of consecutive accepted rows averaged into one output row
#   -a N       sets the same value as -t
#   -d         remove the largest samples of the target column before
#              averaging (switched off again when N < 5)
#   -C N       index of the target column used by -d
#   -c A,B,..  column indices to write out; may be repeated
#   -f C,OP,V  row filter on column C; OP is one of g, l, ge, le, e, ne;
#              may be repeated
# ---------------------------------------------------------------------------
file_name = "scan_model.res.formal"
out_file_name = None  # derived from file_name after parsing unless -o is given
time_per_case = 2
use_delete_min_max = False
filters = []
# Default output selection: the first 100 column indices.
out_columns = list(range(100))
cols_supplied = False
wrong_arg = False
target_column_id = 0
input_supplied = False

#sys.argv.extend("-i sort_result -o sort.prep -t 5 -C 4 -f 0,g,1 -f 0,le,100000".split(" "))

try:
    opts, args = getopt.getopt(sys.argv[1:], "i:o:t:f:a:dc:C:")
except getopt.GetoptError:
    # getopt raises for unknown options or missing option values, so the
    # "wrong arg" report has to be triggered from here.
    opts, args = [], []
    wrong_arg = True

for op, value in opts:
    if "-i" == op:
        file_name = value
        input_supplied = True
    elif "-o" == op:
        out_file_name = value
    elif "-t" == op:
        time_per_case = int(value)
    elif "-f" == op:
        filter_elements = value.split(",")
        # A filter needs at least column, type and bound.
        if len(filter_elements) < 3:
            print("invalid filter format")
            sys.exit(1)
        if filter_elements[1] not in ["g", "l", "ge", "le", "e", "ne"]:
            print("invalid filter type")
            sys.exit(1)
        filters.append(filter_elements)
    elif "-a" == op:
        time_per_case = int(value)
    elif "-d" == op:
        use_delete_min_max = True
    elif "-C" == op:
        target_column_id = int(value)
    elif "-c" == op:
        # The first -c replaces the default selection; later ones add to it.
        if not cols_supplied:
            cols_supplied = True
            out_columns = []
        out_columns.extend([int(c) for c in value.split(",")])
    else:
        wrong_arg = True

if wrong_arg:
    print("wrong arg")
    sys.exit(1)

# A leading positional argument names the input file unless -i was used.
if not input_supplied and args:
    file_name = args[0]

# Derive the output name from the final input name, never from a raw
# argv entry (which could be an option flag such as "-i").
if out_file_name is None:
    out_file_name = file_name + ".prep"

# Outlier removal is only applied when at least 5 rows are averaged per case.
if time_per_case < 5:
    use_delete_min_max = False

# Open the input first so that a missing input file does not cost us an
# already existing output file.
origin_file = open(file_name, "r")

if os.path.exists(out_file_name):
    os.remove(out_file_name)

out_file = open(out_file_name, "w")

# State shared with the averaging loop below.
i = 0              # position of the current row inside its group
column_nums = []   # one list of float samples per column for the group
avgs = []          # per-column averages of the finished group
avg_strs = []      # string form of avgs

def delete(li, index):
    """Return a new sequence equal to ``li`` without the item at ``index``.

    The result is built from the slice before ``index`` and the slice after
    it, so ``li`` itself is left untouched. An ``index`` past the end yields
    an unchanged copy.

    NOTE(review): callers appear to pass non-negative indices only; a negative
    index does not remove an element counted from the end.
    """
    head = li[:index]
    tail = li[index + 1:]
    return head + tail

def find_max_index(l):
    """Return the index of the first occurrence of the largest item in ``l``.

    The search is seeded with the first item instead of a fixed "very small"
    constant, so every non-empty input yields a valid index regardless of
    the magnitude of its values.

    Returns -1 when ``l`` is empty.
    """
    best_i = -1
    best = None
    for i, value in enumerate(l):
        # Strict comparison keeps the earliest index among equal maxima.
        if best_i < 0 or value > best:
            best = value
            best_i = i
    return best_i

def find_min_index(l):
    """Return the index of the first occurrence of the smallest item in ``l``.

    The search is seeded with the first item instead of a fixed "very large"
    constant, so every non-empty input yields a valid index regardless of
    the magnitude of its values.

    Returns -1 when ``l`` is empty.
    """
    best_i = -1
    best = None
    for i, value in enumerate(l):
        # Strict comparison keeps the earliest index among equal minima.
        if best_i < 0 or value < best:
            best = value
            best_i = i
    return best_i

def delete_max_min_case(column_nums, column_id):
    """Drop the two rows holding the largest values of column ``column_id``.

    ``column_nums`` is a list with one sample list per column. In each of the
    two passes the position of the largest value in the target column is
    looked up with ``find_max_index`` and that position is removed from every
    column via ``delete``, so the columns stay aligned row by row.

    The entries of ``column_nums`` are replaced in place; nothing is returned.
    Despite the function name, no minimum is removed.
    """
    for _ in (0, 1):
        row = find_max_index(column_nums[column_id])
        for col, samples in enumerate(column_nums):
            column_nums[col] = delete(samples, row)


# Rejection predicates keyed by filter type: each returns True when a row
# value FAILS the requested comparison against the bound, i.e. the row must
# be dropped.
_FILTER_REJECTS = {
    "g": lambda value, bound: value <= bound,
    "l": lambda value, bound: value >= bound,
    "ge": lambda value, bound: value < bound,
    "le": lambda value, bound: value > bound,
    "e": lambda value, bound: value != bound,
    "ne": lambda value, bound: value == bound,
}


def _parse_bound(text):
    """Parse a filter bound, preferring int and falling back to float."""
    try:
        return int(text)
    except ValueError:
        return float(text)


def do_filter(column_strs):
    """Tell whether a row is rejected by any entry of the global ``filters``.

    Each filter is a sequence ``[column_index, type, bound]`` of strings. The
    named column of ``column_strs`` is converted to float and compared with
    the bound; a row must satisfy every filter to be kept. Filters with a
    type other than g, l, ge, le, e, ne are ignored.

    The bound may be an integer or a fractional number.

    Returns True when the row must be skipped, False when it is kept.
    Raises ValueError when a column value or bound is not numeric and
    IndexError when a filter names a column the row does not have.
    """
    for f in filters:
        rejects = _FILTER_REJECTS.get(f[1])
        if rejects is None:
            continue
        if rejects(float(column_strs[int(f[0])]), _parse_bound(f[2])):
            return True
    return False


# Main pass: copy comment lines through unchanged, and replace every group of
# ``time_per_case`` accepted data rows by one row of per-column averages.
# A trailing group with fewer than ``time_per_case`` rows produces no output.
selected_columns = set(out_columns)  # built once for fast membership tests
try:
    for line in origin_file:
        if line.startswith("#"):
            out_file.write(line)
            continue  # comment lines are not data
        if not line.strip():
            continue  # blank lines carry no samples
        column_strs_raw = line.split(",")
        if do_filter(column_strs_raw):
            continue
        column_count = len(column_strs_raw)
        if i == 0:
            # First row of a group: start with fresh per-column buffers.
            avg_strs = []
            avgs = []
            column_nums = [[] for _ in range(column_count)]
        # Split line and cast to float.
        for n in range(column_count):
            column_nums[n].append(float(column_strs_raw[n]))
        if i == time_per_case - 1:
            # Last row of the group: reduce the buffers to one output row.
            if use_delete_min_max:
                delete_max_min_case(column_nums, target_column_id)
            avgs = [np.mean(column_nums[n]) for n in range(column_count)]
            avg_strs = [str(a) for a in avgs]
            # Keep only the columns selected for output.
            real_avg_strs = [
                text for cid, text in enumerate(avg_strs)
                if cid in selected_columns
            ]
            out_file.write(",".join(real_avg_strs) + "\n")
        i = (i + 1) % time_per_case
finally:
    # Close both files even when a malformed row aborts the pass.
    origin_file.close()
    out_file.close()