patch 4.0

2022-10-24 10:34:53 +08:00
parent 4ad6e00ec3
commit 93a1074b0c
10533 changed files with 2588271 additions and 2299373 deletions
--- a/unittest/sql/optimizer/cost_model_utils/preprocess.py
+++ b/unittest/sql/optimizer/cost_model_utils/preprocess.py
@ -0,0 +1,175 @@
+#!/bin/env python
+__author__ = 'dongyun.zdy'
+
+import sys
+import os
+import numpy as np
+import getopt
+
+
+file_name = "scan_model.res.formal"
+out_file_name = None  # derived from file_name after parsing unless -o is given
+time_per_case = 2
+use_delete_min_max = False
+filters = []
+out_columns = [c for c in xrange(100)]
+cols_supplied = False
+wrong_arg = False
+target_column_id = 0
+
+#sys.argv.extend("-i sort_result -o sort.prep -t 5 -C 4 -f 0,g,1 -f 0,le,100000".split(" "))
+
+try:
+    opts,args = getopt.getopt(sys.argv[1:],"i:o:t:f:a:dc:C:")
+except getopt.GetoptError:  # unknown option or missing value: getopt raises
+    print "wrong arg"
+    sys.exit(1)
+if args:  # a bare (non-option) first argument names the input file
+    file_name = args[0]
+for op, value in opts:
+    if "-i" == op:
+        file_name = value
+    elif "-o" == op:
+        out_file_name = value
+    elif op in ("-t", "-a"):  # -a sets the same value as -t
+        time_per_case = int(value)
+    elif "-f" == op:
+        filter_elements = value.split(",")  # expected form: column,op,bound
+        if len(filter_elements) < 3 or filter_elements[1] not in ["g","l","ge","le","e","ne"]:
+            print "invalid filter type"
+            sys.exit(1)
+        filters.append(filter_elements)
+    elif "-d" == op:
+        use_delete_min_max = True
+    elif "-C" == op:
+        target_column_id = int(value)
+    elif "-c" == op:
+        if not cols_supplied:  # the first -c replaces the default column list
+            cols_supplied = True
+            out_columns = []
+        out_columns.extend([int(c) for c in value.split(",")])
+
+if out_file_name is None:  # default output name follows the FINAL input name
+    out_file_name = file_name + ".prep"
+if time_per_case < 1:  # the grouping loop takes a modulo by this value
+    print "invalid time per case"
+    sys.exit(1)
+if time_per_case < 5:  # groups this small are averaged without dropping maxima
+    use_delete_min_max = False
+
+# Open the input first so that a missing input cannot destroy an old output.
+origin_file = open(file_name, "r")
+if os.path.exists(out_file_name):
+    os.remove(out_file_name)
+out_file = open(out_file_name,"w")
+
+i = 0             # position of the current row inside its group
+column_nums = []  # one list of samples per column for the current group
+avgs = []         # per-column means of the last finished group
+avg_strs = []     # avgs converted to strings
+
+def delete(li, index):
+    """Return a copy of li without item index; out-of-range leaves it whole."""
+    return li[:index] + li[index + 1:] if index >= 0 else li[:]
+
+def find_max_index(l):
+    """Return the index of the first largest item of l, or -1 if l is empty."""
+    max_i = -1
+    for i, item in enumerate(l):
+        # seed with the first item so no magic sentinel bounds the input
+        if max_i < 0 or item > l[max_i]:
+            max_i = i
+    return max_i
+
+def find_min_index(l):
+    """Return the index of the first smallest item of l, or -1 if l is empty."""
+    min_i = -1
+    for i, item in enumerate(l):
+        # seed with the first item so no magic sentinel bounds the input
+        if min_i < 0 or item < l[min_i]:
+            min_i = i
+    return min_i
+
+def delete_max_min_case(column_nums, column_id):
+    """Drop, in place, the two rows holding the largest values of one column.
+
+    column_nums -- list of per-column sample lists; each inner list is
+                   replaced by a copy that lacks the dropped rows.
+    column_id   -- index of the column whose largest values pick the rows.
+
+    Despite the name, only maxima are dropped; no minimum is removed.
+    Returns None.
+    """
+    for _ in xrange(2):
+        # search again each round: the previous removal shifted the rows
+        row = find_max_index(column_nums[column_id])
+        for col in xrange(len(column_nums)):
+            # delete() builds a new list, so store it back into the slot
+            column_nums[col] = delete(column_nums[col], row)
+
+
+def do_filter(column_strs):
+    """Tell whether a row is rejected by any of the -f filters.
+
+    column_strs -- raw string fields of one row.  Each entry of the global
+    filters list is [column, op, bound]; the row survives it only when
+    "field <op> bound" holds.  Bounds may be ints or floats (0.5, 1e5) and
+    entries with an unknown op are ignored.  Returns True on rejection.
+    """
+    import operator  # local import keeps the block self-contained
+    # each op maps to the comparison that REJECTS a row (the negated op)
+    reject_if = {"g": operator.le, "l": operator.ge, "ge": operator.lt,
+                 "le": operator.gt, "e": operator.ne, "ne": operator.eq}
+    for f in filters:
+        if f[1] not in reject_if:
+            continue
+        try:
+            bound = int(f[2])  # integer bounds keep their exact value
+        except ValueError:
+            bound = float(f[2])  # e.g. "0.5", which int() cannot parse
+        if reject_if[f[1]](float(column_strs[int(f[0])]), bound):
+            return True
+    return False
+
+
+# Collapse every time_per_case consecutive accepted rows into one row of means.
+# NOTE: a trailing group with fewer than time_per_case rows is never written.
+try:
+    for line in origin_file:
+        if line.startswith("#"):
+            out_file.write(line)
+            continue #skip comment
+        if not line.strip():
+            continue  # blank line: nothing to parse, float("") would raise
+        column_strs_raw = line.split(",")
+        if do_filter(column_strs_raw):
+            continue  # rejected rows do not count towards the group size
+        column_count = len(column_strs_raw)
+        if i == 0:
+            # first row of a group: one empty sample list per column
+            column_nums = [[] for _ in xrange(column_count)]
+        #split line and cast to float
+        for n in xrange(column_count):
+            column_nums[n].append(float(column_strs_raw[n]))
+        if i == time_per_case - 1:
+            if use_delete_min_max:
+                delete_max_min_case(column_nums, target_column_id)
+            #calc avg per column
+            avgs = [np.mean(column_nums[n]) for n in xrange(column_count)]
+            #cast to str
+            avg_strs = [str(a) for a in avgs]
+            #out_columns filter; without -c every column is written, not
+            #just the first 100 of the default out_columns list
+            real_avg_strs = [s for cid, s in enumerate(avg_strs)
+                             if not cols_supplied or cid in out_columns]
+            out_file.write(",".join(real_avg_strs) + "\n")
+        i = (i + 1) % time_per_case
+finally:
+    # close both files even when a malformed row aborts the loop
+    origin_file.close()
+    out_file.close()
+
+
+
+
+