From 76ad599fd7411e490c479629516756e1c821ffe4 Mon Sep 17 00:00:00 2001 From: ElvinWei Date: Sat, 7 Jan 2023 00:50:32 +0800 Subject: [PATCH] [enhancement](histogram) optimise aggregate function histogram (#15317) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR mainly optimizes the histogram (👉🏻 https://github.com/apache/doris/pull/14910) aggregation function, including the following: 1. Support input parameters `sample_rate` and `max_bucket_num` 2. Add UT and regression test 3. Add documentation 4. Optimize function implementation logic Parameter description: - `sample_rate`: Optional. The proportion of sample data used to generate the histogram. The default is 0.2. - `max_bucket_num`: Optional. Limit the number of histogram buckets. The default value is 128. --- Example: ``` MySQL [test]> SELECT histogram(c_float) FROM histogram_test; +-------------------------------------------------------------------------------------------------------------------------------------+ | histogram(`c_float`) | +-------------------------------------------------------------------------------------------------------------------------------------+ | {"sample_rate":0.2,"max_bucket_num":128,"bucket_num":3,"buckets":[{"lower":"0.1","upper":"0.1","count":1,"pre_sum":0,"ndv":1},...]} | +-------------------------------------------------------------------------------------------------------------------------------------+ MySQL [test]> SELECT histogram(c_string, 0.5, 2) FROM histogram_test; +-------------------------------------------------------------------------------------------------------------------------------------+ | histogram(`c_string`) | +-------------------------------------------------------------------------------------------------------------------------------------+ | {"sample_rate":0.5,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"str1","upper":"str7","count":4,"pre_sum":0,"ndv":3},...]} | 
+-------------------------------------------------------------------------------------------------------------------------------------+ ``` Query result description: ``` { "sample_rate": 0.2, "max_bucket_num": 128, "bucket_num": 3, "buckets": [ { "lower": "0.1", "upper": "0.2", "count": 2, "pre_sum": 0, "ndv": 2 }, { "lower": "0.8", "upper": "0.9", "count": 2, "pre_sum": 2, "ndv": 2 }, { "lower": "1.0", "upper": "1.0", "count": 2, "pre_sum": 4, "ndv": 1 } ] } ``` Field description: - sample_rate: Rate of sampling - max_bucket_num: Limit the maximum number of buckets - bucket_num: The actual number of buckets - buckets: All buckets - lower: Lower bound of the bucket - upper: Upper bound of the bucket - count: The number of elements contained in the bucket - pre_sum: The total number of elements in all preceding buckets - ndv: The number of distinct values in the bucket > Total number of histogram elements = number of elements in the last bucket (count) + total number of elements in the preceding buckets (pre_sum). 
--- .../aggregate_function_histogram.cpp | 67 ++-- .../aggregate_function_histogram.h | 372 ++++++------------ be/src/vec/utils/histogram_helpers.hpp | 270 +++++++++++++ .../agg_histogram_test.cpp | 153 +++++-- .../aggregate-functions/histogram.md | 86 ++-- .../aggregate-functions/histogram.md | 87 ++-- .../org/apache/doris/catalog/FunctionSet.java | 2 + .../test_aggregate_histogram.out | 14 + .../test_aggregate_histogram.groovy | 214 ++++++++++ 9 files changed, 877 insertions(+), 388 deletions(-) create mode 100644 be/src/vec/utils/histogram_helpers.hpp create mode 100644 regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_histogram.out create mode 100644 regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_histogram.groovy diff --git a/be/src/vec/aggregate_functions/aggregate_function_histogram.cpp b/be/src/vec/aggregate_functions/aggregate_function_histogram.cpp index b400b37edc..5ab7dd6035 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_histogram.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_histogram.cpp @@ -17,12 +17,24 @@ #include "vec/aggregate_functions/aggregate_function_histogram.h" +#include "vec/aggregate_functions/helpers.h" +#include "vec/core/types.h" + namespace doris::vectorized { template AggregateFunctionPtr create_agg_function_histogram(const DataTypes& argument_types) { - return AggregateFunctionPtr( - new AggregateFunctionHistogram, T>(argument_types)); + bool has_input_param = (argument_types.size() == 3); + + if (has_input_param) { + return AggregateFunctionPtr( + new AggregateFunctionHistogram, T, true>( + argument_types)); + } else { + return AggregateFunctionPtr( + new AggregateFunctionHistogram, T, false>( + argument_types)); + } } AggregateFunctionPtr create_aggregate_function_histogram(const std::string& name, @@ -31,34 +43,37 @@ AggregateFunctionPtr create_aggregate_function_histogram(const std::string& name const bool result_is_nullable) { 
WhichDataType type(argument_types[0]); - if (type.is_uint8()) { - return create_agg_function_histogram(argument_types); - } else if (type.is_int8()) { - return create_agg_function_histogram(argument_types); - } else if (type.is_int16()) { - return create_agg_function_histogram(argument_types); - } else if (type.is_int32()) { - return create_agg_function_histogram(argument_types); - } else if (type.is_int64()) { + LOG(INFO) << fmt::format("supported input type {} for aggregate function {}", + argument_types[0]->get_name(), name); + +#define DISPATCH(TYPE) \ + if (type.idx == TypeIndex::TYPE) return create_agg_function_histogram(argument_types); + FOR_NUMERIC_TYPES(DISPATCH) +#undef DISPATCH + + if (type.idx == TypeIndex::String) { + return create_agg_function_histogram(argument_types); + } + if (type.idx == TypeIndex::DateTime || type.idx == TypeIndex::Date) { return create_agg_function_histogram(argument_types); - } else if (type.is_int128()) { - return create_agg_function_histogram(argument_types); - } else if (type.is_float32()) { - return create_agg_function_histogram(argument_types); - } else if (type.is_float64()) { - return create_agg_function_histogram(argument_types); - } else if (type.is_decimal32()) { + } + if (type.idx == TypeIndex::DateV2) { + return create_agg_function_histogram(argument_types); + } + if (type.idx == TypeIndex::DateTimeV2) { + return create_agg_function_histogram(argument_types); + } + if (type.idx == TypeIndex::Decimal32) { return create_agg_function_histogram(argument_types); - } else if (type.is_decimal64()) { + } + if (type.idx == TypeIndex::Decimal64) { return create_agg_function_histogram(argument_types); - } else if (type.is_decimal128()) { + } + if (type.idx == TypeIndex::Decimal128) { return create_agg_function_histogram(argument_types); - } else if (type.is_date()) { - return create_agg_function_histogram(argument_types); - } else if (type.is_date_time()) { - return create_agg_function_histogram(argument_types); - } else if 
(type.is_string()) { - return create_agg_function_histogram(argument_types); + } + if (type.idx == TypeIndex::Decimal128I) { + return create_agg_function_histogram(argument_types); } LOG(WARNING) << fmt::format("unsupported input type {} for aggregate function {}", diff --git a/be/src/vec/aggregate_functions/aggregate_function_histogram.h b/be/src/vec/aggregate_functions/aggregate_function_histogram.h index f92edd7090..5f036a398d 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_histogram.h +++ b/be/src/vec/aggregate_functions/aggregate_function_histogram.h @@ -17,299 +17,140 @@ #pragma once -#include -#include -#include - -#include -#include - -#include "runtime/datetime_value.h" #include "vec/aggregate_functions/aggregate_function.h" #include "vec/aggregate_functions/aggregate_function_simple_factory.h" -#include "vec/io/io_helper.h" +#include "vec/utils/histogram_helpers.hpp" namespace doris::vectorized { -// TODO: support input parameters and statistics of sampling -const int64_t MAX_BUCKET_SIZE = 128; -const float_t SAMPLE_RATE = 1.0; - template -struct Bucket { -public: - Bucket() = default; - Bucket(T value, size_t pre_sum) - : lower(value), upper(value), count(1), pre_sum(pre_sum), ndv(1) {} - Bucket(T lower, T upper, size_t count, size_t pre_sum, size_t ndv) - : lower(lower), upper(upper), count(count), pre_sum(pre_sum), ndv(ndv) {} +struct AggregateFunctionHistogramData { + using ColVecType = + std::conditional_t, ColumnDecimal, ColumnVector>; - T lower; - T upper; - int64_t count; - int64_t pre_sum; - int64_t ndv; -}; - -struct AggregateFunctionHistogramBase { -public: - AggregateFunctionHistogramBase() = default; - - template - static std::vector> build_bucket_from_data(const std::vector& sorted_data, - int64_t max_bucket_size) { - std::vector> buckets; - - if (sorted_data.size() > 0) { - int64_t data_size = sorted_data.size(); - int num_per_bucket = (int64_t)std::ceil((Float64)data_size / max_bucket_size); - - for (int i = 0; i < 
data_size; ++i) { - T v = sorted_data[i]; - if (buckets.empty()) { - Bucket bucket(v, 0); - buckets.emplace_back(bucket); - } else { - Bucket* bucket = &buckets.back(); - T upper = bucket->upper; - if (upper == v) { - bucket->count++; - } else if (bucket->count < num_per_bucket) { - bucket->count++; - bucket->ndv++; - bucket->upper = v; - } else { - int64_t pre_sum = bucket->pre_sum + bucket->count; - Bucket new_bucket(v, pre_sum); - buckets.emplace_back(new_bucket); - } - } - } + void set_parameters(double input_sample_rate, size_t input_max_bucket_num) { + if (input_sample_rate > 0 && input_sample_rate <= 1) { + sample_rate = input_sample_rate; + } + if (input_max_bucket_num > 0) { + max_bucket_num = (uint32_t)input_max_bucket_num; } - - return buckets; } - template - static std::string build_json_from_bucket(const std::vector>& buckets, - const DataTypePtr& data_type, int64_t max_bucket_size, - int64_t sample_rate) { - rapidjson::Document doc; - doc.SetObject(); - rapidjson::Document::AllocatorType& allocator = doc.GetAllocator(); + void reset() { ordered_map.clear(); } - rapidjson::Value max_bucket_size_val(max_bucket_size); - doc.AddMember("max_bucket_size", max_bucket_size_val, allocator); + void add(const StringRef& value, const UInt64& number = 1) { + std::string data = value.to_string(); + auto it = ordered_map.find(data); + if (it != ordered_map.end()) { + it->second = it->second + number; + } else { + ordered_map.insert({data, number}); + } + } - rapidjson::Value sample_rate_val(sample_rate); - doc.AddMember("sample_rate", sample_rate_val, allocator); + void add(const T& value, const UInt64& number = 1) { + auto it = ordered_map.find(value); + if (it != ordered_map.end()) { + it->second = it->second + number; + } else { + ordered_map.insert({value, number}); + } + } - // buckets - rapidjson::Value bucket_arr(rapidjson::kArrayType); + void merge(const AggregateFunctionHistogramData& rhs) { + if (!rhs.sample_rate) { + return; + } - if (!buckets.empty()) { 
- int size = buckets.size(); - rapidjson::Value bucket_size_val(size); - doc.AddMember("bucket_size", bucket_size_val, allocator); - - WhichDataType type(data_type); - if (type.is_int() || type.is_float() || type.is_decimal() || type.is_string()) { - for (int i = 0; i < size; ++i) { - std::string lower_str = numerical_to_string(buckets[i].lower); - std::string upper_str = numerical_to_string(buckets[i].upper); - to_bucket_json(allocator, bucket_arr, lower_str, upper_str, - (int64_t)(buckets[i].count), (int64_t)(buckets[i].pre_sum), - (int64_t)(buckets[i].ndv)); - } - } else if (type.is_date_or_datetime()) { - for (int i = 0; i < size; ++i) { - std::string lower_str = to_date_string(buckets[i].lower); - std::string upper_str = to_date_string(buckets[i].upper); - to_bucket_json(allocator, bucket_arr, lower_str, upper_str, - (int64_t)(buckets[i].count), (int64_t)(buckets[i].pre_sum), - (int64_t)(buckets[i].ndv)); - } + for (auto rhs_it : rhs.ordered_map) { + auto lhs_it = ordered_map.find(rhs_it.first); + if (lhs_it != ordered_map.end()) { + lhs_it->second += rhs_it.second; } else { - rapidjson::Value bucket_size_zero(0); - doc.AddMember("bucket_size", bucket_size_zero, allocator); - LOG(WARNING) << fmt::format("unable to convert histogram data of type {}", - data_type->get_name()); + ordered_map.insert({rhs_it.first, rhs_it.second}); } } - - doc.AddMember("buckets", bucket_arr, allocator); - - rapidjson::StringBuffer sb; - rapidjson::Writer writer(sb); - doc.Accept(writer); - - return std::string(sb.GetString()); } - static void to_bucket_json(rapidjson::Document::AllocatorType& allocator, - rapidjson::Value& bucket_arr, std::string lower, std::string upper, - int64 count, int64 pre_sum, int64 ndv) { - rapidjson::Value bucket(rapidjson::kObjectType); + void write(BufferWritable& buf) const { + write_binary(sample_rate, buf); + write_binary(max_bucket_num, buf); - rapidjson::Value lower_val(lower.c_str(), allocator); - bucket.AddMember("lower", lower_val, allocator); 
+ uint64_t element_number = (uint64_t)ordered_map.size(); + write_binary(element_number, buf); - rapidjson::Value upper_val(upper.c_str(), allocator); - bucket.AddMember("upper", upper_val, allocator); + auto pair_vector = map_to_vector(); - rapidjson::Value count_val(count); - bucket.AddMember("count", count_val, allocator); + for (auto i = 0; i < element_number; i++) { + auto element = pair_vector[i]; + write_binary(element.second, buf); + write_binary(element.first, buf); + } + } - rapidjson::Value pre_sum_val(pre_sum); - bucket.AddMember("pre_sum", pre_sum_val, allocator); + void read(BufferReadable& buf) { + read_binary(sample_rate, buf); + read_binary(max_bucket_num, buf); - rapidjson::Value ndv_val(ndv); - bucket.AddMember("ndv", ndv_val, allocator); + uint64_t element_number = 0; + read_binary(element_number, buf); - bucket_arr.PushBack(bucket, allocator); + ordered_map.clear(); + std::pair element; + for (auto i = 0; i < element_number; i++) { + read_binary(element.first, buf); + read_binary(element.second, buf); + ordered_map.insert(element); + } + } + + void insert_result_into(IColumn& to) const { + auto pair_vector = map_to_vector(); + for (auto i = 0; i < pair_vector.size(); i++) { + const auto& element = pair_vector[i]; + if constexpr (std::is_same_v) { + static_cast(to).insert_data(element.second.c_str(), + element.second.length()); + } else { + static_cast(to).get_data().push_back(element.second); + } + } + } + + std::string get(const DataTypePtr& data_type) const { + std::vector> buckets; + rapidjson::StringBuffer buffer; + build_bucket_from_data(buckets, ordered_map, sample_rate, max_bucket_num); + build_json_from_bucket(buffer, buckets, data_type, sample_rate, max_bucket_num); + return std::string(buffer.GetString()); + } + + std::vector> map_to_vector() const { + std::vector> pair_vector; + for (auto it : ordered_map) { + pair_vector.emplace_back(it.second, it.first); + } + return pair_vector; } private: - template - static std::string 
numerical_to_string(T input) { - fmt::memory_buffer buffer; - fmt::format_to(buffer, "{}", input); - return std::string(buffer.data(), buffer.size()); - } - - template - static std::string to_date_string(T input) { - auto* date_int = reinterpret_cast(&input); - auto date_value = binary_cast(*date_int); - char buf[32] = {}; - date_value.to_string(buf); - return std::string(buf, strlen(buf)); - } + double sample_rate = 0.2; + uint32_t max_bucket_num = 128; + std::map ordered_map; }; -template -struct AggregateFunctionHistogramData : public AggregateFunctionHistogramBase { - using ElementType = T; - using ColVecType = ColumnVectorOrDecimal; - PaddedPODArray data; - - void add(const IColumn& column, size_t row_num) { - const auto& vec = assert_cast(column).get_data(); - data.push_back(vec[row_num]); - } - - void write(BufferWritable& buf) const { - write_var_uint(data.size(), buf); - buf.write(data.raw_data(), data.size() * sizeof(ElementType)); - } - - void read(BufferReadable& buf) { - UInt64 rows = 0; - read_var_uint(rows, buf); - data.resize(rows); - buf.read(reinterpret_cast(data.data()), rows * sizeof(ElementType)); - } - - void merge(const AggregateFunctionHistogramData& rhs) { - data.insert(rhs.data.begin(), rhs.data.end()); - } - - void insert_result_into(IColumn& to) const { - auto& vec = assert_cast(to).get_data(); - size_t old_size = vec.size(); - vec.resize(old_size + data.size()); - memcpy(vec.data() + old_size, data.data(), data.size() * sizeof(ElementType)); - } - - std::string get(const DataTypePtr& data_type) const { - std::vector vec_data; - - for (size_t i = 0; i < data.size(); ++i) { - [[maybe_unused]] ElementType d = data[i]; - vec_data.push_back(d); - } - - std::sort(vec_data.begin(), vec_data.end()); - auto buckets = build_bucket_from_data(vec_data, MAX_BUCKET_SIZE); - auto result_str = build_json_from_bucket(buckets, data_type, MAX_BUCKET_SIZE, - SAMPLE_RATE); - - return result_str; - } - - void reset() { data.clear(); } -}; - -template <> 
-struct AggregateFunctionHistogramData : public AggregateFunctionHistogramBase { - using ElementType = StringRef; - using ColVecType = ColumnString; - MutableColumnPtr data; - - AggregateFunctionHistogramData() { data = ColVecType::create(); } - - void add(const IColumn& column, size_t row_num) { data->insert_from(column, row_num); } - - void write(BufferWritable& buf) const { - auto& col = assert_cast(*data); - - write_var_uint(col.size(), buf); - buf.write(col.get_offsets().raw_data(), col.size() * sizeof(IColumn::Offset)); - - write_var_uint(col.get_chars().size(), buf); - buf.write(col.get_chars().raw_data(), col.get_chars().size()); - } - - void read(BufferReadable& buf) { - auto& col = assert_cast(*data); - UInt64 offs_size = 0; - read_var_uint(offs_size, buf); - col.get_offsets().resize(offs_size); - buf.read(reinterpret_cast(col.get_offsets().data()), - offs_size * sizeof(IColumn::Offset)); - - UInt64 chars_size = 0; - read_var_uint(chars_size, buf); - col.get_chars().resize(chars_size); - buf.read(reinterpret_cast(col.get_chars().data()), chars_size); - } - - void merge(const AggregateFunctionHistogramData& rhs) { - data->insert_range_from(*rhs.data, 0, rhs.data->size()); - } - - void insert_result_into(IColumn& to) const { - auto& to_str = assert_cast(to); - to_str.insert_range_from(*data, 0, data->size()); - } - - std::string get(const DataTypePtr& data_type) const { - std::vector str_data; - auto* res_column = reinterpret_cast(data.get()); - - for (int i = 0; i < res_column->size(); ++i) { - [[maybe_unused]] ElementType c = res_column->get_data_at(i); - str_data.push_back(c.to_string()); - } - - std::sort(str_data.begin(), str_data.end()); - const auto buckets = build_bucket_from_data(str_data, MAX_BUCKET_SIZE); - auto result_str = build_json_from_bucket(buckets, data_type, MAX_BUCKET_SIZE, - SAMPLE_RATE); - - return result_str; - } - - void reset() { data->clear(); } -}; - -template +template class AggregateFunctionHistogram final - : public 
IAggregateFunctionDataHelper> { + : public IAggregateFunctionDataHelper< + Data, AggregateFunctionHistogram> { public: + using ColVecType = ColumnVectorOrDecimal; + AggregateFunctionHistogram() = default; AggregateFunctionHistogram(const DataTypes& argument_types_) - : IAggregateFunctionDataHelper>( + : IAggregateFunctionDataHelper>( argument_types_, {}), _argument_type(argument_types_[0]) {} @@ -323,7 +164,18 @@ public: return; } - this->data(place).add(*columns[0], row_num); + if (has_input_param) { + this->data(place).set_parameters( + static_cast(columns[1])->get_element(row_num), + static_cast(columns[2])->get_element(row_num)); + } + + if constexpr (std::is_same_v) { + this->data(place).add( + static_cast(*columns[0]).get_data_at(row_num)); + } else { + this->data(place).add(static_cast(*columns[0]).get_data()[row_num]); + } } void reset(AggregateDataPtr place) const override { this->data(place).reset(); } diff --git a/be/src/vec/utils/histogram_helpers.hpp b/be/src/vec/utils/histogram_helpers.hpp new file mode 100644 index 0000000000..206948d733 --- /dev/null +++ b/be/src/vec/utils/histogram_helpers.hpp @@ -0,0 +1,270 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include + +#include "vec/data_types/data_type_decimal.h" +#include "vec/io/io_helper.h" + +namespace doris::vectorized { + +template +struct Bucket { +public: + Bucket() = default; + Bucket(T value, size_t pre_sum) + : lower(value), upper(value), count(1), pre_sum(pre_sum), ndv(1) {} + Bucket(T lower, T upper, size_t count, size_t pre_sum, size_t ndv) + : lower(lower), upper(upper), count(count), pre_sum(pre_sum), ndv(ndv) {} + + T lower; + T upper; + uint64_t count; + uint64_t pre_sum; + uint64_t ndv; +}; + +template +bool build_bucket_from_data(std::vector>& buckets, + const std::map& ordered_map, double sample_rate, + uint32_t max_bucket_num) { + if (ordered_map.size() == 0) { + return false; + } + + uint64_t element_number = 0; + for (auto it : ordered_map) { + element_number += it.second; + } + + auto sample_number = (uint64_t)std::ceil(element_number * sample_rate); + auto num_per_bucket = (uint64_t)std::ceil((Float64)sample_number / max_bucket_num); + + LOG(INFO) << fmt::format( + "histogram bucket info: element number {}, sample number:{}, num per bucket:{}", + element_number, sample_number, num_per_bucket); + + if (sample_rate == 1) { + for (auto it : ordered_map) { + for (auto i = it.second; i > 0; --i) { + auto v = it.first; + value_to_bucket(buckets, v, num_per_bucket); + } + } + return true; + } + + // if sampling is required (0 sample_index(element_number); + + // use a same seed value so that we get + // same result each time we run this function + srand(element_number * sample_rate * max_bucket_num); + + while (sample_index.count() < sample_number) { + uint64_t num = (rand() % (element_number)); + sample_index[num] = true; + } + + uint64_t element_cnt = 0; + uint64_t sample_cnt = 0; + bool break_flag = false; + + for (auto it : ordered_map) { + if (break_flag) { + break; + } + for (auto i = it.second; i > 0; --i) { + if (sample_cnt >= sample_number) { + break_flag = true; + break; + } + if 
(sample_index[element_cnt]) { + sample_cnt += 1; + auto v = it.first; + value_to_bucket(buckets, v, num_per_bucket); + } + element_cnt += 1; + } + } + + return true; +} + +bool inline bucket_to_json(rapidjson::Document::AllocatorType& allocator, + rapidjson::Value& bucket_arr, std::string lower, std::string upper, + int64 count, int64 pre_sum, int64 ndv) { + rapidjson::Value bucket(rapidjson::kObjectType); + + rapidjson::Value lower_val(lower.c_str(), allocator); + bucket.AddMember("lower", lower_val, allocator); + + rapidjson::Value upper_val(upper.c_str(), allocator); + bucket.AddMember("upper", upper_val, allocator); + + rapidjson::Value count_val(count); + bucket.AddMember("count", count_val, allocator); + + rapidjson::Value pre_sum_val(pre_sum); + bucket.AddMember("pre_sum", pre_sum_val, allocator); + + rapidjson::Value ndv_val(ndv); + bucket.AddMember("ndv", ndv_val, allocator); + + bucket_arr.PushBack(bucket, allocator); + + return bucket_arr.Size() > 0; +} + +template +bool value_to_bucket(std::vector>& buckets, T v, size_t num_per_bucket) { + if (buckets.empty()) { + Bucket bucket(v, 0); + buckets.emplace_back(bucket); + } else { + Bucket* bucket = &buckets.back(); + T upper = bucket->upper; + if (upper == v) { + bucket->count++; + } else if (bucket->count < num_per_bucket) { + bucket->count++; + bucket->ndv++; + bucket->upper = v; + } else { + uint64_t pre_sum = bucket->pre_sum + bucket->count; + Bucket new_bucket(v, pre_sum); + buckets.emplace_back(new_bucket); + } + } + + return buckets.size() > 0; +} + +template +bool value_to_string(std::stringstream& ss, T input, const DataTypePtr& data_type) { + fmt::memory_buffer _insert_stmt_buffer; + switch (data_type->get_type_id()) { + case TypeIndex::Int8: + case TypeIndex::UInt8: + case TypeIndex::Int16: + case TypeIndex::UInt16: + case TypeIndex::Int32: + case TypeIndex::UInt32: + case TypeIndex::Int64: + case TypeIndex::UInt64: + case TypeIndex::Int128: + case TypeIndex::UInt128: + case TypeIndex::Float32: 
+ case TypeIndex::Float64: + case TypeIndex::String: { + fmt::memory_buffer buffer; + fmt::format_to(buffer, "{}", input); + ss << std::string(buffer.data(), buffer.size()); + break; + } + case TypeIndex::Decimal32: { + auto scale = get_decimal_scale(*data_type); + auto decimal_val = reinterpret_cast(&input); + write_text(*decimal_val, scale, ss); + break; + } + case TypeIndex::Decimal64: { + auto scale = get_decimal_scale(*data_type); + auto decimal_val = reinterpret_cast(&input); + write_text(*decimal_val, scale, ss); + break; + } + case TypeIndex::Decimal128: + case TypeIndex::Decimal128I: { + auto scale = get_decimal_scale(*data_type); + auto decimal_val = reinterpret_cast(&input); + write_text(*decimal_val, scale, ss); + break; + } + case TypeIndex::Date: + case TypeIndex::DateTime: { + auto* date_int = reinterpret_cast(&input); + auto date_value = binary_cast(*date_int); + char buf[32] = {}; + date_value.to_string(buf); + ss << std::string(buf, strlen(buf)); + break; + } + case TypeIndex::DateV2: { + auto* value = (DateV2Value*)(&input); + ss << *value; + break; + } + case TypeIndex::DateTimeV2: { + auto* value = (DateV2Value*)(&input); + ss << *value; + break; + } + default: + LOG(WARNING) << fmt::format("unable to convert histogram data of type {}", + data_type->get_name()); + return false; + } + + return true; +} + +template +bool build_json_from_bucket(rapidjson::StringBuffer& buffer, const std::vector>& buckets, + const DataTypePtr& data_type, double sample_rate, + uint32_t max_bucket_num) { + rapidjson::Document doc; + doc.SetObject(); + rapidjson::Document::AllocatorType& allocator = doc.GetAllocator(); + + rapidjson::Value sample_rate_val(sample_rate); + doc.AddMember("sample_rate", sample_rate_val, allocator); + + rapidjson::Value max_bucket_num_val(max_bucket_num); + doc.AddMember("max_bucket_num", max_bucket_num_val, allocator); + + int bucket_num = buckets.size(); + rapidjson::Value bucket_num_val(bucket_num); + doc.AddMember("bucket_num", 
bucket_num_val, allocator); + + rapidjson::Value bucket_arr(rapidjson::kArrayType); + for (auto i = 0; i < bucket_num; ++i) { + std::stringstream ss1; + std::stringstream ss2; + value_to_string(ss1, buckets[i].lower, data_type); + value_to_string(ss2, buckets[i].upper, data_type); + std::string lower_str = ss1.str(); + std::string upper_str = ss2.str(); + bucket_to_json(allocator, bucket_arr, lower_str, upper_str, (uint64_t)(buckets[i].count), + (uint64_t)(buckets[i].pre_sum), (uint64_t)(buckets[i].ndv)); + } + doc.AddMember("buckets", bucket_arr, allocator); + + rapidjson::Writer writer(buffer); + doc.Accept(writer); + + return buffer.GetSize() > 0; +} + +} // namespace doris::vectorized diff --git a/be/test/vec/aggregate_functions/agg_histogram_test.cpp b/be/test/vec/aggregate_functions/agg_histogram_test.cpp index d2d41d5f51..545e118568 100644 --- a/be/test/vec/aggregate_functions/agg_histogram_test.cpp +++ b/be/test/vec/aggregate_functions/agg_histogram_test.cpp @@ -22,7 +22,6 @@ #include "vec/aggregate_functions/aggregate_function.h" #include "vec/aggregate_functions/aggregate_function_histogram.h" #include "vec/aggregate_functions/aggregate_function_simple_factory.h" -#include "vec/columns/column_vector.h" #include "vec/data_types/data_type.h" #include "vec/data_types/data_type_date.h" #include "vec/data_types/data_type_date_time.h" @@ -45,30 +44,67 @@ public: template void agg_histogram_add_elements(AggregateFunctionPtr agg_function, AggregateDataPtr place, - size_t input_nums) { + size_t input_rows, double sample_rate, size_t max_bucket_num) { using FieldType = typename DataType::FieldType; auto type = std::make_shared(); - auto input_col = type->create_column(); - for (size_t i = 0; i < input_nums; ++i) { + + if (sample_rate == 0 || max_bucket_num == 0) { + auto input_col = type->create_column(); + for (size_t i = 0; i < input_rows; ++i) { + if constexpr (std::is_same_v) { + auto item = std::string("item") + std::to_string(i); + 
input_col->insert_data(item.c_str(), item.size()); + } else { + auto item = FieldType(static_cast(i)); + input_col->insert_data(reinterpret_cast(&item), 0); + } + } + + EXPECT_EQ(input_col->size(), input_rows); + + const IColumn* column[1] = {input_col.get()}; + for (int i = 0; i < input_col->size(); i++) { + agg_function->add(place, column, i, &_agg_arena_pool); + } + + return; + } + + MutableColumns columns(3); + columns[0] = type->create_column(); + columns[1] = ColumnFloat64::create(); + columns[2] = ColumnInt32::create(); + + for (size_t i = 0; i < input_rows; ++i) { if constexpr (std::is_same_v) { auto item = std::string("item") + std::to_string(i); - input_col->insert_data(item.c_str(), item.size()); + columns[0]->insert_data(item.c_str(), item.size()); } else { auto item = FieldType(static_cast(i)); - input_col->insert_data(reinterpret_cast(&item), 0); + columns[0]->insert_data(reinterpret_cast(&item), 0); } + columns[1]->insert_data(reinterpret_cast(&sample_rate), sizeof(sample_rate)); + columns[2]->insert_data(reinterpret_cast(&max_bucket_num), + sizeof(max_bucket_num)); } - EXPECT_EQ(input_col->size(), input_nums); - const IColumn* column[1] = {input_col.get()}; - for (int i = 0; i < input_col->size(); i++) { + EXPECT_EQ(columns[0]->size(), input_rows); + + const IColumn* column[3] = {columns[0].get(), columns[1].get(), columns[2].get()}; + for (int i = 0; i < input_rows; i++) { agg_function->add(place, column, i, &_agg_arena_pool); } } template - void test_agg_histogram(size_t input_nums = 0) { - DataTypes data_types = {(DataTypePtr)std::make_shared()}; + void test_agg_histogram(size_t input_rows = 0, double sample_rate = 0, + size_t max_bucket_num = 0) { + DataTypes data_types1 = {(DataTypePtr)std::make_shared()}; + DataTypes data_types3 = {(DataTypePtr)std::make_shared(), + std::make_shared(), + std::make_shared()}; + + auto data_types = (sample_rate == 0 || max_bucket_num == 0) ? 
data_types1 : data_types3; LOG(INFO) << "test_agg_histogram for type" << "(" << data_types[0]->get_name() << ")"; @@ -80,8 +116,8 @@ public: std::unique_ptr memory(new char[agg_function->size_of_data()]); AggregateDataPtr place = memory.get(); agg_function->create(place); - - agg_histogram_add_elements(agg_function, place, input_nums); + agg_histogram_add_elements(agg_function, place, input_rows, sample_rate, + max_bucket_num); ColumnString buf; VectorBufferWriter buf_writer(buf); @@ -93,22 +129,67 @@ public: std::unique_ptr memory2(new char[agg_function->size_of_data()]); AggregateDataPtr place2 = memory2.get(); agg_function->create(place2); - - agg_histogram_add_elements(agg_function, place2, input_nums); - + agg_histogram_add_elements(agg_function, place2, input_rows, sample_rate, + max_bucket_num); agg_function->merge(place, place2, &_agg_arena_pool); - auto column_result = ColumnString::create(); - agg_function->insert_result_into(place, *column_result); - EXPECT_EQ(column_result->size(), 1); - EXPECT_TRUE(column_result->get_offsets()[0] >= 1); + + auto column_result1 = ColumnString::create(); + agg_function->insert_result_into(place, *column_result1); + EXPECT_EQ(column_result1->size(), 1); + EXPECT_TRUE(column_result1->get_offsets()[0] >= 1); auto column_result2 = ColumnString::create(); agg_function->insert_result_into(place2, *column_result2); EXPECT_EQ(column_result2->size(), 1); EXPECT_TRUE(column_result2->get_offsets()[0] >= 1); - LOG(INFO) << column_result->get_offsets()[0]; - LOG(INFO) << column_result2->get_offsets()[0]; + LOG(INFO) << column_result1->get_data_at(0).to_string(); + LOG(INFO) << column_result2->get_data_at(0).to_string(); + + // test empty data + if (input_rows == 0 && sample_rate == 0 && max_bucket_num == 0) { + std::string expect_empty_result = + "{\"sample_rate\":0.2,\"max_bucket_num\":128,\"bucket_num\":0,\"buckets\":[]" + "}"; + std::string empty_result1 = column_result1->get_data_at(0).to_string(); + std::string empty_result2 = 
column_result2->get_data_at(0).to_string(); + EXPECT_EQ(empty_result1, expect_empty_result); + EXPECT_EQ(empty_result2, expect_empty_result); + } + + // test with data + if (input_rows == 1000 && sample_rate == 0.5 && max_bucket_num == 5) { + if constexpr (std::is_same_v) { + std::string expect_result1 = + "{\"sample_rate\":0.5,\"max_bucket_num\":5,\"bucket_num\":5,\"buckets\":[" + "{\"lower\":\"0\",\"upper\":\"189\",\"count\":200,\"pre_sum\":0,\"ndv\":" + "151}," + "{\"lower\":\"190\",\"upper\":\"380\",\"count\":200,\"pre_sum\":200," + "\"ndv\":149}," + "{\"lower\":\"382\",\"upper\":\"582\",\"count\":200,\"pre_sum\":400," + "\"ndv\":150}," + "{\"lower\":\"586\",\"upper\":\"796\",\"count\":200,\"pre_sum\":600," + "\"ndv\":157}," + "{\"lower\":\"797\",\"upper\":\"999\",\"count\":200,\"pre_sum\":800," + "\"ndv\":147}]}"; + std::string expect_result2 = + "{\"sample_rate\":0.5,\"max_bucket_num\":5,\"bucket_num\":5,\"buckets\":[" + "{\"lower\":\"0\",\"upper\":\"207\",\"count\":100,\"pre_sum\":0,\"ndv\":" + "100}," + "{\"lower\":\"209\",\"upper\":\"410\",\"count\":100,\"pre_sum\":100," + "\"ndv\":100}," + "{\"lower\":\"412\",\"upper\":\"599\",\"count\":100,\"pre_sum\":200," + "\"ndv\":100}," + "{\"lower\":\"600\",\"upper\":\"797\",\"count\":100,\"pre_sum\":300," + "\"ndv\":100}," + "{\"lower\":\"799\",\"upper\":\"998\",\"count\":100,\"pre_sum\":400," + "\"ndv\":100}]}"; + std::string result1 = column_result1->get_data_at(0).to_string(); + std::string result2 = column_result2->get_data_at(0).to_string(); + EXPECT_EQ(result1, expect_result1); + EXPECT_EQ(result2, expect_result2); + } + } agg_function->destroy(place); agg_function->destroy(place2); @@ -125,18 +206,34 @@ TEST_F(VAggHistogramTest, test_empty) { test_agg_histogram(); test_agg_histogram(); - test_agg_histogram>(); + test_agg_histogram(); + test_agg_histogram(); + test_agg_histogram(); + test_agg_histogram(); test_agg_histogram(); + test_agg_histogram>(); } TEST_F(VAggHistogramTest, test_with_data) { - 
test_agg_histogram(8); - test_agg_histogram(10); + // rows 1000, sample rate 0.5, max bucket size 5 + test_agg_histogram(1000, 0.5, 5); - test_agg_histogram>(12); - test_agg_histogram(14); - test_agg_histogram(10); + test_agg_histogram(100, 0.5, 5); + test_agg_histogram(100, 0.5, 5); + test_agg_histogram(100, 0.5, 5); + test_agg_histogram(100, 0.5, 5); + test_agg_histogram(100, 0.5, 5); + test_agg_histogram(100, 0.5, 5); + test_agg_histogram(100, 0.5, 5); + + test_agg_histogram(100, 0.5, 5); + test_agg_histogram(100, 0.5, 5); + + test_agg_histogram(100, 0.5, 5); + test_agg_histogram(100, 0.5, 5); + + test_agg_histogram>(100, 0.5, 5); } } // namespace doris::vectorized diff --git a/docs/en/docs/sql-manual/sql-functions/aggregate-functions/histogram.md b/docs/en/docs/sql-manual/sql-functions/aggregate-functions/histogram.md index 63c522c500..6cc96340dc 100644 --- a/docs/en/docs/sql-manual/sql-functions/aggregate-functions/histogram.md +++ b/docs/en/docs/sql-manual/sql-functions/aggregate-functions/histogram.md @@ -32,67 +32,79 @@ under the License. The histogram function is used to describe the distribution of the data. It uses an "equal height" bucking strategy, and divides the data into buckets according to the value of the data. It describes each bucket with some simple data, such as the number of values that fall in the bucket. It is mainly used by the optimizer to estimate the range query. +The result of the function returns an empty or Json string. + +Parameter description: +- sample_rate:Optional. The proportion of sample data used to generate the histogram. The default is 0.2. +- max_bucket_num:Optional. Limit the number of histogram buckets. The default value is 128. 
+ ### notice -``` -Only supported in vectorized engine -``` +> Only supported in vectorized engine ### example ``` -MySQL [test]> select histogram(login_time) from dev_table; -+------------------------------------------------------------------------------------------------------------------------------+ -| histogram(`login_time`) | -+------------------------------------------------------------------------------------------------------------------------------+ -| {"bucket_size":5,"buckets":[{"lower":"2022-09-21 17:30:29","upper":"2022-09-21 22:30:29","count":9,"pre_sum":0,"ndv":1},...]}| -+------------------------------------------------------------------------------------------------------------------------------+ +MySQL [test]> SELECT histogram(c_float) FROM histogram_test; ++-------------------------------------------------------------------------------------------------------------------------------------+ +| histogram(`c_float`) | ++-------------------------------------------------------------------------------------------------------------------------------------+ +| {"sample_rate":0.2,"max_bucket_num":128,"bucket_num":3,"buckets":[{"lower":"0.1","upper":"0.1","count":1,"pre_sum":0,"ndv":1},...]} | ++-------------------------------------------------------------------------------------------------------------------------------------+ + +MySQL [test]> SELECT histogram(c_string, 0.5, 2) FROM histogram_test; ++-------------------------------------------------------------------------------------------------------------------------------------+ +| histogram(`c_string`) | ++-------------------------------------------------------------------------------------------------------------------------------------+ +| {"sample_rate":0.5,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"str1","upper":"str7","count":4,"pre_sum":0,"ndv":3},...]} | 
++-------------------------------------------------------------------------------------------------------------------------------------+ ``` + Query result description: ``` { - "bucket_size": 5, + "sample_rate": 0.2, + "max_bucket_num": 128, + "bucket_num": 3, "buckets": [ { - "lower": "2022-09-21 17:30:29", - "upper": "2022-09-21 22:30:29", - "count": 9, + "lower": "0.1", + "upper": "0.2", + "count": 2, "pre_sum": 0, - "ndv": 1 + "ndv": 2 }, { - "lower": "2022-09-22 17:30:29", - "upper": "2022-09-22 22:30:29", - "count": 10, - "pre_sum": 9, - "ndv": 1 + "lower": "0.8", + "upper": "0.9", + "count": 2, + "pre_sum": 2, + "ndv": 2 }, { - "lower": "2022-09-23 17:30:29", - "upper": "2022-09-23 22:30:29", - "count": 9, - "pre_sum": 19, - "ndv": 1 - }, - { - "lower": "2022-09-24 17:30:29", - "upper": "2022-09-24 22:30:29", - "count": 9, - "pre_sum": 28, - "ndv": 1 - }, - { - "lower": "2022-09-25 17:30:29", - "upper": "2022-09-25 22:30:29", - "count": 9, - "pre_sum": 37, + "lower": "1.0", + "upper": "1.0", + "count": 2, + "pre_sum": 4, "ndv": 1 } ] } ``` +Field description: +- sample_rate:Rate of sampling +- max_bucket_num:Limit the maximum number of buckets +- bucket_num:The actual number of buckets +- buckets:All buckets + - lower:Lower bound of the bucket + - upper:Upper bound of the bucket + - count:The number of elements contained in the bucket + - pre_sum:The total number of elements in all preceding buckets + - ndv:The number of different values in the bucket + +> Total number of histogram elements = number of elements in the last bucket (count) + total number of elements in all preceding buckets (pre_sum).
### keywords diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/aggregate-functions/histogram.md b/docs/zh-CN/docs/sql-manual/sql-functions/aggregate-functions/histogram.md index dbd1cd8445..55085ee26d 100644 --- a/docs/zh-CN/docs/sql-manual/sql-functions/aggregate-functions/histogram.md +++ b/docs/zh-CN/docs/sql-manual/sql-functions/aggregate-functions/histogram.md @@ -28,11 +28,15 @@ under the License. ### description #### Syntax -仅支持向量 +`histogram(expr[, DOUBLE sample_rate, INT max_bucket_num])` -`histogram(expr)` +histogram(直方图)函数用于描述数据分布情况,它使用“等高”的分桶策略,并按照数据的值大小进行分桶,并用一些简单的数据来描述每个桶,比如落在桶里的值的个数。主要用于优化器进行区间查询的估算。 -histogram(直方图)函数用于描述数据分布情况,它使用“等高”的分桶策略,并按照数据的值大小进行分桶,并用一些简单的数据来描述每个桶,比如落在桶里的值的个数。主要用于优化器进行区间查询的估算。 +函数结果返回空或者 Json 字符串。 + +参数说明: +- sample_rate:可选项。用于生成直方图的抽样数据比例,默认值 0.2。 +- max_bucket_num:可选项。用于限制直方图桶(bucket)的数量,默认值 128。 ### notice @@ -43,58 +47,67 @@ histogram(直方图)函数用于描述数据分布情况,它使用“等高” ### example ``` -MySQL [test]> select histogram(login_time) from dev_table; -+------------------------------------------------------------------------------------------------------------------------------+ -| histogram(`login_time`) | -+------------------------------------------------------------------------------------------------------------------------------+ -| {"bucket_size":5,"buckets":[{"lower":"2022-09-21 17:30:29","upper":"2022-09-21 22:30:29","count":9,"pre_sum":0,"ndv":1},...]}| -+------------------------------------------------------------------------------------------------------------------------------+ +MySQL [test]> SELECT histogram(c_float) FROM histogram_test; ++-------------------------------------------------------------------------------------------------------------------------------------+ +| histogram(`c_float`) | ++-------------------------------------------------------------------------------------------------------------------------------------+ +| 
{"sample_rate":0.2,"max_bucket_num":128,"bucket_num":3,"buckets":[{"lower":"0.1","upper":"0.1","count":1,"pre_sum":0,"ndv":1},...]} | ++-------------------------------------------------------------------------------------------------------------------------------------+ + +MySQL [test]> SELECT histogram(c_string, 0.5, 2) FROM histogram_test; ++-------------------------------------------------------------------------------------------------------------------------------------+ +| histogram(`c_string`) | ++-------------------------------------------------------------------------------------------------------------------------------------+ +| {"sample_rate":0.5,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"str1","upper":"str7","count":4,"pre_sum":0,"ndv":3},...]} | ++-------------------------------------------------------------------------------------------------------------------------------------+ ``` + 查询结果说明: ``` { - "bucket_size": 5, + "sample_rate": 0.2, + "max_bucket_num": 128, + "bucket_num": 3, "buckets": [ { - "lower": "2022-09-21 17:30:29", - "upper": "2022-09-21 22:30:29", - "count": 9, + "lower": "0.1", + "upper": "0.2", + "count": 2, "pre_sum": 0, - "ndv": 1 + "ndv": 2 }, { - "lower": "2022-09-22 17:30:29", - "upper": "2022-09-22 22:30:29", - "count": 10, - "pre_sum": 9, - "ndv": 1 + "lower": "0.8", + "upper": "0.9", + "count": 2, + "pre_sum": 2, + "ndv": 2 }, { - "lower": "2022-09-23 17:30:29", - "upper": "2022-09-23 22:30:29", - "count": 9, - "pre_sum": 19, - "ndv": 1 - }, - { - "lower": "2022-09-24 17:30:29", - "upper": "2022-09-24 22:30:29", - "count": 9, - "pre_sum": 28, - "ndv": 1 - }, - { - "lower": "2022-09-25 17:30:29", - "upper": "2022-09-25 22:30:29", - "count": 9, - "pre_sum": 37, + "lower": "1.0", + "upper": "1.0", + "count": 2, + "pre_sum": 4, "ndv": 1 } ] } ``` +字段说明: +- sample_rate:抽样数据比例 +- max_bucket_num:用户限制的最大桶数量 +- bucket_num:实际的桶数量 +- buckets:直方图所包含的桶 + - lower:桶的下界 + - upper:桶的上界 + - count:桶内包含的元素数量 + - pre_sum:前面桶的元素总量 +
- ndv:桶内不同值的个数 + +> 直方图总的元素数量 = 最后一个桶的元素数量(count)+ 前面桶的元素总量(pre_sum)。 + ### keywords HISTOGRAM diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionSet.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionSet.java index 1d93829d83..1b437e3e92 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionSet.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionSet.java @@ -2609,6 +2609,8 @@ public class FunctionSet { new ArrayType(t), t, "", "", "", "", "", true, false, true, true)); addBuiltin(AggregateFunction.createBuiltin(HISTOGRAM, Lists.newArrayList(t), Type.VARCHAR, t, + "", "", "", "", "", true, false, true, true)); + addBuiltin(AggregateFunction.createBuiltin(HISTOGRAM, Lists.newArrayList(t, Type.DOUBLE, Type.INT), Type.VARCHAR, t, "", "", "", "", "", true, false, true, true)); } diff --git a/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_histogram.out b/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_histogram.out new file mode 100644 index 0000000000..b7a75da277 --- /dev/null +++ b/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_histogram.out @@ -0,0 +1,14 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select -- +{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"0","upper":"1","count":6,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"10","upper":"31","count":6,"pre_sum":0,"ndv":4}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"12","upper":"32","count":6,"pre_sum":0,"ndv":5}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"944444444444","upper":"4444444444444","count":6,"pre_sum":0,"ndv":5}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"255555555","upper":"55555555555","count":6,"pre_sum":0,"ndv":5}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"0.1","upper":"9.1","count":6,"pre_sum":0,"ndv":5}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"0.222","upper":"9.222","count":6,"pre_sum":0,"ndv":5}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"3333.330000000","upper":"93333.330000000","count":6,"pre_sum":0,"ndv":5}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"14444.44","upper":"94444.44","count":6,"pre_sum":0,"ndv":5}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"c","upper":"p","count":6,"pre_sum":0,"ndv":5}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"varchar1","upper":"varchar9","count":6,"pre_sum":0,"ndv":4}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"string1","upper":"string9","count":6,"pre_sum":0,"ndv":4}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2022-12-01","upper":"2022-12-09","count":6,"pre_sum":0,"ndv":4}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2022-12-01","upper":"2022-12-09","count":6,"pre_sum":0,"ndv":4}]} 
{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2022-12-01 22:23:23","upper":"2022-12-09 22:23:23","count":6,"pre_sum":0,"ndv":4}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2022-12-01 22:23:24.999999","upper":"2022-12-09 22:23:24.999999","count":6,"pre_sum":0,"ndv":4}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"not null","upper":"not null","count":12,"pre_sum":0,"ndv":1}]} + +-- !select -- +1 {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"0","upper":"1","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"10","upper":"11","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"12","upper":"21","count":3,"pre_sum":0,"ndv":3}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"1444444444444","upper":"4444444444444","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"1555555555","upper":"55555555555","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"0.1","upper":"1.1","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"0.222","upper":"1.222","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"3333.330000000","upper":"13333.330000000","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"14444.44","upper":"4444.44","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"c","upper":"d","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"varchar1","upper":"varchar2","count":3,"pre_sum":0,"ndv":2}]} 
{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"string1","upper":"string2","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2022-12-01","upper":"2022-12-02","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2022-12-01","upper":"2022-12-02","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2022-12-01 22:23:23","upper":"2022-12-02 22:23:23","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2022-12-01 22:23:24.999999","upper":"2022-12-02 22:23:24.999999","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"not null","upper":"not null","count":6,"pre_sum":0,"ndv":1}]} +2 {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"0","upper":"1","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"10","upper":"31","count":3,"pre_sum":0,"ndv":3}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"20","upper":"32","count":3,"pre_sum":0,"ndv":3}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"944444444444","upper":"3444444444444","count":3,"pre_sum":0,"ndv":3}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"255555555","upper":"9555555555","count":3,"pre_sum":0,"ndv":3}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2.1","upper":"9.1","count":3,"pre_sum":0,"ndv":3}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2.222","upper":"9.222","count":3,"pre_sum":0,"ndv":3}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"23333.330000000","upper":"93333.330000000","count":3,"pre_sum":0,"ndv":3}]} 
{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"24444.44","upper":"94444.44","count":3,"pre_sum":0,"ndv":3}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"f","upper":"p","count":3,"pre_sum":0,"ndv":3}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"varchar3","upper":"varchar9","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"string3","upper":"string9","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2022-12-03","upper":"2022-12-09","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2022-12-03","upper":"2022-12-09","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2022-12-03 22:23:23","upper":"2022-12-09 22:23:23","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2022-12-03 22:23:24.999999","upper":"2022-12-09 22:23:24.999999","count":3,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"not null","upper":"not null","count":6,"pre_sum":0,"ndv":1}]} + +-- !select -- +1 {"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"0","upper":"0","count":4,"pre_sum":0,"ndv":1},{"lower":"1","upper":"1","count":2,"pre_sum":4,"ndv":1}]} {"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"10","upper":"11","count":4,"pre_sum":0,"ndv":2},{"lower":"21","upper":"31","count":2,"pre_sum":4,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"12","upper":"20","count":3,"pre_sum":0,"ndv":2},{"lower":"21","upper":"32","count":3,"pre_sum":3,"ndv":3}]} 
{"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"944444444444","upper":"2444444444444","count":3,"pre_sum":0,"ndv":3},{"lower":"3444444444444","upper":"4444444444444","count":3,"pre_sum":3,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"255555555","upper":"3555555555","count":3,"pre_sum":0,"ndv":3},{"lower":"9555555555","upper":"55555555555","count":3,"pre_sum":3,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"0.1","upper":"1.1","count":3,"pre_sum":0,"ndv":2},{"lower":"2.1","upper":"9.1","count":3,"pre_sum":3,"ndv":3}]} {"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"0.222","upper":"1.222","count":3,"pre_sum":0,"ndv":2},{"lower":"2.222","upper":"9.222","count":3,"pre_sum":3,"ndv":3}]} {"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"3333.330000000","upper":"13333.330000000","count":3,"pre_sum":0,"ndv":2},{"lower":"23333.330000000","upper":"93333.330000000","count":3,"pre_sum":3,"ndv":3}]} {"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"14444.44","upper":"34444.44","count":3,"pre_sum":0,"ndv":3},{"lower":"4444.44","upper":"94444.44","count":3,"pre_sum":3,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"c","upper":"d","count":3,"pre_sum":0,"ndv":2},{"lower":"f","upper":"p","count":3,"pre_sum":3,"ndv":3}]} {"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"varchar1","upper":"varchar2","count":3,"pre_sum":0,"ndv":2},{"lower":"varchar3","upper":"varchar9","count":3,"pre_sum":3,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"string1","upper":"string2","count":3,"pre_sum":0,"ndv":2},{"lower":"string3","upper":"string9","count":3,"pre_sum":3,"ndv":2}]} 
{"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"2022-12-01","upper":"2022-12-02","count":3,"pre_sum":0,"ndv":2},{"lower":"2022-12-03","upper":"2022-12-09","count":3,"pre_sum":3,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"2022-12-01","upper":"2022-12-02","count":3,"pre_sum":0,"ndv":2},{"lower":"2022-12-03","upper":"2022-12-09","count":3,"pre_sum":3,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"2022-12-01 22:23:23","upper":"2022-12-02 22:23:23","count":3,"pre_sum":0,"ndv":2},{"lower":"2022-12-03 22:23:23","upper":"2022-12-09 22:23:23","count":3,"pre_sum":3,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"2022-12-01 22:23:24.999999","upper":"2022-12-02 22:23:24.999999","count":3,"pre_sum":0,"ndv":2},{"lower":"2022-12-03 22:23:24.999999","upper":"2022-12-09 22:23:24.999999","count":3,"pre_sum":3,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":2,"bucket_num":1,"buckets":[{"lower":"not null","upper":"not null","count":12,"pre_sum":0,"ndv":1}]} + +-- !select -- +1 {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"0","upper":"1","count":6,"pre_sum":0,"ndv":2}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"10","upper":"31","count":6,"pre_sum":0,"ndv":4}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"12","upper":"32","count":6,"pre_sum":0,"ndv":5}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"944444444444","upper":"4444444444444","count":6,"pre_sum":0,"ndv":5}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"255555555","upper":"55555555555","count":6,"pre_sum":0,"ndv":5}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"0.1","upper":"9.1","count":6,"pre_sum":0,"ndv":5}]} 
{"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"0.222","upper":"9.222","count":6,"pre_sum":0,"ndv":5}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"3333.330000000","upper":"93333.330000000","count":6,"pre_sum":0,"ndv":5}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"14444.44","upper":"94444.44","count":6,"pre_sum":0,"ndv":5}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"c","upper":"p","count":6,"pre_sum":0,"ndv":5}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"varchar1","upper":"varchar9","count":6,"pre_sum":0,"ndv":4}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"string1","upper":"string9","count":6,"pre_sum":0,"ndv":4}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2022-12-01","upper":"2022-12-09","count":6,"pre_sum":0,"ndv":4}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2022-12-01","upper":"2022-12-09","count":6,"pre_sum":0,"ndv":4}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2022-12-01 22:23:23","upper":"2022-12-09 22:23:23","count":6,"pre_sum":0,"ndv":4}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"2022-12-01 22:23:24.999999","upper":"2022-12-09 22:23:24.999999","count":6,"pre_sum":0,"ndv":4}]} {"sample_rate":1.0,"max_bucket_num":1,"bucket_num":1,"buckets":[{"lower":"not null","upper":"not null","count":12,"pre_sum":0,"ndv":1}]} + diff --git a/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_histogram.groovy b/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_histogram.groovy new file mode 100644 index 0000000000..dcd7ea0709 --- /dev/null +++ b/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_histogram.groovy @@ -0,0 +1,214 @@ +// Licensed to the Apache Software Foundation 
(ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_aggregate_histogram") { + sql "set enable_vectorized_engine = true" + + def tableName = "histogram_test" + def tableCTAS1 = "histogram_test_ctas1" + def tableCTAS2 = "histogram_test_ctas2" + + sql "DROP TABLE IF EXISTS ${tableName}" + sql "DROP TABLE IF EXISTS ${tableCTAS1}" + sql "DROP TABLE IF EXISTS ${tableCTAS2}" + + sql """ + CREATE TABLE IF NOT EXISTS ${tableName} ( + c_id INT, + c_bool BOOLEAN, + c_tinyint TINYINT, + c_smallint SMALLINT, + c_int INT, + c_bigint BIGINT, + c_largeint LARGEINT, + c_float FLOAT, + c_double DOUBLE, + c_decimal DECIMAL(9, 2), + c_decimalv3 DECIMALV3(9, 2), + c_char CHAR, + c_varchar VARCHAR(10), + c_string STRING, + c_date DATE, + c_datev2 DATEV2, + c_date_time DATETIME, + c_date_timev2 DATETIMEV2(6), + c_string_not_null VARCHAR(10) NOT NULL + ) + DISTRIBUTED BY HASH(c_id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + sql """ + INSERT INTO ${tableName} values + (1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, 'not null'), + (1, false, 10, 20, 30, 4444444444444, 55555555555, 0.1, 0.222, 3333.33, 4444.44, 'c', 'varchar1', 'string1', + '2022-12-01', '2022-12-01', '2022-12-01 22:23:23', 
'2022-12-01 22:23:24.999999', 'not null') + """ + + sql """ + INSERT INTO ${tableName} values + (1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, 'not null'), + (1, false, 11, 21, 33, 4444444444444, 55555555555, 0.1, 0.222, 3333.33, 4444.44, 'c', 'varchar1', 'string1', + '2022-12-01', '2022-12-01', '2022-12-01 22:23:23', '2022-12-01 22:23:24.999999', 'not null') + """ + + sql """ + INSERT INTO ${tableName} values + (1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, 'not null'), + (1, true, 11, 12, 13, 1444444444444, 1555555555, 1.1, 1.222, 13333.33, 14444.44, 'd', 'varchar2', 'string2', + '2022-12-02', '2022-12-02', '2022-12-02 22:23:23', '2022-12-02 22:23:24.999999', 'not null') + """ + + sql """ + INSERT INTO ${tableName} values + (2, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, 'not null'), + (2, false, 21, 22, 23, 2444444444444, 255555555, 2.1, 2.222, 23333.33, 24444.44, 'f', 'varchar3', 'string3', + '2022-12-03', '2022-12-03', '2022-12-03 22:23:23', '2022-12-03 22:23:24.999999', 'not null') + """ + + sql """ + INSERT INTO ${tableName} values + (2, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, 'not null'), + (2, true, 31, 32, 33, 3444444444444, 3555555555, 3.1, 3.222, 33333.33, 34444.44, 'l', 'varchar3', 'string3', + '2022-12-03', '2022-12-03', '2022-12-03 22:23:23', '2022-12-03 22:23:24.999999', 'not null') + """ + + sql """ + INSERT INTO ${tableName} values + (2, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, 'not null'), + (2, false, 10, 20, 30, 944444444444, 9555555555, 9.1, 9.222, 93333.33, 94444.44, 'p', 'varchar9', 'string9', + '2022-12-09', '2022-12-09', '2022-12-09 22:23:23', '2022-12-09 22:23:24.999999', 'not null') + """ + + // Test without GROUP BY + qt_select """ + 
SELECT + histogram(c_bool, 1.0, 1), + histogram(c_tinyint, 1.0, 1), + histogram(c_smallint, 1.0, 1), + histogram(c_bigint, 1.0, 1), + histogram(c_largeint, 1.0, 1), + histogram(c_float, 1.0, 1), + histogram(c_double, 1.0, 1), + histogram(c_decimal, 1.0, 1), + histogram(c_decimalv3, 1.0, 1), + histogram(c_char, 1.0, 1), + histogram(c_varchar, 1.0, 1), + histogram(c_string, 1.0, 1), + histogram(c_date, 1.0, 1), + histogram(c_datev2, 1.0, 1), + histogram(c_date_time, 1.0, 1), + histogram(c_date_timev2, 1.0, 1), + histogram(c_string_not_null, 1.0, 1) + FROM + ${tableName} + """ + + // Test with GROUP BY + qt_select """ + SELECT + c_id, + histogram(c_bool, 1.0, 1), + histogram(c_tinyint, 1.0, 1), + histogram(c_smallint, 1.0, 1), + histogram(c_bigint, 1.0, 1), + histogram(c_largeint, 1.0, 1), + histogram(c_float, 1.0, 1), + histogram(c_double, 1.0, 1), + histogram(c_decimal, 1.0, 1), + histogram(c_decimalv3, 1.0, 1), + histogram(c_char, 1.0, 1), + histogram(c_varchar, 1.0, 1), + histogram(c_string, 1.0, 1), + histogram(c_date, 1.0, 1), + histogram(c_datev2, 1.0, 1), + histogram(c_date_time, 1.0, 1), + histogram(c_date_timev2, 1.0, 1), + histogram(c_string_not_null, 1.0, 1) + FROM + ${tableName} + GROUP BY + c_id + ORDER BY + c_id + """ + + sql """ + CREATE TABLE ${tableCTAS1} PROPERTIES("replication_num" = "1") AS + SELECT + 1, + histogram(c_bool, 1.0, 2), + histogram(c_tinyint, 1.0, 2), + histogram(c_smallint, 1.0, 2), + histogram(c_bigint, 1.0, 2), + histogram(c_largeint, 1.0, 2), + histogram(c_float, 1.0, 2), + histogram(c_double, 1.0, 2), + histogram(c_decimal, 1.0, 2), + histogram(c_decimalv3, 1.0, 2), + histogram(c_char, 1.0, 2), + histogram(c_varchar, 1.0, 2), + histogram(c_string, 1.0, 2), + histogram(c_date, 1.0, 2), + histogram(c_datev2, 1.0, 2), + histogram(c_date_time, 1.0, 2), + histogram(c_date_timev2, 1.0, 2), + histogram(c_string_not_null, 1.0, 2) + FROM + ${tableName} + """ + + sql """ + CREATE TABLE ${tableCTAS2} PROPERTIES("replication_num" = "1") AS + 
SELECT + 1, + histogram(c_bool, 1.0, 1), + histogram(c_tinyint, 1.0, 1), + histogram(c_smallint, 1.0, 1), + histogram(c_bigint, 1.0, 1), + histogram(c_largeint, 1.0, 1), + histogram(c_float, 1.0, 1), + histogram(c_double, 1.0, 1), + histogram(c_decimal, 1.0, 1), + histogram(c_decimalv3, 1.0, 1), + histogram(c_char, 1.0, 1), + histogram(c_varchar, 1.0, 1), + histogram(c_string, 1.0, 1), + histogram(c_date, 1.0, 1), + histogram(c_datev2, 1.0, 1), + histogram(c_date_time, 1.0, 1), + histogram(c_date_timev2, 1.0, 1), + histogram(c_string_not_null, 1.0, 1) + FROM + ${tableName} + """ + + qt_select "SELECT * from ${tableCTAS1}" + qt_select "SELECT * from ${tableCTAS2}" + + sql "DROP TABLE IF EXISTS ${tableName}" + sql "DROP TABLE IF EXISTS ${tableCTAS1}" + sql "DROP TABLE IF EXISTS ${tableCTAS2}" +}