[Feature](vec)(quantile_state): support quantile state in vectorized engine (#16562)
* [Feature](vectorized)(quantile_state): support vectorized quantile state functions 1. quantile columns currently support only non-nullable types 2. add regression test cases 3. set default enable_quantile_state_type = true --------- Co-authored-by: spaces-x <weixiang06@meituan.com>
This commit is contained in:
277
be/src/vec/functions/function_quantile_state.cpp
Normal file
277
be/src/vec/functions/function_quantile_state.cpp
Normal file
@ -0,0 +1,277 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// This file is copied from
|
||||
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/FunctionBitmap.h
|
||||
// and modified by Doris
|
||||
|
||||
#include "util/string_parser.hpp"
|
||||
#include "vec/columns/column.h"
|
||||
#include "vec/columns/column_array.h"
|
||||
#include "vec/columns/columns_number.h"
|
||||
#include "vec/data_types/data_type_number.h"
|
||||
#include "vec/data_types/data_type_quantilestate.h"
|
||||
#include "vec/functions/function_always_not_nullable.h"
|
||||
#include "vec/functions/function_const.h"
|
||||
#include "vec/functions/function_string.h"
|
||||
#include "vec/functions/simple_function_factory.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
template <typename InternalType>
|
||||
struct QuantileStateEmpty {
|
||||
static constexpr auto name = "quantile_state_empty";
|
||||
using ReturnColVec = ColumnQuantileState<InternalType>;
|
||||
static DataTypePtr get_return_type() {
|
||||
return std::make_shared<DataTypeQuantileState<InternalType>>();
|
||||
}
|
||||
static auto init_value() { return QuantileState<InternalType> {}; }
|
||||
};
|
||||
|
||||
template <typename InternalType>
|
||||
class FunctionToQuantileState : public IFunction {
|
||||
public:
|
||||
static constexpr auto name = "to_quantile_state";
|
||||
String get_name() const override { return name; }
|
||||
|
||||
static FunctionPtr create() {
|
||||
return std::make_shared<FunctionToQuantileState<InternalType>>();
|
||||
}
|
||||
|
||||
DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
|
||||
return std::make_shared<DataTypeQuantileState<InternalType>>();
|
||||
}
|
||||
|
||||
size_t get_number_of_arguments() const override { return 2; }
|
||||
|
||||
bool use_default_implementation_for_nulls() const override { return false; }
|
||||
|
||||
bool use_default_implementation_for_constants() const override { return true; }
|
||||
|
||||
template <typename ColumnType, bool is_nullable>
|
||||
Status execute_internal(const ColumnPtr& column, const DataTypePtr& data_type,
|
||||
MutableColumnPtr& column_result) {
|
||||
auto type_error = [&]() {
|
||||
return Status::RuntimeError("Illegal column {} of argument of function {}",
|
||||
column->get_name(), get_name());
|
||||
};
|
||||
const ColumnNullable* col_nullable = nullptr;
|
||||
const ColumnUInt8* col_nullmap = nullptr;
|
||||
const ColumnType* col = nullptr;
|
||||
const NullMap* nullmap = nullptr;
|
||||
if constexpr (is_nullable) {
|
||||
col_nullable = check_and_get_column<ColumnNullable>(column.get());
|
||||
col_nullmap = check_and_get_column<ColumnUInt8>(
|
||||
col_nullable->get_null_map_column_ptr().get());
|
||||
col = check_and_get_column<ColumnType>(col_nullable->get_nested_column_ptr().get());
|
||||
if (col == nullptr || col_nullmap == nullptr) {
|
||||
return type_error();
|
||||
}
|
||||
|
||||
nullmap = &col_nullmap->get_data();
|
||||
} else {
|
||||
col = check_and_get_column<ColumnType>(column.get());
|
||||
}
|
||||
auto* res_column =
|
||||
reinterpret_cast<ColumnQuantileState<InternalType>*>(column_result.get());
|
||||
auto& res_data = res_column->get_data();
|
||||
|
||||
size_t size = col->size();
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
if constexpr (is_nullable) {
|
||||
if ((*nullmap)[i]) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<ColumnType, ColumnString>) {
|
||||
const ColumnString::Chars& data = col->get_chars();
|
||||
const ColumnString::Offsets& offsets = col->get_offsets();
|
||||
|
||||
const char* raw_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]);
|
||||
size_t str_size = offsets[i] - offsets[i - 1];
|
||||
StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
|
||||
InternalType value = StringParser::string_to_float<InternalType>(raw_str, str_size,
|
||||
&parse_result);
|
||||
if (LIKELY(parse_result == StringParser::PARSE_SUCCESS)) {
|
||||
res_data[i].add_value(value);
|
||||
} else {
|
||||
std::stringstream ss;
|
||||
ss << "The input column content: " << std::string(raw_str, str_size)
|
||||
<< " is not valid in function: " << get_name();
|
||||
LOG(WARNING) << ss.str();
|
||||
return Status::InternalError(ss.str());
|
||||
}
|
||||
} else if constexpr (std::is_same_v<ColumnType, ColumnInt64> ||
|
||||
std::is_same_v<ColumnType, ColumnFloat32> ||
|
||||
std::is_same_v<ColumnType, ColumnFloat64>) {
|
||||
// InternalType only can be double or float, so we can cast directly
|
||||
InternalType value = (InternalType)col->get_data()[i];
|
||||
res_data[i].set_compression(compression);
|
||||
res_data[i].add_value(value);
|
||||
} else {
|
||||
type_error();
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
|
||||
size_t result, size_t input_rows_count) override {
|
||||
if constexpr (!(std::is_same_v<InternalType, float> ||
|
||||
std::is_same_v<InternalType, double>)) {
|
||||
std::stringstream ss;
|
||||
ss << "The InternalType of quantile_state must be float or double";
|
||||
|
||||
return Status::InternalError(ss.str());
|
||||
}
|
||||
|
||||
const ColumnPtr& column = block.get_by_position(arguments[0]).column;
|
||||
const DataTypePtr& data_type = block.get_by_position(arguments[0]).type;
|
||||
auto compression_arg = check_and_get_column_const<ColumnFloat32>(
|
||||
block.get_by_position(arguments.back()).column);
|
||||
if (compression_arg) {
|
||||
auto compression_arg_val = compression_arg->get_value<Float32>();
|
||||
if (compression_arg_val && compression_arg_val >= QUANTILE_STATE_COMPRESSION_MIN &&
|
||||
compression_arg_val <= QUANTILE_STATE_COMPRESSION_MAX) {
|
||||
this->compression = compression_arg_val;
|
||||
}
|
||||
}
|
||||
WhichDataType which(data_type);
|
||||
MutableColumnPtr column_result = get_return_type_impl({})->create_column();
|
||||
column_result->resize(input_rows_count);
|
||||
|
||||
auto type_error = [&]() {
|
||||
return Status::RuntimeError("Illegal column {} of argument of function {}",
|
||||
block.get_by_position(arguments[0]).column->get_name(),
|
||||
get_name());
|
||||
};
|
||||
Status status = Status::OK();
|
||||
if (which.is_nullable()) {
|
||||
const DataTypePtr& nested_data_type =
|
||||
static_cast<const DataTypeNullable*>(data_type.get())->get_nested_type();
|
||||
WhichDataType nested_which(nested_data_type);
|
||||
if (nested_which.is_string_or_fixed_string()) {
|
||||
status = execute_internal<ColumnString, true>(column, data_type, column_result);
|
||||
} else if (nested_which.is_int64()) {
|
||||
status = execute_internal<ColumnInt64, true>(column, data_type, column_result);
|
||||
} else if (which.is_float32()) {
|
||||
status = execute_internal<ColumnFloat32, true>(column, data_type, column_result);
|
||||
} else if (which.is_float64()) {
|
||||
status = execute_internal<ColumnFloat64, true>(column, data_type, column_result);
|
||||
} else {
|
||||
return type_error();
|
||||
}
|
||||
} else {
|
||||
if (which.is_string_or_fixed_string()) {
|
||||
status = execute_internal<ColumnString, false>(column, data_type, column_result);
|
||||
} else if (which.is_int64()) {
|
||||
status = execute_internal<ColumnInt64, false>(column, data_type, column_result);
|
||||
} else if (which.is_float32()) {
|
||||
status = execute_internal<ColumnFloat32, false>(column, data_type, column_result);
|
||||
} else if (which.is_float64()) {
|
||||
status = execute_internal<ColumnFloat64, false>(column, data_type, column_result);
|
||||
} else {
|
||||
return type_error();
|
||||
}
|
||||
}
|
||||
if (status.ok()) {
|
||||
block.replace_by_position(result, std::move(column_result));
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
private:
|
||||
float compression = 2048;
|
||||
};
|
||||
|
||||
template <typename InternalType>
|
||||
class FunctionQuantileStatePercent : public IFunction {
|
||||
public:
|
||||
static constexpr auto name = "quantile_percent";
|
||||
String get_name() const override { return name; }
|
||||
|
||||
static FunctionPtr create() {
|
||||
return std::make_shared<FunctionQuantileStatePercent<InternalType>>();
|
||||
}
|
||||
|
||||
DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
|
||||
return std::make_shared<DataTypeFloat64>();
|
||||
}
|
||||
|
||||
size_t get_number_of_arguments() const override { return 2; }
|
||||
|
||||
bool use_default_implementation_for_nulls() const override { return false; }
|
||||
|
||||
bool use_default_implementation_for_constants() const override { return true; }
|
||||
|
||||
Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
|
||||
size_t result, size_t input_rows_count) override {
|
||||
auto res_data_column = ColumnFloat64::create();
|
||||
auto& res = res_data_column->get_data();
|
||||
auto data_null_map = ColumnUInt8::create(input_rows_count, 0);
|
||||
auto& null_map = data_null_map->get_data();
|
||||
|
||||
auto column = block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
|
||||
if (auto* nullable = check_and_get_column<const ColumnNullable>(*column)) {
|
||||
VectorizedUtils::update_null_map(null_map, nullable->get_null_map_data());
|
||||
column = nullable->get_nested_column_ptr();
|
||||
}
|
||||
auto str_col = assert_cast<const ColumnQuantileState<InternalType>*>(column.get());
|
||||
auto& col_data = str_col->get_data();
|
||||
auto percent_arg = check_and_get_column_const<ColumnFloat32>(
|
||||
block.get_by_position(arguments.back()).column);
|
||||
|
||||
if (!percent_arg) {
|
||||
LOG(FATAL) << fmt::format(
|
||||
"Second argument to {} must be a constant string describing type", get_name());
|
||||
}
|
||||
float percent_arg_value = percent_arg->get_value<Float32>();
|
||||
if (percent_arg_value < 0 || percent_arg_value > 1) {
|
||||
std::stringstream ss;
|
||||
ss << "the input argument of percentage: " << percent_arg_value
|
||||
<< " is not valid, must be in range [0,1] ";
|
||||
LOG(WARNING) << ss.str();
|
||||
return Status::InternalError(ss.str());
|
||||
}
|
||||
|
||||
res.reserve(input_rows_count);
|
||||
for (size_t i = 0; i < input_rows_count; ++i) {
|
||||
if (null_map[i]) {
|
||||
// if null push_back meaningless result to make sure idxs can be matched
|
||||
res.push_back(0);
|
||||
continue;
|
||||
}
|
||||
|
||||
res.push_back(col_data[i].get_value_by_percentile(percent_arg_value));
|
||||
}
|
||||
|
||||
block.replace_by_position(result, std::move(res_data_column));
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
|
||||
using FunctionQuantileStateEmpty = FunctionConst<QuantileStateEmpty<double>, false>;
|
||||
using FunctionQuantileStatePercentDouble = FunctionQuantileStatePercent<double>;
|
||||
using FunctionToQuantileStateDouble = FunctionToQuantileState<double>;
|
||||
|
||||
void register_function_quantile_state(SimpleFunctionFactory& factory) {
|
||||
factory.register_function<FunctionQuantileStateEmpty>();
|
||||
factory.register_function<FunctionQuantileStatePercentDouble>();
|
||||
factory.register_function<FunctionToQuantileStateDouble>();
|
||||
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
Reference in New Issue
Block a user