From 3103bb08dcde269b5388a98fea3ea41fcd7f2890 Mon Sep 17 00:00:00 2001 From: lihangyu <15605149486@163.com> Date: Fri, 23 Aug 2024 22:47:32 +0800 Subject: [PATCH] [pick](Variant) casting to decimal type may lose precision (#39843) #39650 --- be/src/olap/iterators.h | 2 +- be/src/olap/rowset/rowset_reader_context.h | 2 +- .../rowset/segment_v2/segment_iterator.cpp | 2 +- be/src/olap/tablet_reader.cpp | 3 +- be/src/olap/tablet_reader.h | 2 +- be/src/pipeline/exec/scan_operator.cpp | 38 +++++++++---------- be/src/pipeline/exec/scan_operator.h | 5 ++- be/src/vec/exec/scan/new_olap_scan_node.cpp | 4 +- be/src/vec/exec/scan/new_olap_scan_node.h | 2 +- be/src/vec/exec/scan/vscan_node.cpp | 16 ++++---- be/src/vec/exec/scan/vscan_node.h | 3 +- .../data/variant_p0/sql/implicit_cast.out | 12 ++++++ .../suites/variant_p0/sql/implicit_cast.sql | 4 +- 13 files changed, 56 insertions(+), 39 deletions(-) diff --git a/be/src/olap/iterators.h b/be/src/olap/iterators.h index 5d752a2bf7..b29c63a2c9 100644 --- a/be/src/olap/iterators.h +++ b/be/src/olap/iterators.h @@ -118,7 +118,7 @@ public: Version version; int64_t tablet_id = 0; // slots that cast may be eliminated in storage layer - std::map target_cast_type_for_variants; + std::map target_cast_type_for_variants; RowRanges row_ranges; size_t topn_limit = 0; }; diff --git a/be/src/olap/rowset/rowset_reader_context.h b/be/src/olap/rowset/rowset_reader_context.h index 6029196c9b..8cc7a281c8 100644 --- a/be/src/olap/rowset/rowset_reader_context.h +++ b/be/src/olap/rowset/rowset_reader_context.h @@ -82,7 +82,7 @@ struct RowsetReaderContext { const std::set* output_columns = nullptr; RowsetId rowset_id; // slots that cast may be eliminated in storage layer - std::map target_cast_type_for_variants; + std::map target_cast_type_for_variants; size_t topn_limit = 0; }; diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 8d647aab39..c9eec10e0c 100644 --- 
a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -1816,7 +1816,7 @@ bool SegmentIterator::_can_evaluated_by_vectorized(ColumnPredicate* predicate) { if (field_type == FieldType::OLAP_FIELD_TYPE_VARIANT) { // Use variant cast dst type field_type = TabletColumn::get_field_type_by_type( - _opts.target_cast_type_for_variants[_schema->column(cid)->name()]); + _opts.target_cast_type_for_variants[_schema->column(cid)->name()].type); } switch (predicate->type()) { case PredicateType::EQ: diff --git a/be/src/olap/tablet_reader.cpp b/be/src/olap/tablet_reader.cpp index 5af4c0d4fc..6b5d12118f 100644 --- a/be/src/olap/tablet_reader.cpp +++ b/be/src/olap/tablet_reader.cpp @@ -272,7 +272,8 @@ TabletColumn TabletReader::materialize_column(const TabletColumn& orig) { } TabletColumn column_with_cast_type = orig; auto cast_type = _reader_context.target_cast_type_for_variants.at(orig.name()); - column_with_cast_type.set_type(TabletColumn::get_field_type_by_type(cast_type)); + FieldType filed_type = TabletColumn::get_field_type_by_type(cast_type.type); + column_with_cast_type.set_type(filed_type); return column_with_cast_type; } diff --git a/be/src/olap/tablet_reader.h b/be/src/olap/tablet_reader.h index 942c61f820..8b99a8f886 100644 --- a/be/src/olap/tablet_reader.h +++ b/be/src/olap/tablet_reader.h @@ -136,7 +136,7 @@ public: std::vector function_filters; std::vector delete_predicates; // slots that cast may be eliminated in storage layer - std::map target_cast_type_for_variants; + std::map target_cast_type_for_variants; std::vector rs_splits; // For unique key table with merge-on-write diff --git a/be/src/pipeline/exec/scan_operator.cpp b/be/src/pipeline/exec/scan_operator.cpp index feb10a618d..d17a3aa701 100644 --- a/be/src/pipeline/exec/scan_operator.cpp +++ b/be/src/pipeline/exec/scan_operator.cpp @@ -28,6 +28,7 @@ #include "pipeline/exec/meta_scan_operator.h" #include "pipeline/exec/olap_scan_operator.h" #include 
"pipeline/exec/operator.h" +#include "runtime/types.h" #include "util/runtime_profile.h" #include "vec/exec/runtime_filter_consumer.h" #include "vec/exec/scan/pip_scanner_context.h" @@ -168,14 +169,14 @@ Status ScanLocalState::_normalize_conjuncts() { // The conjuncts is always on output tuple, so use _output_tuple_desc; std::vector slots = p._output_tuple_desc->slots(); - auto init_value_range = [&](SlotDescriptor* slot, PrimitiveType type) { - switch (type) { -#define M(NAME) \ - case TYPE_##NAME: { \ - ColumnValueRange range(slot->col_name(), slot->is_nullable(), \ - slot->type().precision, slot->type().scale); \ - _slot_id_to_value_range[slot->id()] = std::pair {slot, range}; \ - break; \ + auto init_value_range = [&](SlotDescriptor* slot, const TypeDescriptor& type_desc) { + switch (type_desc.type) { +#define M(NAME) \ + case TYPE_##NAME: { \ + ColumnValueRange range(slot->col_name(), slot->is_nullable(), \ + type_desc.precision, type_desc.scale); \ + _slot_id_to_value_range[slot->id()] = std::pair {slot, range}; \ + break; \ } #define APPLY_FOR_PRIMITIVE_TYPE(M) \ M(TINYINT) \ @@ -219,7 +220,7 @@ Status ScanLocalState::_normalize_conjuncts() { continue; } } - init_value_range(slots[slot_idx], slots[slot_idx]->type().type); + init_value_range(slots[slot_idx], slots[slot_idx]->type()); } get_cast_types_for_variants(); @@ -631,7 +632,7 @@ Status ScanLocalState::_normalize_in_and_eq_predicate( vectorized::VExpr* expr, vectorized::VExprContext* expr_ctx, SlotDescriptor* slot, ColumnValueRange& range, vectorized::VScanNode::PushDownType* pdt) { auto temp_range = ColumnValueRange::create_empty_column_value_range( - slot->is_nullable(), slot->type().precision, slot->type().scale); + slot->is_nullable(), range.precision(), range.scale()); // 1. 
Normalize in conjuncts like 'where col in (v1, v2, v3)' if (TExprNodeType::IN_PRED == expr->node_type()) { HybridSetBase::IteratorBase* iter = nullptr; @@ -787,7 +788,7 @@ Status ScanLocalState::_normalize_not_in_and_not_eq_predicate( ColumnValueRange& range, vectorized::VScanNode::PushDownType* pdt) { bool is_fixed_range = range.is_fixed_value_range(); auto not_in_range = ColumnValueRange::create_empty_column_value_range( - range.column_name(), slot->is_nullable(), slot->type().precision, slot->type().scale); + range.column_name(), slot->is_nullable(), range.precision(), range.scale()); vectorized::VScanNode::PushDownType temp_pdt = vectorized::VScanNode::PushDownType::UNACCEPTABLE; // 1. Normalize in conjuncts like 'where col in (v1, v2, v3)' @@ -969,14 +970,14 @@ Status ScanLocalState::_normalize_is_null_predicate( if (reinterpret_cast(expr)->fn().name.function_name == "is_null_pred") { auto temp_range = ColumnValueRange::create_empty_column_value_range( - slot->is_nullable(), slot->type().precision, slot->type().scale); + slot->is_nullable(), range.precision(), range.scale()); temp_range.set_contain_null(true); range.intersection(temp_range); *pdt = temp_pdt; } else if (reinterpret_cast(expr)->fn().name.function_name == "is_not_null_pred") { auto temp_range = ColumnValueRange::create_empty_column_value_range( - slot->is_nullable(), slot->type().precision, slot->type().scale); + slot->is_nullable(), range.precision(), range.scale()); temp_range.set_contain_null(false); range.intersection(temp_range); *pdt = temp_pdt; @@ -1216,7 +1217,7 @@ Status ScanLocalState::_normalize_match_predicate( // create empty range as temp range, temp range should do intersection on range auto temp_range = ColumnValueRange::create_empty_column_value_range( - slot->is_nullable(), slot->type().precision, slot->type().scale); + slot->is_nullable(), range.precision(), range.scale()); // Normalize match conjuncts like 'where col match value' auto match_checker = [](const std::string& 
fn_name) { return is_match_condition(fn_name); }; @@ -1361,7 +1362,7 @@ Status ScanLocalState::_init_profile() { template void ScanLocalState::_filter_and_collect_cast_type_for_variant( const vectorized::VExpr* expr, - phmap::flat_hash_map>& colname_to_cast_types) { + std::unordered_map>& colname_to_cast_types) { const auto* cast_expr = dynamic_cast(expr); if (cast_expr != nullptr) { const auto* src_slot = @@ -1373,10 +1374,9 @@ void ScanLocalState::_filter_and_collect_cast_type_for_variant( } std::vector slots = output_tuple_desc()->slots(); SlotDescriptor* src_slot_desc = _slot_id_to_slot_desc[src_slot->slot_id()]; - PrimitiveType cast_dst_type = - cast_expr->get_target_type()->get_type_as_type_descriptor().type; + TypeDescriptor type_desc = cast_expr->get_target_type()->get_type_as_type_descriptor(); if (src_slot_desc->type().is_variant_type()) { - colname_to_cast_types[src_slot_desc->col_name()].push_back(cast_dst_type); + colname_to_cast_types[src_slot_desc->col_name()].push_back(type_desc); } } for (const auto& child : expr->children()) { @@ -1386,7 +1386,7 @@ void ScanLocalState::_filter_and_collect_cast_type_for_variant( template void ScanLocalState::get_cast_types_for_variants() { - phmap::flat_hash_map> colname_to_cast_types; + std::unordered_map> colname_to_cast_types; for (auto it = _conjuncts.begin(); it != _conjuncts.end();) { auto& conjunct = *it; if (conjunct->root()) { diff --git a/be/src/pipeline/exec/scan_operator.h b/be/src/pipeline/exec/scan_operator.h index 4cdebeedc8..100b12d0a7 100644 --- a/be/src/pipeline/exec/scan_operator.h +++ b/be/src/pipeline/exec/scan_operator.h @@ -27,6 +27,7 @@ #include "pipeline/pipeline_x/dependency.h" #include "pipeline/pipeline_x/operator.h" #include "runtime/descriptors.h" +#include "runtime/types.h" #include "vec/exec/scan/vscan_node.h" namespace doris { @@ -329,7 +330,7 @@ protected: void get_cast_types_for_variants(); void _filter_and_collect_cast_type_for_variant( const vectorized::VExpr* expr, - 
phmap::flat_hash_map>& colname_to_cast_types); + std::unordered_map>& colname_to_cast_types); // Every time vconjunct_ctx_ptr is updated, the old ctx will be stored in this vector // so that it will be destroyed uniformly at the end of the query. @@ -344,7 +345,7 @@ protected: std::vector _push_down_functions; // colname -> cast dst type - std::map _cast_types_for_variants; + std::map _cast_types_for_variants; // slot id -> SlotDescriptor phmap::flat_hash_map _slot_id_to_slot_desc; diff --git a/be/src/vec/exec/scan/new_olap_scan_node.cpp b/be/src/vec/exec/scan/new_olap_scan_node.cpp index 63b0067dd1..c6e4363db4 100644 --- a/be/src/vec/exec/scan/new_olap_scan_node.cpp +++ b/be/src/vec/exec/scan/new_olap_scan_node.cpp @@ -423,7 +423,7 @@ std::string NewOlapScanNode::get_name() { void NewOlapScanNode::_filter_and_collect_cast_type_for_variant( const VExpr* expr, - phmap::flat_hash_map>& colname_to_cast_types) { + phmap::flat_hash_map>& colname_to_cast_types) { auto* cast_expr = dynamic_cast(expr); if (cast_expr != nullptr) { auto* src_slot = cast_expr->get_child(0)->node_type() == TExprNodeType::SLOT_REF @@ -446,7 +446,7 @@ void NewOlapScanNode::_filter_and_collect_cast_type_for_variant( } void NewOlapScanNode::get_cast_types_for_variants() { - phmap::flat_hash_map> colname_to_cast_types; + phmap::flat_hash_map> colname_to_cast_types; for (auto it = _conjuncts.begin(); it != _conjuncts.end();) { auto& conjunct = *it; if (conjunct->root()) { diff --git a/be/src/vec/exec/scan/new_olap_scan_node.h b/be/src/vec/exec/scan/new_olap_scan_node.h index 4ee2b77216..15b15ead16 100644 --- a/be/src/vec/exec/scan/new_olap_scan_node.h +++ b/be/src/vec/exec/scan/new_olap_scan_node.h @@ -106,7 +106,7 @@ protected: void get_cast_types_for_variants() override; void _filter_and_collect_cast_type_for_variant( const VExpr* expr, - phmap::flat_hash_map>& colname_to_cast_types); + phmap::flat_hash_map>& colname_to_cast_types); private: Status _build_key_ranges_and_filters(); diff --git 
a/be/src/vec/exec/scan/vscan_node.cpp b/be/src/vec/exec/scan/vscan_node.cpp index 5a64287ce2..258e225a26 100644 --- a/be/src/vec/exec/scan/vscan_node.cpp +++ b/be/src/vec/exec/scan/vscan_node.cpp @@ -342,14 +342,14 @@ Status VScanNode::_normalize_conjuncts() { // The conjuncts is always on output tuple, so use _output_tuple_desc; std::vector slots = _output_tuple_desc->slots(); - auto init_value_range = [&](SlotDescriptor* slot, PrimitiveType type) { - switch (type) { -#define M(NAME) \ - case TYPE_##NAME: { \ - ColumnValueRange range(slot->col_name(), slot->is_nullable(), \ - slot->type().precision, slot->type().scale); \ - _slot_id_to_value_range[slot->id()] = std::pair {slot, range}; \ - break; \ + auto init_value_range = [&](SlotDescriptor* slot, TypeDescriptor type) { + switch (type.type) { +#define M(NAME) \ + case TYPE_##NAME: { \ + ColumnValueRange range(slot->col_name(), slot->is_nullable(), type.precision, \ + type.scale); \ + _slot_id_to_value_range[slot->id()] = std::pair {slot, range}; \ + break; \ } #define APPLY_FOR_PRIMITIVE_TYPE(M) \ M(TINYINT) \ diff --git a/be/src/vec/exec/scan/vscan_node.h b/be/src/vec/exec/scan/vscan_node.h index 04bc738fda..ddbaac1333 100644 --- a/be/src/vec/exec/scan/vscan_node.h +++ b/be/src/vec/exec/scan/vscan_node.h @@ -41,6 +41,7 @@ #include "runtime/define_primitive_type.h" #include "runtime/query_context.h" #include "runtime/runtime_state.h" +#include "runtime/types.h" #include "util/runtime_profile.h" #include "vec/exec/runtime_filter_consumer.h" #include "vec/exec/scan/scanner_context.h" @@ -308,7 +309,7 @@ protected: std::vector _push_down_functions; // colname -> cast dst type - std::map _cast_types_for_variants; + std::map _cast_types_for_variants; // slot id -> ColumnValueRange // Parsed from conjuncts diff --git a/regression-test/data/variant_p0/sql/implicit_cast.out b/regression-test/data/variant_p0/sql/implicit_cast.out index b0f5d96087..2eefddc43e 100644 --- 
a/regression-test/data/variant_p0/sql/implicit_cast.out +++ b/regression-test/data/variant_p0/sql/implicit_cast.out @@ -78,3 +78,15 @@ user user user +-- !implicit_cast_14 -- +14690746673 +14690746676 +14690746679 +14690746680 +14690746681 +14690746684 +14690746685 +14690746687 +14690746688 +14690746689 + diff --git a/regression-test/suites/variant_p0/sql/implicit_cast.sql b/regression-test/suites/variant_p0/sql/implicit_cast.sql index 0653a52eed..f62b25ecfd 100644 --- a/regression-test/suites/variant_p0/sql/implicit_cast.sql +++ b/regression-test/suites/variant_p0/sql/implicit_cast.sql @@ -12,4 +12,6 @@ SELECT v["payload"]["member"]["id"] FROM ghdata where v["payload"]["member"]["id select k, json_extract(v, '$.repo') from ghdata WHERE v["type"] = 'WatchEvent' order by k limit 10; -- SELECT v["payload"]["member"]["id"], count() FROM ghdata where v["payload"]["member"]["id"] is not null group by v["payload"]["member"]["id"] order by 1, 2 desc LIMIT 10; select k, v["id"], v["type"], v["repo"]["name"] from ghdata WHERE v["type"] = 'WatchEvent' order by k limit 10; -SELECT v["payload"]["pusher_type"] FROM ghdata where v["payload"]["pusher_type"] is not null ORDER BY k LIMIT 10; \ No newline at end of file +SELECT v["payload"]["pusher_type"] FROM ghdata where v["payload"]["pusher_type"] is not null ORDER BY k LIMIT 10; +-- implicit cast to decimal type +SELECT v["id"] FROM ghdata where v["id"] not in (7273, 10.118626, -69352) order by cast(v["id"] as decimal) limit 10; \ No newline at end of file