From c93f3bd24ec2076f2fa48461e9b9718baa9f3a9c Mon Sep 17 00:00:00 2001 From: Gabriel Date: Fri, 26 Jul 2024 10:11:31 +0800 Subject: [PATCH] [Improvement](bloom filter) Forbid small bloom filter (#38349) (#38392) A bloom filter has an expected filter ratio when there is enough data. This PR forbids bloom filters that are too small, which have a large bias in filter ratio. pick #38349 --- be/src/exprs/bloom_filter_func.h | 20 +++++++++++++++---- be/src/exprs/runtime_filter.cpp | 3 +++ be/src/exprs/runtime_filter.h | 1 + .../org/apache/doris/qe/SessionVariable.java | 3 ++- gensrc/thrift/PaloInternalService.thrift | 4 +++- 5 files changed, 25 insertions(+), 6 deletions(-) diff --git a/be/src/exprs/bloom_filter_func.h b/be/src/exprs/bloom_filter_func.h index a831395a5e..e88f692a23 100644 --- a/be/src/exprs/bloom_filter_func.h +++ b/be/src/exprs/bloom_filter_func.h @@ -100,8 +100,12 @@ public: virtual ~BloomFilterFuncBase() = default; void init_params(const RuntimeFilterParams* params) { - _bloom_filter_length = params->bloom_filter_size; + _bloom_filter_length = + params->runtime_bloom_filter_min_size > 0 + ? std::max(params->bloom_filter_size, params->runtime_bloom_filter_min_size) + : params->bloom_filter_size; _build_bf_exactly = params->build_bf_exactly; + _runtime_bloom_filter_min_size = params->runtime_bloom_filter_min_size; _null_aware = params->null_aware; _bloom_filter_size_calculated_by_ndv = params->bloom_filter_size_calculated_by_ndv; } @@ -124,9 +128,16 @@ public: // if FE do use ndv stat to predict the bf size, BE only use the row count. FE have more // exactly row count stat. which one is min is more correctly. if (_bloom_filter_size_calculated_by_ndv) { - _bloom_filter_length = std::min(be_calculate_size, _bloom_filter_length); + _bloom_filter_length = + _runtime_bloom_filter_min_size > 0 + ? 
std::max(_runtime_bloom_filter_min_size, + std::min(be_calculate_size, _bloom_filter_length)) + : std::min(be_calculate_size, _bloom_filter_length); } else { - _bloom_filter_length = be_calculate_size; + _bloom_filter_length = + _runtime_bloom_filter_min_size > 0 + ? std::max(_runtime_bloom_filter_min_size, be_calculate_size) + : be_calculate_size; } } return init_with_fixed_length(_bloom_filter_length); @@ -221,8 +232,9 @@ protected: // bloom filter size int32_t _bloom_filter_alloced; std::shared_ptr _bloom_filter; - bool _inited {}; + bool _inited = false; int64_t _bloom_filter_length; + int64_t _runtime_bloom_filter_min_size; bool _build_bf_exactly = false; bool _bloom_filter_size_calculated_by_ndv = false; }; diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp index 1bf921b67a..0ea095e9a5 100644 --- a/be/src/exprs/runtime_filter.cpp +++ b/be/src/exprs/runtime_filter.cpp @@ -1393,6 +1393,9 @@ Status IRuntimeFilter::init_with_desc(const TRuntimeFilterDesc* desc, const TQue params.filter_type = _runtime_filter_type; params.column_return_type = build_ctx->root()->type().type; params.max_in_num = options->runtime_filter_max_in_num; + params.runtime_bloom_filter_min_size = options->__isset.runtime_bloom_filter_min_size + ? options->runtime_bloom_filter_min_size + : 0; // We build runtime filter by exact distinct count iff three conditions are met: // 1. Only 1 join key // 2. Do not have remote target (e.g. 
do not need to merge), or broadcast join diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h index 7b57a86a94..3acca8cd4e 100644 --- a/be/src/exprs/runtime_filter.h +++ b/be/src/exprs/runtime_filter.h @@ -128,6 +128,7 @@ struct RuntimeFilterParams { // used in bloom filter int64_t bloom_filter_size; int32_t max_in_num; + int64_t runtime_bloom_filter_min_size; int32_t filter_id; bool bitmap_filter_not_in; bool build_bf_exactly; diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 7c35abd345..1ab8deb36e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -1028,7 +1028,7 @@ public class SessionVariable implements Serializable, Writable { private int runtimeBloomFilterSize = 2097152; @VariableMgr.VarAttr(name = RUNTIME_BLOOM_FILTER_MIN_SIZE, needForward = true) - private int runtimeBloomFilterMinSize = 2048; + private int runtimeBloomFilterMinSize = 1048576; @VariableMgr.VarAttr(name = RUNTIME_BLOOM_FILTER_MAX_SIZE, needForward = true) private int runtimeBloomFilterMaxSize = 16777216; @@ -3335,6 +3335,7 @@ public class SessionVariable implements Serializable, Writable { tResult.setRuntimeFilterWaitTimeMs(runtimeFilterWaitTimeMs); tResult.setRuntimeFilterMaxInNum(runtimeFilterMaxInNum); + tResult.setRuntimeBloomFilterMinSize(runtimeBloomFilterMinSize); tResult.setRuntimeFilterWaitInfinitely(runtimeFilterWaitInfinitely); if (cpuResourceLimit > 0) { diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index 6b0df79f14..995385ddc9 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -306,7 +306,9 @@ struct TQueryOptions { 118: optional TSerdeDialect serde_dialect = TSerdeDialect.DORIS; - 119: optional bool keep_carriage_return = false; // \n,\r\n split line 
in CSV. + 119: optional bool keep_carriage_return = false; // \n,\r\n split line in CSV. + + 122: optional i32 runtime_bloom_filter_min_size = 1048576; // For cloud, to control if the content would be written into file cache 1000: optional bool disable_file_cache = false }