From 77c15729d422ecb5332f6a6ef60016e24a94f96f Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Thu, 22 Dec 2022 17:16:18 +0800 Subject: [PATCH] [fix](memory) Fix too many repeat cause OOM (#15217) --- be/src/runtime/runtime_state.h | 11 +++++++++++ be/src/vec/functions/function_string.h | 11 +++++++---- be/test/vec/function/function_string_test.cpp | 15 ++++++++------- .../java/org/apache/doris/qe/SessionVariable.java | 7 +++++++ gensrc/thrift/PaloInternalService.thrift | 2 ++ .../datatype_p0/string/test_string_basic.groovy | 2 ++ 6 files changed, 37 insertions(+), 11 deletions(-) diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index f01156e893..f1180155d9 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -403,6 +403,17 @@ public: _query_options.enable_share_hash_table_for_broadcast_join; } + int repeat_max_num() const { +#ifndef BE_TEST + if (!_query_options.__isset.repeat_max_num) { + return 10000; + } + return _query_options.repeat_max_num; +#else + return 10; +#endif + } + private: // Use a custom block manager for the query for testing purposes. void set_block_mgr2(const std::shared_ptr& block_mgr) { diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index bfe3a2f1dc..79f1c03bc5 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -1000,13 +1000,15 @@ public: if (auto* col1 = check_and_get_column(*argument_ptr[0])) { if (auto* col2 = check_and_get_column(*argument_ptr[1])) { vector_vector(col1->get_chars(), col1->get_offsets(), col2->get_data(), - res->get_chars(), res->get_offsets(), null_map->get_data()); + res->get_chars(), res->get_offsets(), null_map->get_data(), + context->impl()->state()->repeat_max_num()); block.replace_by_position( result, ColumnNullable::create(std::move(res), std::move(null_map))); return Status::OK(); } else if (auto* col2_const = check_and_get_column(*argument_ptr[1])) { DCHECK(check_and_get_column(col2_const->get_data_column())); - int repeat = col2_const->get_int(0); + int repeat = std::min(col2_const->get_int(0), + context->impl()->state()->repeat_max_num()); if (repeat <= 0) { null_map->get_data().resize_fill(input_rows_count, 0); res->insert_many_defaults(input_rows_count); @@ -1026,7 +1028,8 @@ public: void vector_vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, const ColumnInt32::Container& repeats, ColumnString::Chars& res_data, - ColumnString::Offsets& res_offsets, ColumnUInt8::Container& null_map) { + ColumnString::Offsets& res_offsets, ColumnUInt8::Container& null_map, + const int repeat_max_num) { size_t input_row_size = offsets.size(); fmt::memory_buffer buffer; @@ -1036,7 +1039,7 @@ public: buffer.clear(); const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); size_t size = offsets[i] - offsets[i - 1]; - int repeat = repeats[i]; + int repeat = std::min(repeats[i], repeat_max_num); if (repeat <= 0) { StringOP::push_empty_string(i, res_data, res_offsets); diff --git a/be/test/vec/function/function_string_test.cpp b/be/test/vec/function/function_string_test.cpp index 0ce215a650..42d759df79 100644 --- a/be/test/vec/function/function_string_test.cpp +++ b/be/test/vec/function/function_string_test.cpp @@ -165,13 +165,14 @@ TEST(function_string_test, function_string_repeat_test) { std::string func_name = "repeat"; InputTypeSet input_types = {TypeIndex::String, TypeIndex::Int32}; - DataSet data_set = {{{std::string("a"), 3}, std::string("aaa")}, - {{std::string("hel lo"), 2}, std::string("hel lohel lo")}, - {{std::string("hello word"), -1}, std::string("")}, - {{std::string(""), 1}, std::string("")}, - {{std::string("134"), 1073741825}, Null()}, // bigger than 1GB - {{std::string("HELLO,!^%"), 2}, std::string("HELLO,!^%HELLO,!^%")}, - {{std::string("你"), 2}, std::string("你你")}}; + DataSet data_set = { + {{std::string("a"), 3}, std::string("aaa")}, + {{std::string("hel lo"), 2}, std::string("hel lohel lo")}, + {{std::string("hello word"), -1}, std::string("")}, + {{std::string(""), 1}, std::string("")}, + {{std::string("a"), 1073741825}, std::string("aaaaaaaaaa")}, // ut repeat max num 10 + {{std::string("HELLO,!^%"), 2}, std::string("HELLO,!^%HELLO,!^%")}, + {{std::string("你"), 2}, std::string("你你")}}; check_function(func_name, input_types, data_set); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 7e22343eb4..0e3141c7f9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -250,6 +250,8 @@ public class SessionVariable implements Serializable, Writable { public static final String ENABLE_SHARE_HASH_TABLE_FOR_BROADCAST_JOIN = "enable_share_hash_table_for_broadcast_join"; + public static final String REPEAT_MAX_NUM = "repeat_max_num"; + // session origin value public Map sessionOriginValue = new HashMap(); // check stmt is or not [select /*+ SET_VAR(...)*/ ...] @@ -643,6 +645,9 @@ public class SessionVariable implements Serializable, Writable { @VariableMgr.VarAttr(name = ENABLE_SHARE_HASH_TABLE_FOR_BROADCAST_JOIN) public boolean enableShareHashTableForBroadcastJoin = true; + @VariableMgr.VarAttr(name = REPEAT_MAX_NUM) + public int repeatMaxNum = 10000; + // If this fe is in fuzzy mode, then will use initFuzzyModeVariables to generate some variables, // not the default value set in the code. public void initFuzzyModeVariables() { @@ -1365,6 +1370,8 @@ public class SessionVariable implements Serializable, Writable { tResult.setPartitionedHashJoinRowsThreshold(partitionedHashJoinRowsThreshold); + tResult.setRepeatMaxNum(repeatMaxNum); + return tResult; } diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index cb343ae29b..32d6721bb3 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -185,6 +185,8 @@ struct TQueryOptions { 54: optional bool enable_share_hash_table_for_broadcast_join 55: optional bool enable_pipeline_engine = false + + 56: optional i32 repeat_max_num = 0 } diff --git a/regression-test/suites/datatype_p0/string/test_string_basic.groovy b/regression-test/suites/datatype_p0/string/test_string_basic.groovy index d5884c2905..0ebde1debc 100644 --- a/regression-test/suites/datatype_p0/string/test_string_basic.groovy +++ b/regression-test/suites/datatype_p0/string/test_string_basic.groovy @@ -37,6 +37,8 @@ suite("test_string_basic") { CREATE TABLE IF NOT EXISTS ${tbName} (k1 VARCHAR(10) NULL, v1 STRING NULL) UNIQUE KEY(k1) DISTRIBUTED BY HASH(k1) BUCKETS 5 properties("replication_num" = "1") """ + // default repeat maximum is 10000 + sql """set repeat_max_num=131073""" sql """ INSERT INTO ${tbName} VALUES ("", ""),