From 47edc5a06ef09744f8019fdc39aa6b5c44d0206d Mon Sep 17 00:00:00 2001 From: Jerry Hu Date: Thu, 11 May 2023 01:13:13 +0800 Subject: [PATCH] [fix](functions) Support nullable column for multi_string functions (#19498) --- .../functions_multi_string_position.cpp | 51 ++++++++++++++- .../functions_multi_string_search.cpp | 65 +++++++++++++++---- .../test_multi_string_position.out | 6 ++ .../test_multi_string_search.out | 44 ++++++++++++- .../test_multi_string_position.groovy | 11 ++-- .../test_multi_string_search.groovy | 33 ++++++++++ 6 files changed, 192 insertions(+), 18 deletions(-) diff --git a/be/src/vec/functions/functions_multi_string_position.cpp b/be/src/vec/functions/functions_multi_string_position.cpp index 062f34f995..3db079efad 100644 --- a/be/src/vec/functions/functions_multi_string_position.cpp +++ b/be/src/vec/functions/functions_multi_string_position.cpp @@ -73,14 +73,34 @@ public: bool use_default_implementation_for_constants() const override { return true; } + bool use_default_implementation_for_nulls() const override { return false; } + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { return std::make_shared(make_nullable(std::make_shared())); } Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) override { - ColumnPtr haystack_ptr = block.get_by_position(arguments[0]).column; - ColumnPtr needles_ptr = block.get_by_position(arguments[1]).column; + auto haystack_column = block.get_by_position(arguments[0]).column; + auto haystack_ptr = haystack_column; + + auto needles_column = block.get_by_position(arguments[1]).column; + auto needles_ptr = needles_column; + + bool haystack_nullable = false; + bool needles_nullable = false; + + if (haystack_column->is_nullable()) { + haystack_ptr = check_and_get_column(haystack_column.get()) + ->get_nested_column_ptr(); + haystack_nullable = true; + } + + if (needles_column->is_nullable()) { + needles_ptr = check_and_get_column(needles_column.get()) + ->get_nested_column_ptr(); + needles_nullable = true; + } const ColumnString* col_haystack_vector = check_and_get_column(&*haystack_ptr); @@ -122,6 +142,30 @@ public: return status; } + if (haystack_nullable) { + auto column_nullable = check_and_get_column(haystack_column.get()); + auto& null_map = column_nullable->get_null_map_data(); + for (size_t i = 0; i != input_rows_count; ++i) { + if (null_map[i] == 1) { + for (size_t offset = offsets_res[i - 1]; offset != offsets_res[i]; ++offset) { + vec_res[offset] = 0; + } + } + } + } + + if (needles_nullable) { + auto column_nullable = check_and_get_column(needles_column.get()); + auto& null_map = column_nullable->get_null_map_data(); + for (size_t i = 0; i != input_rows_count; ++i) { + if (null_map[i] == 1) { + for (size_t offset = offsets_res[i - 1]; offset != offsets_res[i]; ++offset) { + vec_res[offset] = 0; + } + } + } + } + auto nullable_col = ColumnNullable::create(std::move(col_res), ColumnUInt8::create(col_res->size(), 0)); block.get_by_position(result).column = @@ -151,6 +195,9 @@ public: std::vector searchers; searchers.reserve(needles_size); for (const auto& needle : needles_arr) { + if (needle.get_type() != Field::Types::String) { + return Status::InvalidArgument("invalid type of needle {}", needle.get_type_name()); + } searchers.emplace_back(needle.get().data, needle.get().size); } diff --git a/be/src/vec/functions/functions_multi_string_search.cpp b/be/src/vec/functions/functions_multi_string_search.cpp index 5c19fe6c8b..0791a96125 100644 --- a/be/src/vec/functions/functions_multi_string_search.cpp +++ b/be/src/vec/functions/functions_multi_string_search.cpp @@ -71,14 +71,34 @@ public: bool use_default_implementation_for_constants() const override { return true; } + bool use_default_implementation_for_nulls() const override { return false; } + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { return Impl::get_return_type(); } Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) override { - ColumnPtr haystack_ptr = block.get_by_position(arguments[0]).column; - ColumnPtr needles_ptr = block.get_by_position(arguments[1]).column; + auto haystack_column = block.get_by_position(arguments[0]).column; + auto haystack_ptr = haystack_column; + + auto needles_column = block.get_by_position(arguments[1]).column; + auto needles_ptr = needles_column; + + bool haystack_nullable = false; + bool needles_nullable = false; + + if (haystack_column->is_nullable()) { + haystack_ptr = check_and_get_column(haystack_column.get()) + ->get_nested_column_ptr(); + haystack_nullable = true; + } + + if (needles_column->is_nullable()) { + needles_ptr = check_and_get_column(needles_column.get()) + ->get_nested_column_ptr(); + needles_nullable = true; + } const ColumnString* col_haystack_vector = check_and_get_column(&*haystack_ptr); @@ -104,24 +124,44 @@ public: auto& offsets_res = col_offsets->get_data(); Status status; - if (col_needles_const) + if (col_needles_const) { status = Impl::vector_constant( col_haystack_vector->get_chars(), col_haystack_vector->get_offsets(), col_needles_const->get_value(), vec_res, offsets_res, allow_hyperscan_, max_hyperscan_regexp_length_, max_hyperscan_regexp_total_length_); - else + } else { status = Impl::vector_vector( col_haystack_vector->get_chars(), col_haystack_vector->get_offsets(), col_needles_vector->get_data(), col_needles_vector->get_offsets(), vec_res, offsets_res, allow_hyperscan_, max_hyperscan_regexp_length_, max_hyperscan_regexp_total_length_); - if (!status.ok()) return status; + } - if constexpr (Impl::is_column_array) - block.get_by_position(result).column = - ColumnArray::create(std::move(col_res), std::move(col_offsets)); - else - block.replace_by_position(result, std::move(col_res)); + if (!status.ok()) { + return status; + } + + if (haystack_nullable) { + auto column_nullable = check_and_get_column(haystack_column.get()); + auto& null_map = column_nullable->get_null_map_data(); + for (size_t i = 0; i != input_rows_count; ++i) { + if (null_map[i] == 1) { + vec_res[i] = 0; + } + } + } + + if (needles_nullable) { + auto column_nullable = check_and_get_column(needles_column.get()); + auto& null_map = column_nullable->get_null_map_data(); + for (size_t i = 0; i != input_rows_count; ++i) { + if (null_map[i] == 1) { + vec_res[i] = 0; + } + } + } + + block.replace_by_position(result, std::move(col_res)); return status; } @@ -145,7 +185,6 @@ struct FunctionMultiMatchAnyImpl { static constexpr bool FindAnyIndex = (Find == MultiMatchTraits::Find::AnyIndex); static constexpr auto name = "multi_match_any"; - static constexpr bool is_column_array = false; static auto get_return_type() { return std::make_shared>(); } @@ -231,6 +270,10 @@ struct FunctionMultiMatchAnyImpl { ->get_nested_column(); const ColumnString* needles_data_string = check_and_get_column(nested_column); + if (!needles_data_string) { + return Status::InvalidArgument("needles should be string"); + } + std::vector needles; for (size_t i = 0; i < haystack_offsets.size(); ++i) { needles.reserve(needles_offsets[i] - prev_needles_offset); diff --git a/regression-test/data/query_p0/sql_functions/search_functions/test_multi_string_position.out b/regression-test/data/query_p0/sql_functions/search_functions/test_multi_string_position.out index 017fa620c4..f408d2d543 100644 --- a/regression-test/data/query_p0/sql_functions/search_functions/test_multi_string_position.out +++ b/regression-test/data/query_p0/sql_functions/search_functions/test_multi_string_position.out @@ -5,6 +5,9 @@ [1, 13, 8, 0, 0] [1, 13, 8, 0, 0] [0, 6, 0, 0, 0] +[0, 0, 0, 0, 0] +[0, 0, 0, 0, 0] +[0, 0, 0, 0, 0] -- !table_select2 -- [0, 0] @@ -12,6 +15,9 @@ [0, 8] [1, 8, 0, 13] [1, 1, 4, 0] +[] +[] +[0, 0, 0] -- !select1 -- [4, 1, 1, 2, 6, 1, 1, 0, 4, 1, 14, 0, 10, 0, 16, 6] diff --git a/regression-test/data/query_p0/sql_functions/search_functions/test_multi_string_search.out b/regression-test/data/query_p0/sql_functions/search_functions/test_multi_string_search.out index 2eef34a650..efaec678d0 100644 --- a/regression-test/data/query_p0/sql_functions/search_functions/test_multi_string_search.out +++ b/regression-test/data/query_p0/sql_functions/search_functions/test_multi_string_search.out @@ -1,45 +1,87 @@ -- This file is automatically generated. You should know what you did if you want to edit this +-- !select1 -- +1 +1 +1 +1 +1 +0 +0 +0 + +-- !select2 -- +0 +1 +1 +1 +1 +0 +0 +0 + -- !select -- 0 + -- !select -- 0 + -- !select -- 0 + -- !select -- 1 + -- !select -- 0 + -- !select -- 0 + -- !select -- 0 + -- !select -- 1 + -- !select -- 1 + -- !select -- 1 + -- !select -- 0 + -- !select -- 1 + -- !select -- 0 + -- !select -- 1 + -- !select -- 1 + -- !select -- 1 + -- !select -- 0 + -- !select -- 0 + -- !select -- 0 + -- !select -- 1 + -- !select -- 1 + -- !select -- -1 \ No newline at end of file +1 + diff --git a/regression-test/suites/query_p0/sql_functions/search_functions/test_multi_string_position.groovy b/regression-test/suites/query_p0/sql_functions/search_functions/test_multi_string_position.groovy index fa3ec92b66..f6a8aa110e 100644 --- a/regression-test/suites/query_p0/sql_functions/search_functions/test_multi_string_position.groovy +++ b/regression-test/suites/query_p0/sql_functions/search_functions/test_multi_string_position.groovy @@ -16,14 +16,14 @@ // under the License. suite("test_multi_string_position") { - def table_name = "strings" + def table_name = "test_multi_string_position_strings" sql """ DROP TABLE IF EXISTS ${table_name} """ sql """ CREATE TABLE IF NOT EXISTS ${table_name} ( `col1` INT NOT NULL, - `content` TEXT NOT NULL, - `mode` ARRAY NOT NULL + `content` TEXT NULL, + `mode` ARRAY NULL ) ENGINE=OLAP DUPLICATE KEY(`col1`) COMMENT 'OLAP' @@ -40,7 +40,10 @@ suite("test_multi_string_position") { (2, 'Hello, World!', ['hello', 'world', 'Hello', '!'] ), (3, 'hello, world!', ['Hello', 'world'] ), (4, 'hello, world!', ['hello', 'world', 'Hello', '!'] ), - (5, 'HHHHW!', ['H', 'HHHH', 'HW', 'WH'] ); + (5, 'HHHHW!', ['H', 'HHHH', 'HW', 'WH'] ), + (6, 'abc', null), + (7, null, null), + (8, null, ['a', 'b', 'c']); """ qt_table_select1 "select multi_search_all_positions(content, ['hello', '!', 'world', 'Hello', 'World']) from ${table_name} order by col1" diff --git a/regression-test/suites/query_p0/sql_functions/search_functions/test_multi_string_search.groovy b/regression-test/suites/query_p0/sql_functions/search_functions/test_multi_string_search.groovy index e258ab7fc8..5a3229ce36 100644 --- a/regression-test/suites/query_p0/sql_functions/search_functions/test_multi_string_search.groovy +++ b/regression-test/suites/query_p0/sql_functions/search_functions/test_multi_string_search.groovy @@ -16,6 +16,39 @@ // under the License. suite("test_multi_string_search") { + def table_name = "test_multi_string_search_strings" + + sql """ DROP TABLE IF EXISTS ${table_name} """ + sql """ CREATE TABLE IF NOT EXISTS ${table_name} + ( + `col1` INT NOT NULL, + `content` TEXT NULL, + `mode` ARRAY NULL + ) ENGINE=OLAP + DUPLICATE KEY(`col1`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`col1`) BUCKETS 3 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "in_memory" = "false", + "storage_format" = "V2" + ); + """ + + sql """ INSERT INTO ${table_name} (col1, content, mode) VALUES + (1, 'Hello, World!', ['hello', 'world'] ), + (2, 'Hello, World!', ['hello', 'world', 'Hello', '!'] ), + (3, 'hello, world!', ['Hello', 'world'] ), + (4, 'hello, world!', ['hello', 'world', 'Hello', '!'] ), + (5, 'HHHHW!', ['H', 'HHHH', 'HW', 'WH'] ), + (6, 'abc', null), + (7, null, null), + (8, null, ['a', 'b', 'c']); + """ + + qt_select1 "select multi_match_any(content, ['hello', '!', 'world', 'Hello', 'World']) from ${table_name} order by col1" + qt_select2 "select multi_match_any(content, mode) from ${table_name} order by col1" + qt_select "select multi_match_any('mpnsguhwsitzvuleiwebwjfitmsg', ['wbirxqoabpblrnvvmjizj', 'cfcxhuvrexyzyjsh', 'oldhtubemyuqlqbwvwwkwin', 'bumoozxdkjglzu', 'intxlfohlxmajjomw', 'dxkeghohv', 'arsvmwwkjeopnlwnan', 'ouugllgowpqtaxslcopkytbfhifaxbgt', 'hkedmjlbcrzvryaopjqdjjc', 'tbqkljywstuahzh', 'o', 'wowoclosyfcuwotmvjygzuzhrery', 'vpefjiffkhlggntcu', 'ytdixvasrorhripzfhjdmlhqksmctyycwp'])" qt_select "select multi_match_any('qjjzqexjpgkglgxpzrbqbnskq', ['vaiatcjacmlffdzsejpdareqzy', 'xspcfzdufkmecud', 'bcvtbuqtctq', 'nkcopwbfytgemkqcfnnno', 'dylxnzuyhq', 'tno', 'scukuhufly', 'cdyquzuqlptv', 'ohluyfeksyxepezdhqmtfmgkvzsyph', 'ualzwtahvqvtijwp', 'jg', 'gwbawqlngzcknzgtmlj', 'qimvjcgbkkp', 'eaedbcgyrdvv', 'qcwrncjoewwedyyewcdkh', 'uqcvhngoqngmitjfxpznqomertqnqcveoqk', 'ydrgjiankgygpm', 'axepgap'])" qt_select "select multi_match_any('fdkmtqmxnegwvnjhghjq', ['vynkybvdmhgeezybbdqfrukibisj', 'knazzamgjjpavwhvdkwigykh', 'peumnifrmdhhmrqqnemw', 'lmsnyvqoisinlaqobxojlwfbi', 'oqwfzs', 'dymudxxeodwjpgbibnkvr', 'vomtfsnizkplgzktqyoiw', 'yoyfuhlpgrzds', 'cefao', 'gi', 'srpgxfjwl', 'etsjusdeiwbfe', 'ikvtzdopxo', 'ljfkavrau', 'soqdhxtenfrkmeic', 'ktprjwfcelzbup', 'pcvuoddqwsaurcqdtjfnczekwni', 'agkqkqxkfbkfgyqliahsljim'])"