From b122f9b80c1b8def1cfb4d110cacfcb6f899cc0f Mon Sep 17 00:00:00 2001
From: TengJianPing <18241664+jacktengg@users.noreply.github.com>
Date: Fri, 4 Aug 2023 19:13:35 +0800
Subject: [PATCH] [fix](concat) ColumnString::chars is resized with wrong size (#22610)

FunctionStringConcat::execute_impl resized the result column's
ColumnString::chars with a size that included one string null terminator
per row. As a result, ColumnString::chars.size() did not match
ColumnString::offsets.back(), which causes problems for some string
functions, e.g. like and regexp.

Remove the extra per-row byte from the reserved size, add
ColumnString::sanity_check() to verify the chars/offsets invariants, and
add a unit test that runs it on the result of concat.
---
 be/src/vec/columns/column_string.cpp    | 17 +++++++
 be/src/vec/columns/column_string.h      |  2 +
 be/src/vec/functions/function_string.h  |  2 -
 be/test/vec/core/column_string_test.cpp | 59 +++++++++++++++++++++++++
 4 files changed, 78 insertions(+), 2 deletions(-)
 create mode 100644 be/test/vec/core/column_string_test.cpp

diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp
index 4ea25b7688..0beb662da0 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -35,6 +35,23 @@
 
 namespace doris::vectorized {
 
+void ColumnString::sanity_check() const {
+    auto count = offsets.size();
+    if (chars.size() != offsets[count - 1]) {
+        LOG(FATAL) << "row count: " << count << ", chars.size(): " << chars.size() << ", offset["
+                   << count - 1 << "]: " << offsets[count - 1];
+    }
+    if (offsets[-1] != 0) {
+        LOG(FATAL) << "wrong offsets[-1]: " << offsets[-1];
+    }
+    for (size_t i = 0; i < count; ++i) {
+        if (offsets[i] < offsets[i - 1]) {
+            LOG(FATAL) << "row count: " << count << ", offsets[" << i << "]: " << offsets[i]
+                       << ", offsets[" << i - 1 << "]: " << offsets[i - 1];
+        }
+    }
+}
+
 MutableColumnPtr ColumnString::clone_resized(size_t to_size) const {
     auto res = ColumnString::create();
     if (to_size == 0) {
diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h
index 14c426c762..96d4a4f834 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -106,6 +106,8 @@ private:
               chars(src.chars.begin(), src.chars.end()) {}
 
 public:
+    void sanity_check() const;
+
     const char* get_family_name() const override { return "String"; }
 
     size_t size() const override { return offsets.size(); }
diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h
index 83f98d726a..32e373ffa0 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -776,8 +776,6 @@ public:
         if ((UNLIKELY(UINT_MAX - input_rows_count < res_reserve_size))) {
             return Status::BufferAllocFailed("concat output is too large to allocate");
         }
-        // for each terminal zero
-        res_reserve_size += input_rows_count;
 
         res_data.resize(res_reserve_size);
 
diff --git a/be/test/vec/core/column_string_test.cpp b/be/test/vec/core/column_string_test.cpp
new file mode 100644
index 0000000000..81f41bd11c
--- /dev/null
+++ b/be/test/vec/core/column_string_test.cpp
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vec/columns/column_string.h"
+
+#include <gtest/gtest.h>
+
+#include "vec/core/block.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/functions/function_string.h"
+
+namespace doris::vectorized {
+TEST(ColumnStringTest, TestConcat) {
+    Block block;
+    vectorized::DataTypePtr str_type = std::make_shared<vectorized::DataTypeString>();
+
+    auto str_col0 = ColumnString::create();
+    std::vector<std::string> vals0 = {"aaa", "bb", "cccc"};
+    for (auto& v : vals0) {
+        str_col0->insert_data(v.data(), v.size());
+    }
+    block.insert({std::move(str_col0), str_type, "test_str_col0"});
+
+    auto str_col1 = ColumnString::create();
+    std::vector<std::string> vals1 = {"3", "2", "4"};
+    for (auto& v : vals1) {
+        str_col1->insert_data(v.data(), v.size());
+    }
+    block.insert({std::move(str_col1), str_type, "test_str_col1"});
+
+    auto str_col_res = ColumnString::create();
+    block.insert({std::move(str_col_res), str_type, "test_str_res"});
+
+    ColumnNumbers arguments = {0, 1};
+
+    FunctionStringConcat func_concat;
+    auto status = func_concat.execute_impl(nullptr, block, arguments, 2, 3);
+    EXPECT_TRUE(status.ok());
+
+    auto actual_res_col = block.get_by_position(2).column;
+    EXPECT_EQ(actual_res_col->size(), 3);
+    auto actual_res_col_str = assert_cast<const ColumnString*>(actual_res_col.get());
+    actual_res_col_str->sanity_check();
+}
+} // namespace doris::vectorized
\ No newline at end of file