From 2f98a6216e43aa708bc41d5b36c01bb2b1c11349 Mon Sep 17 00:00:00 2001 From: Pxl Date: Tue, 7 Jan 2025 19:59:35 +0800 Subject: [PATCH] [Bug](join) fix columnstr64's offset overflow on serialize_value_into_arena #46461 (#46462) pick from #46461 --- be/src/vec/columns/column_string.cpp | 14 ++--- be/src/vec/columns/column_string.h | 7 ++- .../str64_serialize/str64_serialize.out | 7 +++ .../str64_serialize/str64_serialize.groovy | 57 +++++++++++++++++++ 4 files changed, 76 insertions(+), 9 deletions(-) create mode 100644 regression-test/data/query_p1/str64_serialize/str64_serialize.out create mode 100644 regression-test/suites/query_p1/str64_serialize/str64_serialize.groovy diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index bf3e2b5c75..c085fdc549 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -356,8 +356,8 @@ ColumnPtr ColumnStr::permute(const IColumn::Permutation& perm, size_t limit) template StringRef ColumnStr::serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const { - uint32_t string_size(size_at(n)); - uint32_t offset(offset_at(n)); + auto string_size(size_at(n)); + auto offset(offset_at(n)); StringRef res; res.size = sizeof(string_size) + string_size; @@ -389,7 +389,7 @@ size_t ColumnStr::get_max_row_byte_size() const { size_t max_size = 0; size_t num_rows = offsets.size(); for (size_t i = 0; i < num_rows; ++i) { - max_size = std::max(max_size, size_at(i)); + max_size = std::max(max_size, size_t(size_at(i))); } return max_size + sizeof(uint32_t); @@ -399,8 +399,8 @@ template void ColumnStr::serialize_vec(std::vector& keys, size_t num_rows, size_t max_row_byte_size) const { for (size_t i = 0; i < num_rows; ++i) { - uint32_t offset(offset_at(i)); - uint32_t string_size(size_at(i)); + auto offset(offset_at(i)); + auto string_size(size_at(i)); auto* ptr = const_cast(keys[i].data + keys[i].size); memcpy_fixed(ptr, (char*)&string_size); @@ -414,8 +414,8 @@ void ColumnStr::serialize_vec_with_null_map(std::vector& keys, siz const uint8_t* null_map) const { for (size_t i = 0; i < num_rows; ++i) { if (null_map[i] == 0) { - uint32_t offset(offset_at(i)); - uint32_t string_size(size_at(i)); + auto offset(offset_at(i)); + auto string_size(size_at(i)); auto* ptr = const_cast(keys[i].data + keys[i].size); memcpy_fixed(ptr, (char*)&string_size); diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index fe1fd086ad..5fe8d29881 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -87,8 +87,11 @@ private: size_t ALWAYS_INLINE offset_at(ssize_t i) const { return offsets[i - 1]; } - /// Size of i-th element, including terminating zero. - size_t ALWAYS_INLINE size_at(ssize_t i) const { return offsets[i] - offsets[i - 1]; } + // Size of i-th element, including terminating zero. + // assume that the length of a single element is less than 32-bit + uint32_t ALWAYS_INLINE size_at(ssize_t i) const { + return uint32_t(offsets[i] - offsets[i - 1]); + } template struct less; diff --git a/regression-test/data/query_p1/str64_serialize/str64_serialize.out b/regression-test/data/query_p1/str64_serialize/str64_serialize.out new file mode 100644 index 0000000000..99c168b99d --- /dev/null +++ b/regression-test/data/query_p1/str64_serialize/str64_serialize.out @@ -0,0 +1,7 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !test -- +50000000 + +-- !test -- +50000000 + diff --git a/regression-test/suites/query_p1/str64_serialize/str64_serialize.groovy b/regression-test/suites/query_p1/str64_serialize/str64_serialize.groovy new file mode 100644 index 0000000000..b0e3ffa99e --- /dev/null +++ b/regression-test/suites/query_p1/str64_serialize/str64_serialize.groovy @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("str64_serialize") { + + sql """ DROP TABLE IF EXISTS d_table; """ + sql """ DROP TABLE IF EXISTS d_table2; """ + + + sql """ + create table d_table ( + k1 int null, + k2 int not null, + k3 bigint null, + k4 varchar(100) null + ) + duplicate key (k1,k2,k3) + distributed BY hash(k1) buckets 3 + properties("replication_num" = "1"); + """ + sql """ + create table d_table2 ( + k1 int null, + k2 int not null, + k3 bigint null, + k4 varchar(100) null + ) + duplicate key (k1,k2,k3) + distributed BY hash(k1) buckets 3 + properties("replication_num" = "1"); + """ + + sql """insert into d_table select 1,1,1,'1234567890abcdefghigalsdhaluihdicandejionxaoxwdeuhwenudzmwoedxneiowdxiowedjxneiowdjixoneiiexdnuiexef' from (select 1 k1) as t lateral view explode_numbers(50000000) tmp1 as e1; +""" + + sql """insert into d_table2 select 1,1,1,'1234567890abcdefghigalsdhaluihdicandejionxaoxwdeuhwenudzmwoedxneiowdxiowedjxneiowdjixoneiiexdnuiexef'; +""" + sql "set parallel_pipeline_task_num=1;" + + qt_test "select /*+ LEADING(a,b) */ count(*) from d_table as a, d_table2 as b where a.k4=b.k4 and a.k1=b.k1;" + qt_test "select /*+ LEADING(b,a) */ count(*) from d_table as a, d_table2 as b where a.k4=b.k4 and a.k1=b.k1;" +} +