From a2edc6fd8b99d3db6598d2e2c757700e83fc52e8 Mon Sep 17 00:00:00 2001 From: camby <104178625@qq.com> Date: Wed, 20 Apr 2022 14:50:34 +0800 Subject: [PATCH] [feature-wip](array-type) replicate impl for ColumnArray to support join with array column (#9070) SQL with a JOIN and ARRAY columns will call the function ColumnArray::replicate. In this PR, we implement replicate for the ARRAY type to support SQL like this: `SELECT count(lo_array),count(d_array),SUM(lo_extendedprice*lo_discount) AS REVENUE FROM lineorder, date WHERE lo_orderdate = d_datekey AND d_year = 1993 AND lo_discount BETWEEN 1 AND 3 AND lo_quantity < 25;` --- be/src/olap/row_block2.cpp | 2 +- be/src/olap/rowset/segment_v2/column_reader.h | 2 +- be/src/vec/columns/column_array.cpp | 41 +++++++-- be/src/vec/columns/column_array.h | 1 + be/test/vec/core/column_array_test.cpp | 85 +++++++++++++++---- 5 files changed, 104 insertions(+), 27 deletions(-) diff --git a/be/src/olap/row_block2.cpp b/be/src/olap/row_block2.cpp index 8beca02192..83bb249566 100644 --- a/be/src/olap/row_block2.cpp +++ b/be/src/olap/row_block2.cpp @@ -289,7 +289,7 @@ Status RowBlockV2::_copy_data_to_column(int cid, auto& offsets_col = column_array->get_offsets(); offsets_col.reserve(_selected_size); - uint32_t offset = 0; + uint32_t offset = offsets_col.back(); for (uint16_t j = 0; j < _selected_size; ++j) { uint16_t row_idx = _selection_vector[j]; auto cv = reinterpret_cast(column_block(cid).cell_ptr(row_idx)); diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index 3103d9c2b6..75dd2d1788 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -360,7 +360,7 @@ public: : size_to_read; ColumnBlockView ordinal_view(&ordinal_block); RETURN_IF_ERROR(_length_iterator->next_batch(&this_read, &ordinal_view, &has_null)); - auto* ordinals = reinterpret_cast(_length_batch->data()); + auto* ordinals = 
reinterpret_cast(_length_batch->data()); for (int i = 0; i < this_read; ++i) { item_ordinal += ordinals[i]; } diff --git a/be/src/vec/columns/column_array.cpp b/be/src/vec/columns/column_array.cpp index cc4f380f7e..c18d1a55b3 100644 --- a/be/src/vec/columns/column_array.cpp +++ b/be/src/vec/columns/column_array.cpp @@ -20,8 +20,6 @@ #include "vec/columns/column_array.h" -#include // memcpy - #include "vec/columns/collator.h" #include "vec/columns/column_const.h" #include "vec/columns/column_nullable.h" @@ -493,14 +491,9 @@ void ColumnArray::insert_indices_from(const IColumn& src, const int* indices_beg ColumnPtr ColumnArray::replicate(const Offsets& replicate_offsets) const { if (replicate_offsets.empty()) return clone_empty(); + // keep ColumnUInt8 for ColumnNullable::null_map if (typeid_cast(data.get())) return replicate_number(replicate_offsets); - if (typeid_cast(data.get())) - return replicate_number(replicate_offsets); - if (typeid_cast(data.get())) - return replicate_number(replicate_offsets); - if (typeid_cast(data.get())) - return replicate_number(replicate_offsets); if (typeid_cast(data.get())) return replicate_number(replicate_offsets); if (typeid_cast(data.get())) @@ -517,10 +510,40 @@ ColumnPtr ColumnArray::replicate(const Offsets& replicate_offsets) const { if (typeid_cast(data.get())) return replicate_const(replicate_offsets); if (typeid_cast(data.get())) return replicate_nullable(replicate_offsets); - //if (typeid_cast(data.get())) return replicateTuple(replicate_offsets); return replicate_generic(replicate_offsets); } +void ColumnArray::replicate(const uint32_t* counts, size_t target_size, IColumn& column) const { + size_t col_size = size(); + if (col_size == 0) { + return; + } + + Offsets replicate_offsets(col_size); + size_t cur_offset = 0; + for (size_t i = 0; i < col_size; ++i) { + cur_offset += counts[i]; + replicate_offsets[i] = cur_offset; + } + if (cur_offset != target_size) { + LOG(WARNING) << "ColumnArray replicate input target_size:" << 
target_size + << " not equal SUM(counts):" << cur_offset; + return; + } + + auto rep_res = replicate(replicate_offsets); + if (!rep_res) { + LOG(WARNING) << "ColumnArray replicate failed, replicate_offsets count=" + << replicate_offsets.size() << ", max=" << replicate_offsets.back(); + return; + } + auto& rep_res_arr = typeid_cast(*rep_res); + + ColumnArray& res_arr = typeid_cast(column); + res_arr.data = rep_res_arr.get_data_ptr(); + res_arr.offsets = rep_res_arr.get_offsets_ptr(); +} + template ColumnPtr ColumnArray::replicate_number(const Offsets& replicate_offsets) const { size_t col_size = size(); diff --git a/be/src/vec/columns/column_array.h b/be/src/vec/columns/column_array.h index 74a0805336..182f7b185d 100644 --- a/be/src/vec/columns/column_array.h +++ b/be/src/vec/columns/column_array.h @@ -105,6 +105,7 @@ public: size_t allocated_bytes() const override; void protect() override; ColumnPtr replicate(const Offsets& replicate_offsets) const override; + void replicate(const uint32_t* counts, size_t target_size, IColumn& column) const override; ColumnPtr convert_to_full_column_if_const() const override; void get_extremes(Field& min, Field& max) const override { LOG(FATAL) << "get_extremes not implemented"; diff --git a/be/test/vec/core/column_array_test.cpp b/be/test/vec/core/column_array_test.cpp index b497b14451..60725501ab 100644 --- a/be/test/vec/core/column_array_test.cpp +++ b/be/test/vec/core/column_array_test.cpp @@ -28,16 +28,16 @@ namespace doris::vectorized { -void check_array_offsets(ColumnPtr arr, const std::vector& offs) { - auto arr_col = check_and_get_column(*arr); +void check_array_offsets(const IColumn& arr, const std::vector& offs) { + auto arr_col = check_and_get_column(arr); ASSERT_EQ(arr_col->size(), offs.size()); for (size_t i = 0; i < arr_col->size(); ++i) { ASSERT_EQ(arr_col->get_offsets()[i], offs[i]); } } template -void check_array_data(ColumnPtr arr, const std::vector& data) { - auto arr_col = check_and_get_column(*arr); +void 
check_array_data(const IColumn& arr, const std::vector& data) { + auto arr_col = check_and_get_column(arr); auto data_col = arr_col->get_data_ptr(); ASSERT_EQ(data_col->size(), data.size()); for (size_t i = 0; i < data_col->size(); ++i) { @@ -46,8 +46,8 @@ void check_array_data(ColumnPtr arr, const std::vector& data) { } } template <> -void check_array_data(ColumnPtr arr, const std::vector& data) { - auto arr_col = check_and_get_column(*arr); +void check_array_data(const IColumn& arr, const std::vector& data) { + auto arr_col = check_and_get_column(arr); auto data_col = arr_col->get_data_ptr(); ASSERT_EQ(data_col->size(), data.size()); for (size_t i = 0; i < data_col->size(); ++i) { @@ -123,13 +123,13 @@ TEST(ColumnArrayTest, IntArrayPermuteTest) { IColumn::Permutation perm = {3, 2, 1, 0}; // return array column: [[5,6],[4]]; auto res1 = array_column.permute(perm, 2); - check_array_offsets(res1, {2, 3}); - check_array_data(res1, {5, 6, 4}); + check_array_offsets(*res1, {2, 3}); + check_array_data(*res1, {5, 6, 4}); // return array column: [[5,6],[4],[],[1,2,3]] auto res2 = array_column.permute(perm, 0); - check_array_offsets(res2, {2, 3, 3, 6}); - check_array_data(res2, {5, 6, 4, 1, 2, 3}); + check_array_offsets(*res2, {2, 3, 3, 6}); + check_array_data(*res2, {5, 6, 4, 1, 2, 3}); } TEST(ColumnArrayTest, StringArrayPermuteTest) { @@ -149,8 +149,13 @@ TEST(ColumnArrayTest, StringArrayPermuteTest) { IColumn::Permutation perm = {3, 2, 1, 0}; // return array column: [[""],[]]; auto res1 = array_column.permute(perm, 2); - check_array_offsets(res1, {1, 1}); - check_array_data(res1, {""}); + check_array_offsets(*res1, {1, 1}); + check_array_data(*res1, {""}); + + // return array column: [[""],[],["ef"],["abc","d"]]; + auto res2 = array_column.permute(perm, 0); + check_array_offsets(*res2, {1, 1, 2, 4}); + check_array_data(*res2, {"", "ef", "abc", "d"}); } TEST(ColumnArrayTest, EmptyArrayPermuteTest) { @@ -170,13 +175,61 @@ TEST(ColumnArrayTest, EmptyArrayPermuteTest) { 
IColumn::Permutation perm = {3, 2, 1, 0}; // return array column: [[],[]]; auto res1 = array_column.permute(perm, 2); - check_array_offsets(res1, {0, 0}); - check_array_data(res1, {}); + check_array_offsets(*res1, {0, 0}); + check_array_data(*res1, {}); // return array column: [[],[],[],[]] auto res2 = array_column.permute(perm, 0); - check_array_offsets(res2, {0, 0, 0, 0}); - check_array_data(res2, {}); + check_array_offsets(*res2, {0, 0, 0, 0}); + check_array_data(*res2, {}); +} + +TEST(ColumnArrayTest, IntArrayReplicateTest) { + auto off_column = ColumnVector::create(); + auto data_column = ColumnVector::create(); + // init column array with [[1,2,3],[],[4],[5,6]] + std::vector offs = {0, 3, 3, 4, 6}; + std::vector vals = {1, 2, 3, 4, 5, 6}; + for (size_t i = 1; i < offs.size(); ++i) { + off_column->insert_data((const char*)(&offs[i]), 0); + } + for (auto& v : vals) { + data_column->insert_data((const char*)(&v), 0); + } + ColumnArray array_column(std::move(data_column), std::move(off_column)); + + uint32_t counts[] = {2, 1, 0, 3}; // size should be equal array_column.size() + size_t target_size = 6; // sum(counts) + + // return array column: [[1,2,3],[1,2,3],[],[5,6],[5,6],[5,6]]; + auto res1 = array_column.clone_empty(); + array_column.replicate(counts, target_size, *res1); + check_array_offsets(*res1, {3, 6, 6, 8, 10, 12}); + check_array_data(*res1, {1, 2, 3, 1, 2, 3, 5, 6, 5, 6, 5, 6}); +} + +TEST(ColumnArrayTest, StringArrayReplicateTest) { + auto off_column = ColumnVector::create(); + auto data_column = ColumnString::create(); + // init column array with [["abc","d"],["ef"],[], [""]]; + std::vector offs = {0, 2, 3, 3, 4}; + std::vector vals = {"abc", "d", "ef", ""}; + for (size_t i = 1; i < offs.size(); ++i) { + off_column->insert_data((const char*)(&offs[i]), 0); + } + for (auto& v : vals) { + data_column->insert_data(v.data(), v.size()); + } + ColumnArray array_column(std::move(data_column), std::move(off_column)); + + uint32_t counts[] = {2, 1, 0, 3}; 
// size should be equal array_column.size() + size_t target_size = 6; // sum(counts) + + // return array column: [["abc","d"],["abc","d"],["ef"],[""],[""],[""]]; + auto res1 = array_column.clone_empty(); + array_column.replicate(counts, target_size, *res1); + check_array_offsets(*res1, {2, 4, 5, 6, 7, 8}); + check_array_data(*res1, {"abc", "d", "abc", "d", "ef", "", "", ""}); } } // namespace doris::vectorized