From eb079950cb9a82b3c95e9eed83df1b114e6855d0 Mon Sep 17 00:00:00 2001 From: carlvinhust2012 Date: Tue, 12 Jul 2022 17:02:42 +0800 Subject: [PATCH] [feature-wip] (array-type) add the array_distinct function (#10388) * add the array_distinct function * add the support for decimal and update variable names * add docs and regression test for array_distinct function Co-authored-by: hucheng01 --- be/src/vec/CMakeLists.txt | 1 + .../array/function_array_distinct.cpp | 28 ++ .../functions/array/function_array_distinct.h | 271 ++++++++++++++++++ .../array/function_array_register.cpp | 2 + .../array-functions/array_distinct.md | 79 +++++ .../array-functions/array_distinct.md | 78 +++++ gensrc/script/doris_builtins_functions.py | 12 + .../array_functions/test_array_functions.out | 17 ++ .../test_array_functions.groovy | 4 + 9 files changed, 492 insertions(+) create mode 100644 be/src/vec/functions/array/function_array_distinct.cpp create mode 100644 be/src/vec/functions/array/function_array_distinct.h create mode 100644 docs/en/docs/sql-manual/sql-functions/array-functions/array_distinct.md create mode 100644 docs/zh-CN/docs/sql-manual/sql-functions/array-functions/array_distinct.md diff --git a/be/src/vec/CMakeLists.txt b/be/src/vec/CMakeLists.txt index 3eb9ae6c5a..de46796cae 100644 --- a/be/src/vec/CMakeLists.txt +++ b/be/src/vec/CMakeLists.txt @@ -136,6 +136,7 @@ set(VEC_FILES functions/array/function_array_aggregation.cpp functions/array/function_array_utils.cpp functions/array/function_arrays_overlap.cpp + functions/array/function_array_distinct.cpp exprs/table_function/vexplode_json_array.cpp functions/math.cpp functions/function_bitmap.cpp diff --git a/be/src/vec/functions/array/function_array_distinct.cpp b/be/src/vec/functions/array/function_array_distinct.cpp new file mode 100644 index 0000000000..674d34d486 --- /dev/null +++ b/be/src/vec/functions/array/function_array_distinct.cpp @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// 
or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/functions/array/function_array_distinct.h" + +#include "vec/functions/simple_function_factory.h" + +namespace doris::vectorized { + +void register_function_array_distinct(SimpleFunctionFactory& factory) { + factory.register_function<FunctionArrayDistinct>(); +} + +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/functions/array/function_array_distinct.h b/be/src/vec/functions/array/function_array_distinct.h new file mode 100644 index 0000000000..7d9c989c29 --- /dev/null +++ b/be/src/vec/functions/array/function_array_distinct.h @@ -0,0 +1,271 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/array/arrayDistinct.cpp +// and modified by Doris +#pragma once + +#include "vec/columns/column_array.h" +#include "vec/columns/column_const.h" +#include "vec/common/hash_table/hash_set.h" +#include "vec/common/hash_table/hash_table.h" +#include "vec/common/sip_hash.h" +#include "vec/data_types/data_type_array.h" +#include "vec/data_types/data_type_number.h" +#include "vec/functions/function.h" +#include "vec/functions/function_helpers.h" +#include "vec/io/io_helper.h" + +namespace doris::vectorized { + +class FunctionArrayDistinct : public IFunction { +public: + static constexpr auto name = "array_distinct"; + static FunctionPtr create() { return std::make_shared<FunctionArrayDistinct>(); } + using NullMapType = PaddedPODArray<UInt8>; + + /// Get function name. 
+ String get_name() const override { return name; } + + bool is_variadic() const override { return false; } + + size_t get_number_of_arguments() const override { return 1; } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + DCHECK(is_array(arguments[0])) + << "first argument for function: " << name << " should be DataTypeArray" + << " and arguments[0] is " << arguments[0]->get_name(); + return arguments[0]; + } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) override { + ColumnPtr src_column = + block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); + const auto& src_column_array = check_and_get_column<ColumnArray>(*src_column); + if (!src_column_array) { + return Status::RuntimeError( + fmt::format("unsupported types for function {}({})", get_name(), + block.get_by_position(arguments[0]).type->get_name())); + } + const auto& src_offsets = src_column_array->get_offsets(); + const auto* src_nested_column = &src_column_array->get_data(); + DCHECK(src_nested_column != nullptr); + + DataTypePtr src_column_type = block.get_by_position(arguments[0]).type; + auto nested_type = assert_cast<const DataTypeArray&>(*src_column_type).get_nested_type(); + auto dest_column_ptr = ColumnArray::create(nested_type->create_column(), + ColumnArray::ColumnOffsets::create()); + IColumn* dest_nested_column = &dest_column_ptr->get_data(); + ColumnArray::Offsets& dest_offsets = dest_column_ptr->get_offsets(); + DCHECK(dest_nested_column != nullptr); + dest_nested_column->reserve(src_nested_column->size()); + dest_offsets.reserve(input_rows_count); + + const NullMapType* src_null_map = nullptr; + if (src_nested_column->is_nullable()) { + const ColumnNullable* src_nested_nullable_col = + check_and_get_column<ColumnNullable>(*src_nested_column); + src_nested_column = src_nested_nullable_col->get_nested_column_ptr(); + src_null_map = &src_nested_nullable_col->get_null_map_column().get_data(); + } + 
+ NullMapType* dest_null_map = nullptr; + if (dest_nested_column->is_nullable()) { + ColumnNullable* dest_nested_nullable_col = + reinterpret_cast<ColumnNullable*>(dest_nested_column); + dest_nested_column = dest_nested_nullable_col->get_nested_column_ptr(); + dest_null_map = &dest_nested_nullable_col->get_null_map_column().get_data(); + } + + auto res_val = _execute_by_type(*src_nested_column, src_offsets, *dest_nested_column, + dest_offsets, src_null_map, dest_null_map, nested_type); + if (!res_val) { + return Status::RuntimeError( + fmt::format("execute failed or unsupported types for function {}({})", + get_name(), block.get_by_position(arguments[0]).type->get_name())); + } + + block.replace_by_position(result, std::move(dest_column_ptr)); + return Status::OK(); + } + +private: + // Note: the hash set initially allocates memory for 2^5 = 32 elements. + static constexpr size_t INITIAL_SIZE_DEGREE = 5; + + template <typename ColumnType> + bool _execute_number(const IColumn& src_column, const ColumnArray::Offsets& src_offsets, + IColumn& dest_column, ColumnArray::Offsets& dest_offsets, + const NullMapType* src_null_map, NullMapType* dest_null_map) { + using NestType = typename ColumnType::value_type; + using ElementNativeType = typename NativeType<NestType>::Type; + + const ColumnType* src_data_concrete = reinterpret_cast<const ColumnType*>(&src_column); + if (!src_data_concrete) { + return false; + } + const PaddedPODArray<NestType>& src_datas = src_data_concrete->get_data(); + + ColumnType& dest_data_concrete = reinterpret_cast<ColumnType&>(dest_column); + PaddedPODArray<NestType>& dest_datas = dest_data_concrete.get_data(); + + using Set = HashSetWithStackMemory<ElementNativeType, DefaultHash<ElementNativeType>, + INITIAL_SIZE_DEGREE>; + Set set; + + ColumnArray::Offset prev_src_offset = 0; + ColumnArray::Offset res_offset = 0; + + for (auto curr_src_offset : src_offsets) { + set.clear(); + size_t null_size = 0; + for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) { + if (src_null_map && (*src_null_map)[j]) { + DCHECK(dest_null_map != nullptr); + (*dest_null_map).push_back(true); + // 
Note: push a placeholder value that is never used for output, so the + // data column grows together with the null map and the array offsets + dest_datas.push_back(NestType()); + null_size++; + continue; + } + + if (!set.find(src_datas[j])) { + set.insert(src_datas[j]); + dest_datas.push_back(src_datas[j]); + if (dest_null_map) { + (*dest_null_map).push_back(false); + } + } + } + + res_offset += set.size() + null_size; + dest_offsets.push_back(res_offset); + prev_src_offset = curr_src_offset; + } + + return true; + } + + bool _execute_string(const IColumn& src_column, const ColumnArray::Offsets& src_offsets, + IColumn& dest_column, ColumnArray::Offsets& dest_offsets, + const NullMapType* src_null_map, NullMapType* dest_null_map) { + const ColumnString* src_data_concrete = reinterpret_cast<const ColumnString*>(&src_column); + if (!src_data_concrete) { + return false; + } + + ColumnString& dest_column_string = reinterpret_cast<ColumnString&>(dest_column); + ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); + ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); + column_string_chars.reserve(src_column.size()); + + using Set = HashSetWithStackMemory<StringRef, DefaultHash<StringRef>, INITIAL_SIZE_DEGREE>; + Set set; + + ColumnArray::Offset prev_src_offset = 0; + ColumnArray::Offset res_offset = 0; + + for (auto curr_src_offset : src_offsets) { + set.clear(); + size_t null_size = 0; + for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) { + if (src_null_map && (*src_null_map)[j]) { + DCHECK(dest_null_map != nullptr); + // Note: here we need to update the offset of ColumnString + column_string_offsets.push_back(column_string_offsets.back()); + (*dest_null_map).push_back(true); + null_size++; + continue; + } + + StringRef src_str_ref = src_data_concrete->get_data_at(j); + if (!set.find(src_str_ref)) { + set.insert(src_str_ref); + // copy the src data to column_string_chars + const size_t old_size = column_string_chars.size(); + const size_t new_size = old_size + src_str_ref.size + 1; + 
column_string_chars.resize(new_size); + if (src_str_ref.size > 0) { + memcpy(column_string_chars.data() + old_size, src_str_ref.data, + src_str_ref.size); + } + column_string_chars[old_size + src_str_ref.size] = 0; + column_string_offsets.push_back(new_size); + + if (dest_null_map) { + (*dest_null_map).push_back(false); + } + } + } + + res_offset += set.size() + null_size; + dest_offsets.push_back(res_offset); + prev_src_offset = curr_src_offset; + } + return true; + } + + bool _execute_by_type(const IColumn& src_column, const ColumnArray::Offsets& src_offsets, + IColumn& dest_column, ColumnArray::Offsets& dest_offsets, + const NullMapType* src_null_map, NullMapType* dest_null_map, + DataTypePtr& nested_type) { + bool res = false; + WhichDataType which(remove_nullable(nested_type)); + if (which.is_uint8()) { + res = _execute_number<ColumnUInt8>(src_column, src_offsets, dest_column, dest_offsets, + src_null_map, dest_null_map); + } else if (which.is_int8()) { + res = _execute_number<ColumnInt8>(src_column, src_offsets, dest_column, dest_offsets, + src_null_map, dest_null_map); + } else if (which.is_int16()) { + res = _execute_number<ColumnInt16>(src_column, src_offsets, dest_column, dest_offsets, + src_null_map, dest_null_map); + } else if (which.is_int32()) { + res = _execute_number<ColumnInt32>(src_column, src_offsets, dest_column, dest_offsets, + src_null_map, dest_null_map); + } else if (which.is_int64()) { + res = _execute_number<ColumnInt64>(src_column, src_offsets, dest_column, dest_offsets, + src_null_map, dest_null_map); + } else if (which.is_int128()) { + res = _execute_number<ColumnInt128>(src_column, src_offsets, dest_column, dest_offsets, + src_null_map, dest_null_map); + } else if (which.is_float32()) { + res = _execute_number<ColumnFloat32>(src_column, src_offsets, dest_column, dest_offsets, + src_null_map, dest_null_map); + } else if (which.is_float64()) { + res = _execute_number<ColumnFloat64>(src_column, src_offsets, dest_column, dest_offsets, + src_null_map, dest_null_map); + } else if (which.is_date()) { + res = _execute_number<ColumnDate>(src_column, src_offsets, 
dest_column, dest_offsets, + src_null_map, dest_null_map); + } else if (which.is_date_time()) { + res = _execute_number<ColumnDateTime>(src_column, src_offsets, dest_column, + dest_offsets, src_null_map, dest_null_map); + } else if (which.is_decimal128()) { + res = _execute_number<ColumnDecimal128>(src_column, src_offsets, dest_column, + dest_offsets, src_null_map, dest_null_map); + } else if (which.is_string()) { + res = _execute_string(src_column, src_offsets, dest_column, dest_offsets, src_null_map, + dest_null_map); + } + return res; + } +}; + +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/functions/array/function_array_register.cpp b/be/src/vec/functions/array/function_array_register.cpp index cb5a091c91..05597de95c 100644 --- a/be/src/vec/functions/array/function_array_register.cpp +++ b/be/src/vec/functions/array/function_array_register.cpp @@ -27,6 +27,7 @@ void register_function_array_index(SimpleFunctionFactory&); void register_function_array_size(SimpleFunctionFactory&); void register_function_array_aggregation(SimpleFunctionFactory&); void register_function_arrays_overlap(SimpleFunctionFactory&); +void register_function_array_distinct(SimpleFunctionFactory&); void register_function_array(SimpleFunctionFactory& factory) { register_function_array_element(factory); @@ -34,6 +35,7 @@ void register_function_array(SimpleFunctionFactory& factory) { register_function_array_size(factory); register_function_array_aggregation(factory); register_function_arrays_overlap(factory); + register_function_array_distinct(factory); } } // namespace doris::vectorized diff --git a/docs/en/docs/sql-manual/sql-functions/array-functions/array_distinct.md b/docs/en/docs/sql-manual/sql-functions/array-functions/array_distinct.md new file mode 100644 index 0000000000..5877349e58 --- /dev/null +++ b/docs/en/docs/sql-manual/sql-functions/array-functions/array_distinct.md @@ -0,0 +1,79 @@ +--- +{ + "title": "array_distinct", + "language": "en" +} +--- + + + +## array_distinct + 
+### description + +#### Syntax + +``` +ARRAY<T> array_distinct(ARRAY<T> arr) +``` + +Returns the input array with duplicate values removed; NULL elements are kept as-is and are not deduplicated. +Returns NULL for NULL input. + +### notice + +`Only supported in vectorized engine` + +### example + +``` +mysql> set enable_vectorized_engine=true; + +mysql> select k1, k2, array_distinct(k2) from array_test; ++------+-----------------------------+---------------------------+ +| k1 | k2 | array_distinct(k2) | ++------+-----------------------------+---------------------------+ +| 1 | [1, 2, 3, 4, 5] | [1, 2, 3, 4, 5] | +| 2 | [6, 7, 8] | [6, 7, 8] | +| 3 | [] | [] | +| 4 | NULL | NULL | +| 5 | [1, 2, 3, 4, 5, 4, 3, 2, 1] | [1, 2, 3, 4, 5] | +| 6 | [1, 2, 3, NULL] | [1, 2, 3, NULL] | +| 7 | [1, 2, 3, NULL, NULL] | [1, 2, 3, NULL, NULL] | ++------+-----------------------------+---------------------------+ + +mysql> select k1, k2, array_distinct(k2) from array_test01; ++------+------------------------------------------+---------------------------+ +| k1 | k2 | array_distinct(`k2`) | ++------+------------------------------------------+---------------------------+ +| 1 | ['a', 'b', 'c', 'd', 'e'] | ['a', 'b', 'c', 'd', 'e'] | +| 2 | ['f', 'g', 'h'] | ['f', 'g', 'h'] | +| 3 | [''] | [''] | +| 3 | [NULL] | [NULL] | +| 5 | ['a', 'b', 'c', 'd', 'e', 'a', 'b', 'c'] | ['a', 'b', 'c', 'd', 'e'] | +| 6 | NULL | NULL | +| 7 | ['a', 'b', NULL] | ['a', 'b', NULL] | +| 8 | ['a', 'b', NULL, NULL] | ['a', 'b', NULL, NULL] | ++------+------------------------------------------+---------------------------+ +``` + +### keywords + +ARRAY, DISTINCT, ARRAY_DISTINCT diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/array-functions/array_distinct.md b/docs/zh-CN/docs/sql-manual/sql-functions/array-functions/array_distinct.md new file mode 100644 index 0000000000..a0f263b0f9 --- /dev/null +++ b/docs/zh-CN/docs/sql-manual/sql-functions/array-functions/array_distinct.md @@ -0,0 +1,78 @@ +--- +{ + "title": "array_distinct", + "language": "zh-CN" +} +--- + + 
+ +## array_distinct + +### description + +#### Syntax + +``` +ARRAY<T> array_distinct(ARRAY<T> arr) +``` + +返回去除了重复元素的数组(数组中的NULL元素会原样保留,不参与去重),如果输入数组为NULL,则返回NULL。 + +### notice + +`仅支持向量化引擎中使用` + +### example + +``` +mysql> set enable_vectorized_engine=true; + +mysql> select k1, k2, array_distinct(k2) from array_test; ++------+-----------------------------+---------------------------+ +| k1 | k2 | array_distinct(k2) | ++------+-----------------------------+---------------------------+ +| 1 | [1, 2, 3, 4, 5] | [1, 2, 3, 4, 5] | +| 2 | [6, 7, 8] | [6, 7, 8] | +| 3 | [] | [] | +| 4 | NULL | NULL | +| 5 | [1, 2, 3, 4, 5, 4, 3, 2, 1] | [1, 2, 3, 4, 5] | +| 6 | [1, 2, 3, NULL] | [1, 2, 3, NULL] | +| 7 | [1, 2, 3, NULL, NULL] | [1, 2, 3, NULL, NULL] | ++------+-----------------------------+---------------------------+ + +mysql> select k1, k2, array_distinct(k2) from array_test01; ++------+------------------------------------------+---------------------------+ +| k1 | k2 | array_distinct(`k2`) | ++------+------------------------------------------+---------------------------+ +| 1 | ['a', 'b', 'c', 'd', 'e'] | ['a', 'b', 'c', 'd', 'e'] | +| 2 | ['f', 'g', 'h'] | ['f', 'g', 'h'] | +| 3 | [''] | [''] | +| 3 | [NULL] | [NULL] | +| 5 | ['a', 'b', 'c', 'd', 'e', 'a', 'b', 'c'] | ['a', 'b', 'c', 'd', 'e'] | +| 6 | NULL | NULL | +| 7 | ['a', 'b', NULL] | ['a', 'b', NULL] | +| 8 | ['a', 'b', NULL, NULL] | ['a', 'b', NULL, NULL] | ++------+------------------------------------------+---------------------------+ +``` + +### keywords + +ARRAY, DISTINCT, ARRAY_DISTINCT diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index f95bc7c55b..04d4477e4c 100755 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -173,6 +173,18 @@ visible_functions = [ [['array_position'], 'BIGINT', ['ARRAY_STRING', 'STRING'], '', '', '', 'vec', ''], [['cardinality', 'size'], 'BIGINT', ['ARRAY'], '', '', '', 'vec', ''], + [['array_distinct'], 
'ARRAY_TINYINT', ['ARRAY_TINYINT'], '', '', '', 'vec', ''], + [['array_distinct'], 'ARRAY_SMALLINT', ['ARRAY_SMALLINT'], '', '', '', 'vec', ''], + [['array_distinct'], 'ARRAY_INT', ['ARRAY_INT'], '', '', '', 'vec', ''], + [['array_distinct'], 'ARRAY_BIGINT', ['ARRAY_BIGINT'], '', '', '', 'vec', ''], + [['array_distinct'], 'ARRAY_LARGEINT', ['ARRAY_LARGEINT'], '', '', '', 'vec', ''], + [['array_distinct'], 'ARRAY_DATETIME', ['ARRAY_DATETIME'], '', '', '', 'vec', ''], + [['array_distinct'], 'ARRAY_DATE', ['ARRAY_DATE'], '', '', '', 'vec', ''], + [['array_distinct'], 'ARRAY_FLOAT', ['ARRAY_FLOAT'], '', '', '', 'vec', ''], + [['array_distinct'], 'ARRAY_DOUBLE', ['ARRAY_DOUBLE'], '', '', '', 'vec', ''], + [['array_distinct'], 'ARRAY_DECIMALV2', ['ARRAY_DECIMALV2'], '', '', '', 'vec', ''], + [['array_distinct'], 'ARRAY_VARCHAR', ['ARRAY_VARCHAR'], '', '', '', 'vec', ''], + [['array_distinct'], 'ARRAY_STRING', ['ARRAY_STRING'], '', '', '', 'vec', ''], [['array_min'], 'TINYINT', ['ARRAY_TINYINT'], '', '', '', 'vec', 'ALWAYS_NULLABLE'], [['array_min'], 'SMALLINT', ['ARRAY_SMALLINT'], '', '', '', 'vec', 'ALWAYS_NULLABLE'], diff --git a/regression-test/data/query/sql_functions/array_functions/test_array_functions.out b/regression-test/data/query/sql_functions/array_functions/test_array_functions.out index 7ba627d097..3f5b7b7ba2 100644 --- a/regression-test/data/query/sql_functions/array_functions/test_array_functions.out +++ b/regression-test/data/query/sql_functions/array_functions/test_array_functions.out @@ -3,14 +3,31 @@ 1 3 3 2 1 \N 3 0 0 +4 9 0 +5 0 7 +6 9 7 -- !select -- 1 3 3 2 1 \N 3 0 0 +4 9 0 +5 0 7 +6 9 7 -- !select -- 1 true 2 false 3 \N +4 false +5 \N +6 \N + +-- !select -- +1 [1, 2, 3] ['a', 'b', ''] +2 [4] \N +3 [] [] +4 [1, 2, 3, 4, 5] [] +5 [] ['a', 'b', 'c', 'd'] +6 [1, 2, 3, 4, 5] ['a', 'b', 'c', 'd'] diff --git a/regression-test/suites/query/sql_functions/array_functions/test_array_functions.groovy 
b/regression-test/suites/query/sql_functions/array_functions/test_array_functions.groovy index 3c73c11f6c..93fb380126 100644 --- a/regression-test/suites/query/sql_functions/array_functions/test_array_functions.groovy +++ b/regression-test/suites/query/sql_functions/array_functions/test_array_functions.groovy @@ -40,8 +40,12 @@ suite("test_array_functions", "query") { sql """ INSERT INTO ${tableName} VALUES(1, [1, 2, 3], ["a", "b", ""], [1, 2]) """ sql """ INSERT INTO ${tableName} VALUES(2, [4], NULL, [5]) """ sql """ INSERT INTO ${tableName} VALUES(3, [], [], NULL) """ + sql """ INSERT INTO ${tableName} VALUES(4, [1, 2, 3, 4, 5, 4, 3, 2, 1], [], []) """ + sql """ INSERT INTO ${tableName} VALUES(5, [], ["a", "b", "c", "d", "c", "b", "a"], NULL) """ + sql """ INSERT INTO ${tableName} VALUES(6, [1, 2, 3, 4, 5, 4, 3, 2, 1], ["a", "b", "c", "d", "c", "b", "a"], NULL) """ qt_select "SELECT k1, size(k2), size(k3) FROM ${tableName} ORDER BY k1" qt_select "SELECT k1, cardinality(k2), cardinality(k3) FROM ${tableName} ORDER BY k1" qt_select "SELECT k1, arrays_overlap(k2, k4) FROM ${tableName} ORDER BY k1" + qt_select "SELECT k1, array_distinct(k2), array_distinct(k3) FROM ${tableName} ORDER BY k1" }