// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. // This file is copied from // https://github.com/ClickHouse/ClickHouse/blob/master/src/AggregateFunctions/AggregateFunctionUniq.h // and modified by Doris #pragma once #include #include #include #include #include #include // IWYU pragma: no_include #include "common/compiler_util.h" // IWYU pragma: keep #include "vec/aggregate_functions/aggregate_function.h" #include "vec/columns/column.h" #include "vec/columns/column_vector.h" #include "vec/columns/columns_number.h" #include "vec/common/assert_cast.h" #include "vec/common/hash_table/hash.h" #include "vec/common/hash_table/phmap_fwd_decl.h" #include "vec/common/sip_hash.h" #include "vec/common/string_ref.h" #include "vec/common/uint128.h" #include "vec/core/types.h" #include "vec/data_types/data_type_number.h" #include "vec/io/io_helper.h" #include "vec/io/var_int.h" namespace doris { namespace vectorized { class Arena; class BufferReadable; class BufferWritable; template class ColumnDecimal; } // namespace vectorized } // namespace doris template struct HashCRC32; namespace doris::vectorized { /// uniqExact template struct AggregateFunctionUniqExactData { static constexpr bool is_string_key = std::is_same_v; using Key = std::conditional_t; using Hash = std::conditional_t>; using Set = flat_hash_set; static UInt128 ALWAYS_INLINE get_key(const StringRef& value) { UInt128 key; SipHash hash; hash.update(value.data, value.size); hash.get128(key.low, key.high); return key; } Set set; static String get_name() { return "uniqExact"; } }; namespace detail { /** The structure for the delegation work to add one element to the `uniq` aggregate functions. * Used for partial specialization to add strings. */ template struct OneAdder { static void ALWAYS_INLINE add(Data& data, const IColumn& column, size_t row_num) { if constexpr (std::is_same_v) { StringRef value = column.get_data_at(row_num); data.set.insert(Data::get_key(value)); } else if constexpr (IsDecimalNumber) { data.set.insert(assert_cast&>(column).get_data()[row_num]); } else { data.set.insert(assert_cast&>(column).get_data()[row_num]); } } }; } // namespace detail /// Calculates the number of different values approximately or exactly. template class AggregateFunctionUniq final : public IAggregateFunctionDataHelper> { public: using KeyType = std::conditional_t, UInt128, T>; AggregateFunctionUniq(const DataTypes& argument_types_) : IAggregateFunctionDataHelper>(argument_types_) {} String get_name() const override { return Data::get_name(); } DataTypePtr get_return_type() const override { return std::make_shared(); } void add(AggregateDataPtr __restrict place, const IColumn** columns, size_t row_num, Arena*) const override { detail::OneAdder::add(this->data(place), *columns[0], row_num); } static ALWAYS_INLINE const KeyType* get_keys(std::vector& keys_container, const IColumn& column, size_t batch_size) { if constexpr (std::is_same_v) { keys_container.resize(batch_size); for (size_t i = 0; i != batch_size; ++i) { StringRef value = column.get_data_at(i); keys_container[i] = Data::get_key(value); } return keys_container.data(); } else { using ColumnType = std::conditional_t, ColumnDecimal, ColumnVector>; return assert_cast(column).get_data().data(); } } void add_batch(size_t batch_size, AggregateDataPtr* places, size_t place_offset, const IColumn** columns, Arena* arena, bool /*agg_many*/) const override { std::vector keys_container; const KeyType* keys = get_keys(keys_container, *columns[0], batch_size); std::vector array_of_data_set(batch_size); for (size_t i = 0; i != batch_size; ++i) { array_of_data_set[i] = &(this->data(places[i] + place_offset).set); } for (size_t i = 0; i != batch_size; ++i) { if (i + HASH_MAP_PREFETCH_DIST < batch_size) { array_of_data_set[i + HASH_MAP_PREFETCH_DIST]->prefetch( keys[i + HASH_MAP_PREFETCH_DIST]); } array_of_data_set[i]->insert(keys[i]); } } void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena*) const override { auto& rhs_set = this->data(rhs).set; if (rhs_set.size() == 0) return; auto& set = this->data(place).set; set.rehash(set.size() + rhs_set.size()); for (auto elem : rhs_set) { set.insert(elem); } } void add_batch_single_place(size_t batch_size, AggregateDataPtr place, const IColumn** columns, Arena* arena) const override { std::vector keys_container; const KeyType* keys = get_keys(keys_container, *columns[0], batch_size); auto& set = this->data(place).set; for (size_t i = 0; i != batch_size; ++i) { if (i + HASH_MAP_PREFETCH_DIST < batch_size) { set.prefetch(keys[i + HASH_MAP_PREFETCH_DIST]); } set.insert(keys[i]); } } void serialize(ConstAggregateDataPtr __restrict place, BufferWritable& buf) const override { auto& set = this->data(place).set; write_var_uint(set.size(), buf); for (const auto& elem : set) { write_pod_binary(elem, buf); } } void deserialize_and_merge(AggregateDataPtr __restrict place, AggregateDataPtr __restrict rhs, BufferReadable& buf, Arena* arena) const override { auto& set = this->data(place).set; UInt64 size; read_var_uint(size, buf); set.rehash(size + set.size()); for (size_t i = 0; i < size; ++i) { KeyType ref; read_pod_binary(ref, buf); set.insert(ref); } } void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf, Arena* arena) const override { auto& set = this->data(place).set; UInt64 size; read_var_uint(size, buf); set.rehash(size + set.size()); for (size_t i = 0; i < size; ++i) { KeyType ref; read_pod_binary(ref, buf); set.insert(ref); } } void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const override { assert_cast(to).get_data().push_back(this->data(place).set.size()); } }; } // namespace doris::vectorized