Currently, there are some useless includes in the codebase. We can use a tool named include-what-you-use to optimize these includes. By using a strict include-what-you-use policy, we can get lots of benefits from it.
325 lines
12 KiB
C++
325 lines
12 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
#include "vec/utils/histogram_helpers.hpp"
|
|
|
|
#include <gtest/gtest-message.h>
|
|
#include <gtest/gtest-test-part.h>
|
|
|
|
#include <algorithm>
|
|
#include <map>
|
|
#include <vector>
|
|
|
|
#include "gtest/gtest_pred_impl.h"
|
|
#include "vec/data_types/data_type_number.h"
|
|
|
|
namespace doris::vectorized {
|
|
|
|
// Define a helper function to create a test map with specified number of distinct values
|
|
// and total counts.
|
|
template <typename T>
|
|
std::map<T, size_t> create_test_map(const std::vector<T>& values,
|
|
const std::vector<size_t>& counts) {
|
|
std::map<T, size_t> test_map;
|
|
for (size_t i = 0; i < values.size(); ++i) {
|
|
test_map[values[i]] = counts[i];
|
|
}
|
|
return test_map;
|
|
}
|
|
|
|
/******************test build_histogram******************/
|
|
|
|
// Test case 1: Test when input map is empty.
|
|
TEST(BuildHistogramTest, empty_map) {
|
|
std::vector<Bucket<int>> buckets;
|
|
std::map<int, size_t> ordered_map;
|
|
size_t num_buckets = 10;
|
|
EXPECT_FALSE(build_histogram(buckets, ordered_map, num_buckets));
|
|
EXPECT_EQ(buckets.size(), 0);
|
|
}
|
|
|
|
// Test case 2: Test when input map has only one element.
|
|
TEST(BuildHistogramTest, single_element) {
|
|
std::vector<Bucket<int>> buckets;
|
|
std::map<int, size_t> ordered_map = {{1, 5}};
|
|
size_t num_buckets = 10;
|
|
EXPECT_TRUE(build_histogram(buckets, ordered_map, num_buckets));
|
|
EXPECT_EQ(buckets.size(), 1);
|
|
EXPECT_EQ(buckets[0].lower, 1);
|
|
EXPECT_EQ(buckets[0].upper, 1);
|
|
EXPECT_EQ(buckets[0].ndv, 1);
|
|
EXPECT_EQ(buckets[0].count, 5);
|
|
EXPECT_EQ(buckets[0].pre_sum, 0);
|
|
}
|
|
|
|
// Test case 3: Test when num_buckets >= number of distinct elements in input map.
|
|
TEST(BuildHistogramTest, enough_buckets) {
|
|
std::vector<Bucket<int>> buckets;
|
|
std::map<int, size_t> test_map = create_test_map<int>({1, 2, 3}, {10, 20, 30});
|
|
size_t num_buckets = test_map.size();
|
|
EXPECT_TRUE(build_histogram(buckets, test_map, num_buckets));
|
|
|
|
EXPECT_EQ(buckets.size(), num_buckets);
|
|
EXPECT_EQ(buckets[0].lower, 1);
|
|
EXPECT_EQ(buckets[0].upper, 1);
|
|
EXPECT_EQ(buckets[0].ndv, 1);
|
|
EXPECT_EQ(buckets[0].count, 10);
|
|
EXPECT_EQ(buckets[0].pre_sum, 0);
|
|
|
|
EXPECT_EQ(buckets[1].lower, 2);
|
|
EXPECT_EQ(buckets[1].upper, 2);
|
|
EXPECT_EQ(buckets[1].ndv, 1);
|
|
EXPECT_EQ(buckets[1].count, 20);
|
|
EXPECT_EQ(buckets[1].pre_sum, 10);
|
|
|
|
EXPECT_EQ(buckets[2].lower, 3);
|
|
EXPECT_EQ(buckets[2].upper, 3);
|
|
EXPECT_EQ(buckets[2].ndv, 1);
|
|
EXPECT_EQ(buckets[2].count, 30);
|
|
EXPECT_EQ(buckets[2].pre_sum, 30);
|
|
}
|
|
|
|
// Test case 4: Test when num_buckets < number of distinct elements in input map.
|
|
TEST(BuildHistogramTest, not_enough_buckets) {
|
|
std::vector<Bucket<int>> buckets;
|
|
std::map<int, size_t> test_map = create_test_map<int>({1, 2, 3, 4}, {5, 10, 20, 30});
|
|
size_t num_buckets = 2;
|
|
EXPECT_TRUE(build_histogram(buckets, test_map, num_buckets));
|
|
EXPECT_EQ(buckets.size(), num_buckets);
|
|
EXPECT_EQ(buckets[0].lower, 1);
|
|
EXPECT_EQ(buckets[0].upper, 3);
|
|
EXPECT_EQ(buckets[0].ndv, 3);
|
|
EXPECT_EQ(buckets[0].count, 35);
|
|
EXPECT_EQ(buckets[0].pre_sum, 0);
|
|
|
|
EXPECT_EQ(buckets[1].lower, 4);
|
|
EXPECT_EQ(buckets[1].upper, 4);
|
|
EXPECT_EQ(buckets[1].ndv, 1);
|
|
EXPECT_EQ(buckets[1].count, 30);
|
|
EXPECT_EQ(buckets[1].pre_sum, 35);
|
|
}
|
|
|
|
// Test case 5: Test when the sum of counts of two adjacent elements is larger than a single bucket's capacity.
|
|
TEST(BuildHistogramTest, two_adjacent_values_larger_than_capacity) {
|
|
std::vector<Bucket<int>> buckets;
|
|
std::map<int, size_t> test_map = create_test_map<int>({1, 2, 3, 4}, {20, 30, 40, 50});
|
|
size_t num_buckets = 3;
|
|
EXPECT_TRUE(build_histogram(buckets, test_map, num_buckets));
|
|
EXPECT_EQ(buckets[0].lower, 1);
|
|
EXPECT_EQ(buckets[0].upper, 2);
|
|
EXPECT_EQ(buckets.size(), num_buckets);
|
|
EXPECT_EQ(buckets[0].ndv, 2);
|
|
EXPECT_EQ(buckets[0].count, 50);
|
|
EXPECT_EQ(buckets[0].pre_sum, 0);
|
|
|
|
EXPECT_EQ(buckets[1].lower, 3);
|
|
EXPECT_EQ(buckets[1].upper, 3);
|
|
EXPECT_EQ(buckets[1].ndv, 1);
|
|
EXPECT_EQ(buckets[1].count, 40);
|
|
EXPECT_EQ(buckets[1].pre_sum, 50);
|
|
|
|
EXPECT_EQ(buckets[2].lower, 4);
|
|
EXPECT_EQ(buckets[2].upper, 4);
|
|
EXPECT_EQ(buckets[2].ndv, 1);
|
|
EXPECT_EQ(buckets[2].count, 50);
|
|
EXPECT_EQ(buckets[2].pre_sum, 90);
|
|
}
|
|
|
|
// Test case 6: Test when the sum of counts of all elements is smaller than a single bucket's capacity.
|
|
TEST(BuildHistogramTest, all_values_in_one_bucket) {
|
|
std::vector<Bucket<int>> buckets;
|
|
std::map<int, size_t> test_map = create_test_map<int>({1, 2, 3}, {5, 10, 15});
|
|
size_t num_buckets = 1;
|
|
EXPECT_TRUE(build_histogram(buckets, test_map, num_buckets));
|
|
EXPECT_EQ(buckets[0].lower, 1);
|
|
EXPECT_EQ(buckets[0].upper, 3);
|
|
EXPECT_EQ(buckets.size(), num_buckets);
|
|
EXPECT_EQ(buckets[0].ndv, 3);
|
|
EXPECT_EQ(buckets[0].count, 30);
|
|
EXPECT_EQ(buckets[0].pre_sum, 0);
|
|
}
|
|
|
|
// Test case 7: Test when the sum of counts of all elements is larger than the total capacity of all buckets.
|
|
TEST(BuildHistogramTest, all_values_in_multiple_buckets) {
|
|
std::vector<Bucket<int>> buckets;
|
|
std::map<int, size_t> test_map =
|
|
create_test_map<int>({1, 2, 3, 4, 5, 6}, {100, 200, 300, 400, 500, 600});
|
|
size_t num_buckets = 4;
|
|
EXPECT_TRUE(build_histogram(buckets, test_map, num_buckets));
|
|
// Check if the buckets are constructed correctly
|
|
EXPECT_EQ(buckets.size(), num_buckets);
|
|
EXPECT_EQ(buckets[0].lower, 1);
|
|
EXPECT_EQ(buckets[0].upper, 3);
|
|
EXPECT_EQ(buckets[0].ndv, 3);
|
|
EXPECT_EQ(buckets[0].count, 600);
|
|
EXPECT_EQ(buckets[0].pre_sum, 0);
|
|
|
|
EXPECT_EQ(buckets[1].lower, 4);
|
|
EXPECT_EQ(buckets[1].upper, 4);
|
|
EXPECT_EQ(buckets[1].ndv, 1);
|
|
EXPECT_EQ(buckets[1].count, 400);
|
|
EXPECT_EQ(buckets[1].pre_sum, 600);
|
|
|
|
EXPECT_EQ(buckets[2].lower, 5);
|
|
EXPECT_EQ(buckets[2].upper, 5);
|
|
EXPECT_EQ(buckets[2].ndv, 1);
|
|
EXPECT_EQ(buckets[2].count, 500);
|
|
EXPECT_EQ(buckets[2].pre_sum, 1000);
|
|
|
|
EXPECT_EQ(buckets[3].lower, 6);
|
|
EXPECT_EQ(buckets[3].upper, 6);
|
|
EXPECT_EQ(buckets[3].ndv, 1);
|
|
EXPECT_EQ(buckets[3].count, 600);
|
|
EXPECT_EQ(buckets[3].pre_sum, 1500);
|
|
|
|
// Check if the pre_sum values are correct.
|
|
std::vector<size_t> pre_sum(num_buckets + 1, 0);
|
|
for (size_t i = 0; i < num_buckets; ++i) {
|
|
pre_sum[i + 1] = pre_sum[i] + buckets[i].count;
|
|
}
|
|
for (size_t i = 0; i < buckets.size(); ++i) {
|
|
EXPECT_EQ(buckets[i].pre_sum, pre_sum[i]);
|
|
}
|
|
}
|
|
|
|
/*****************test histogram_to_json*****************/
|
|
|
|
// Test case 1: Empty buckets
|
|
TEST(HistogramToJsonTest, empty_buckets) {
|
|
rapidjson::StringBuffer buffer;
|
|
std::vector<Bucket<int>> buckets;
|
|
auto data_type = std::make_shared<DataTypeInt8>();
|
|
bool result = histogram_to_json(buffer, buckets, data_type);
|
|
std::string result_str = std::string(buffer.GetString());
|
|
std::string expect_result_str = "{\"num_buckets\":0,\"buckets\":[]}";
|
|
EXPECT_FALSE(result);
|
|
EXPECT_EQ(result_str, expect_result_str);
|
|
}
|
|
|
|
// Test case 2: Non-empty buckets
|
|
TEST(HistogramToJsonTest, non_empty_buckets) {
|
|
rapidjson::StringBuffer buffer;
|
|
std::vector<Bucket<int>> buckets = {{0, 10, 5, 20, 100}, {10, 20, 10, 30, 200}};
|
|
auto data_type = std::make_shared<DataTypeInt8>();
|
|
bool result = histogram_to_json(buffer, buckets, data_type);
|
|
std::string result_str = std::string(buffer.GetString());
|
|
std::string expect_result_str =
|
|
"{\"num_buckets\":2,\"buckets\":[{\"lower\":\"0\",\"upper\":\"10\",\"ndv\":5,"
|
|
"\"count\":20,\"pre_sum\":100},{\"lower\":\"10\",\"upper\":\"20\",\"ndv\":10,"
|
|
"\"count\":30,\"pre_sum\":200}]}";
|
|
EXPECT_TRUE(result);
|
|
EXPECT_EQ(result_str, expect_result_str);
|
|
}
|
|
|
|
/************test calculate_bucket_max_values************/
|
|
|
|
// Test case 1: Test with a map of size 1 and num_buckets = 1
|
|
TEST(CalculateBucketMaxValuesTest, single_value_one_bucket) {
|
|
std::map<int, size_t> value_map = {{1, 1}};
|
|
const size_t num_buckets = 1;
|
|
const size_t expected_max_values = 1;
|
|
EXPECT_EQ(calculate_bucket_max_values(value_map, num_buckets), expected_max_values);
|
|
}
|
|
|
|
// Test case 2: Test with a map of size 1 and num_buckets > 1
|
|
TEST(CalculateBucketMaxValuesTest, single_value_multiple_buckets) {
|
|
std::map<int, size_t> value_map = {{1, 1}};
|
|
const size_t num_buckets = 10;
|
|
const size_t expected_max_values = 1;
|
|
EXPECT_EQ(calculate_bucket_max_values(value_map, num_buckets), expected_max_values);
|
|
}
|
|
|
|
// Test case 3: Test with a map of size 2 and num_buckets = 1
|
|
TEST(CalculateBucketMaxValuesTest, multiple_values_one_bucket) {
|
|
std::map<int, size_t> value_map = {{1, 1}, {2, 2}};
|
|
const size_t num_buckets = 1;
|
|
const size_t expected_max_values = 3;
|
|
EXPECT_EQ(calculate_bucket_max_values(value_map, num_buckets), expected_max_values);
|
|
}
|
|
|
|
// Test case 4: Test with a map of size 2 and num_buckets > 1
|
|
TEST(CalculateBucketMaxValuesTest, multiple_values_multiple_buckets) {
|
|
std::map<int, size_t> value_map = {{1, 1}, {2, 2}};
|
|
const size_t num_buckets = 2;
|
|
const size_t expected_max_values = 1;
|
|
EXPECT_EQ(calculate_bucket_max_values(value_map, num_buckets), expected_max_values);
|
|
}
|
|
|
|
// Test case 5: Test with a map of size 3 and num_buckets > 1
|
|
TEST(CalculateBucketMaxValuesTest, multiple_values_multiple_buckets2) {
|
|
std::map<int, size_t> value_map = create_test_map<int>({1, 2, 3}, {2, 2, 2});
|
|
const size_t num_buckets = 3;
|
|
const size_t expected_max_values = 2;
|
|
EXPECT_EQ(calculate_bucket_max_values(value_map, num_buckets), expected_max_values);
|
|
}
|
|
|
|
// Test case 6: Test with a map of size 3 and num_buckets > 1
|
|
TEST(CalculateBucketMaxValuesTest, multiple_values_multiple_buckets3) {
|
|
std::map<int, size_t> value_map = create_test_map<int>({1, 2, 3}, {2, 2, 2});
|
|
const size_t num_buckets = 4;
|
|
const size_t expected_max_values = 1;
|
|
EXPECT_EQ(calculate_bucket_max_values(value_map, num_buckets), expected_max_values);
|
|
}
|
|
|
|
/**************test can_assign_into_buckets**************/
|
|
|
|
// Test case 1: Test with empty input map.
|
|
TEST(CanAssignIntoBucketsTest, empty_map) {
|
|
std::map<int, size_t> empty_map;
|
|
EXPECT_FALSE(can_assign_into_buckets(empty_map, 1, 1));
|
|
}
|
|
|
|
// Test case 2: Test with a single element in input map.
|
|
TEST(CanAssignIntoBucketsTest, one_element_map) {
|
|
std::map<int, size_t> test_map = {{42, 10}};
|
|
EXPECT_FALSE(can_assign_into_buckets(test_map, 1, 1));
|
|
EXPECT_FALSE(can_assign_into_buckets(test_map, 5, 1));
|
|
EXPECT_TRUE(can_assign_into_buckets(test_map, 5, 2));
|
|
}
|
|
|
|
// Test case 3: Test with enough buckets to store all distinct elements.
|
|
TEST(CanAssignIntoBucketsTest, enough_buckets) {
|
|
std::map<int, size_t> test_map = create_test_map<int>({1, 2, 3}, {10, 20, 30});
|
|
EXPECT_FALSE(can_assign_into_buckets(test_map, 1, 3));
|
|
EXPECT_TRUE(can_assign_into_buckets(test_map, 10, 3));
|
|
EXPECT_TRUE(can_assign_into_buckets(test_map, 20, 3));
|
|
}
|
|
|
|
// Test case 4: Test with not enough buckets to store all distinct elements.
|
|
TEST(CanAssignIntoBucketsTest, not_enough_buckets) {
|
|
std::map<int, size_t> test_map = create_test_map<int>({1, 2, 3, 4}, {5, 10, 15, 20});
|
|
EXPECT_TRUE(can_assign_into_buckets(test_map, 30, 2));
|
|
EXPECT_FALSE(can_assign_into_buckets(test_map, 30, 1));
|
|
}
|
|
|
|
// Test case 5: Test with all values in a single bucket.
|
|
TEST(CanAssignIntoBucketsTest, all_values_in_one_bucket) {
|
|
std::map<int, size_t> test_map = create_test_map<int>({1, 2, 3, 4}, {5, 10, 15, 20});
|
|
EXPECT_TRUE(can_assign_into_buckets(test_map, 100, 1));
|
|
EXPECT_FALSE(can_assign_into_buckets(test_map, 30, 1));
|
|
}
|
|
|
|
// Test case 6: Test with a single element that has a count greater than the total bucket capacity.
|
|
TEST(CanAssignIntoBucketsTest, single_element_larger_than_bucket_capacity) {
|
|
std::map<int, size_t> test_map = {{42, 100}};
|
|
EXPECT_FALSE(can_assign_into_buckets(test_map, 99, 1));
|
|
EXPECT_TRUE(can_assign_into_buckets(test_map, 100, 1));
|
|
EXPECT_TRUE(can_assign_into_buckets(test_map, 200, 1));
|
|
}
|
|
|
|
} // namespace doris::vectorized
|