/** * Copyright (c) 2024 OceanBase * OceanBase CE is licensed under Mulan PubL v2. * You can use this software according to the terms and conditions of the Mulan PubL v2. * You may obtain a copy of Mulan PubL v2 at: * http://license.coscl.org.cn/MulanPubL-2.0 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. * See the Mulan PubL v2 for more details. */ #include #include #include #include #include #define private public // #define unittest #define unittest_bloom_filter #include "sql/engine/px/ob_px_bloom_filter.h" #include "src/sql/engine/expr/ob_expr_in.h" using namespace std; namespace oceanbase { namespace sql { static constexpr uint64_t build_count = 4096; class SimpleTimer { public: SimpleTimer() { cpu_begin_time_ = rdtsc(); } ~SimpleTimer() { uint64_t elapse_time = rdtsc() - cpu_begin_time_; cout << "elapse time is: " << elapse_time << endl; } private: uint64_t cpu_begin_time_; }; class SmallHashSetTest : public ::testing::Test { public: SmallHashSetTest() = default; virtual ~SmallHashSetTest() = default; virtual void SetUp(){}; virtual void TearDown(){}; void insert_hash(int64_t insert_count); void test_hash(int64_t test_count); void insert_key(int64_t insert_count); void test_key(int64_t test_count); void performance_test(); public: ObArenaAllocator alloc_; ObSmallHashSet accurate_small_set_; ObColumnHashSet open_set_; ObSmallHashSet inaccurate_small_set_; std::unordered_set std_set_; hash::ObHashSet ob_set_; ObPxBloomFilter bloom_filter_; private: DISALLOW_COPY_AND_ASSIGN(SmallHashSetTest); }; void SmallHashSetTest::insert_hash(int64_t insert_count) { constexpr uint64_t rand_upper_bound = 100000; for (int64_t i = 0; i < insert_count; ++i) { int64_t num = common::ObRandom::rand(1, rand_upper_bound); uint64_t hash = murmurhash(&num, 8, 0); accurate_small_set_.insert_hash(hash); inaccurate_small_set_.insert_hash(hash); std_set_.insert(hash); } } void SmallHashSetTest::test_hash(int64_t test_count) { constexpr uint64_t rand_upper_bound = 100000; for (int64_t i = 0; i < test_count; ++i) { int64_t num = common::ObRandom::rand(1, rand_upper_bound); uint64_t hash = murmurhash(&num, 8, 0); bool in_accurate_set = accurate_small_set_.test_hash(hash); bool in_inaccurate_set = inaccurate_small_set_.test_hash(hash); bool in_std_set = std_set_.count(hash) != 0; EXPECT_EQ(in_accurate_set, in_std_set); EXPECT_GE(in_inaccurate_set, in_accurate_set); } #ifdef unittest int ret = OB_SUCCESS; uint64_t accurate_total = accurate_small_set_.seek_total_times_; uint64_t inaccurate_total = inaccurate_small_set_.seek_total_times_; double accurate_avg = accurate_small_set_.seek_total_times_ / double(test_count); double inaccurate_avg = inaccurate_small_set_.seek_total_times_ / double(test_count); COMMON_LOG(WARN, "avg seek", K(accurate_total), K(inaccurate_total), K(accurate_avg), K(inaccurate_avg)); #endif } void SmallHashSetTest::insert_key(int64_t insert_count) { constexpr uint64_t rand_upper_bound = 100000; for (int64_t i = 0; i < insert_count; ++i) { int64_t num = common::ObRandom::rand(1, rand_upper_bound); uint64_t hash = murmurhash(&num, 8, 0); open_set_.insert(hash, num); std_set_.insert(num); } } void SmallHashSetTest::test_key(int64_t test_count) { constexpr uint64_t rand_upper_bound = 100000; for (int64_t i = 0; i < test_count; ++i) { int64_t num = common::ObRandom::rand(1, rand_upper_bound); uint64_t hash = murmurhash(&num, 8, 0); bool in_accurate_set = open_set_.exists(hash, num); bool in_std_set = std_set_.count(num) != 0; EXPECT_EQ(in_accurate_set, in_std_set); } } void SmallHashSetTest::performance_test() { constexpr uint64_t probe_count = 1000000; std::vector build_hash_values(probe_count, 0); std::vector build_key_values(probe_count, 0); std::vector probe_hash_values(probe_count, 0); std::vector probe_key_values(probe_count, 0); accurate_small_set_.clear(); inaccurate_small_set_.clear(); open_set_.clear(); std_set_.clear(); // ------------------- build test ----------------------- for (int64_t i = 0; i < build_count; ++i) { int64_t num = common::ObRandom::rand(0, UINT64_MAX); build_key_values[i] = num; build_hash_values[i] = murmurhash(&num, 8, 0); } { cout << "Build:: for accurate_small_set, "; SimpleTimer timer; for (int64_t i = 0; i < build_count; ++i) { accurate_small_set_.insert_hash(build_hash_values[i]); } } { cout << "Build:: for inaccurate_small_set, "; SimpleTimer timer; for (int64_t i = 0; i < build_count; ++i) { inaccurate_small_set_.insert_hash(build_hash_values[i]); } } { cout << "Build:: for std::unordered_set, "; SimpleTimer timer; for (int64_t i = 0; i < build_count; ++i) { std_set_.insert(build_hash_values[i]); } } { cout << "Build:: for ob_hash_set, "; SimpleTimer timer; for (int64_t i = 0; i < build_count; ++i) { ob_set_.set_refactored(build_hash_values[i]); } } #ifdef unittest_bloom_filter { cout << "Build:: for bloom filter, "; SimpleTimer timer; for (int64_t i = 0; i < build_count; ++i) { bloom_filter_.put(build_hash_values[i]); } } #endif // ------------------- probe test ----------------------- for (int64_t i = 0; i < probe_count; ++i) { int64_t num = common::ObRandom::rand(0, UINT64_MAX); probe_key_values[i] = num; probe_hash_values[i] = murmurhash(&num, 8, 0); } { cout << "Probe:: for accurate_small_set, "; SimpleTimer timer; for (int64_t i = 0; i < probe_count; ++i) { accurate_small_set_.test_hash(probe_hash_values[i]); } } { cout << "Probe:: for inaccurate_small_set, "; SimpleTimer timer; for (int64_t i = 0; i < probe_count; ++i) { inaccurate_small_set_.test_hash(probe_hash_values[i]); } } { cout << "Probe:: for std::unordered_set, "; SimpleTimer timer; for (int64_t i = 0; i < probe_count; ++i) { std_set_.count(probe_hash_values[i]); } } { cout << "Probe:: for ob_hash_set, "; SimpleTimer timer; for (int64_t i = 0; i < probe_count; ++i) { ob_set_.exist_refactored(probe_hash_values[i]); } } #ifdef unittest_bloom_filter { cout << "Probe:: for bloom_filter batch, "; SimpleTimer timer; int64_t batch_size = 256; uint64_t round = probe_count / batch_size; for (int64_t i = 0; i < round; ++i) { uint64_t offset = i * batch_size; bloom_filter_.might_contain_batch(probe_hash_values.data() + offset, batch_size); } int64_t last_batch_size = probe_count % batch_size; if (last_batch_size > 0) { uint64_t offset = round * batch_size; bloom_filter_.might_contain_batch(probe_hash_values.data() + offset, last_batch_size); } } #endif // ------------------- false positive rate ----------------------- { int64_t total_count = probe_count; int64_t error_count = 0; for (int64_t i = 0; i < probe_count; ++i) { bool result_inacc = inaccurate_small_set_.test_hash(probe_hash_values[i]); bool result_std = std_set_.count(probe_hash_values[i]) != 0; if (result_inacc != result_std) { error_count++; } } double error_rate = double(error_count) / double(total_count); cout << "Probe:: the false positive rate is: " << error_rate << ", error_count: " << error_count << ", total_count: " << total_count << endl; } } TEST_F(SmallHashSetTest, test_small_hash_set) { int64_t tenant_id = 1; accurate_small_set_.init(build_count, tenant_id); inaccurate_small_set_.init(build_count, tenant_id); open_set_.init(build_count, tenant_id); cout << "the small hash set init size is: " << accurate_small_set_.capacity_ << endl; ob_set_.create(build_count * 4); bloom_filter_.init(build_count, alloc_, tenant_id, 0.03, 2147483648); insert_hash(4000); test_hash(100000); insert_key(4000); test_key(100000); performance_test(); } struct MurmurHashTest { typedef uint64_t result_type; typedef int argument_type; result_type operator()(argument_type num) const noexcept { return murmurhash(&num, sizeof(num), 0); } }; class OpenHashSetforKeyTest : public ::testing::Test { public: OpenHashSetforKeyTest() = default; virtual ~OpenHashSetforKeyTest() = default; virtual void SetUp() {}; virtual void TearDown() {}; void insert_hash(int64_t insert_count); void test_hash(int64_t test_count); void insert_key(int64_t insert_count); void test_key(int64_t test_count); void exist_performance(int build_cnt, int probe_cnt); public: ObColumnHashSet open_set_; ObSmallHashSet inaccurate_small_set_; std::unordered_set std_set_; hash::ObHashSet ob_set_; std::vector build_hash_values; std::vector probe_hash_values; std::vector probe_key_values; protected: template void run_probe_test(int cnt, const char* name, const SetType& set, const std::function& test_fn); private: DISALLOW_COPY_AND_ASSIGN(OpenHashSetforKeyTest); }; template void OpenHashSetforKeyTest::run_probe_test(int probe_count, const char* name, const SetType& set, const std::function& test_fn) { std::cout << "Probe time for " << name << ": "; SimpleTimer timer; for (int64_t i = 0; i < probe_count; ++i) { test_fn(set, probe_hash_values[i], probe_key_values[i]); } } void OpenHashSetforKeyTest::exist_performance(int build_cnt, int probe_count) { build_hash_values.resize(probe_count, 0); probe_hash_values.resize(probe_count, 0); probe_key_values.resize(probe_count, 0); // Build phase for (int64_t i = 0; i < build_cnt; ++i) { int64_t num = common::ObRandom::rand(0, build_cnt); build_hash_values[i] = murmurhash(&num, 8, 0); open_set_.insert(build_hash_values[i], num); inaccurate_small_set_.insert_hash(build_hash_values[i]); ob_set_.set_refactored(num); } // Generate probes for (int64_t i = 0; i < probe_count; ++i) { int64_t num = common::ObRandom::rand(0, build_cnt); probe_key_values[i] = num; probe_hash_values[i] = murmurhash(&probe_key_values[i], 8, 0); } auto test_open_set_fn = [](const ObColumnHashSet& set, uint64_t h, uint64_t k){ set.exists(h, k); }; auto test_ob_set_fn = [](const hash::ObHashSet& set, uint64_t h, uint64_t k){ set.exist_refactored(k); }; auto test_inaccurate_small_set_fn = [](const ObSmallHashSet& set, uint64_t h, uint64_t k){ set.test_hash(h); }; std::vector> tests_to_run = { [&]() { return run_probe_test>(probe_count, "open_set", open_set_, test_open_set_fn); }, [&]() { return run_probe_test>(probe_count, "ob_set", ob_set_, test_ob_set_fn); }, [&]() { return run_probe_test>(probe_count, "inaccurate_small_set", inaccurate_small_set_, test_inaccurate_small_set_fn); } }; std::random_device rd; std::mt19937 g(rd()); std::shuffle(tests_to_run.begin(), tests_to_run.end(), g); for (auto& test : tests_to_run) { test(); } } TEST_F(OpenHashSetforKeyTest, test_small_hash_set_exist_performance) { int64_t tenant_id = 1; int max_build_cnt = 1000000; const int build_counts[] = {10000, 100000, 1000000}; const int probe_counts[] = {10000, 100000, 1000000, 10000000}; open_set_.init(max_build_cnt * 2, tenant_id); inaccurate_small_set_.init(max_build_cnt * 2, tenant_id); ob_set_.create(max_build_cnt * 4); for (const auto &build_cnt : build_counts) { for (const auto &probe_cnt : probe_counts) { open_set_.clear(); inaccurate_small_set_.clear(); ob_set_.clear(); std::cout << "===== build_cnt: " << build_cnt << ", probe_cnt: " << probe_cnt << std::endl; exist_performance(build_cnt, probe_cnt); std::cout << "================" << std::endl; } } } } // namespace sql } // namespace oceanbase int main(int argc, char **argv) { OB_LOGGER.set_log_level("DEBUG"); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); }