Files
doris/be/src/util/topn_counter.cpp

149 lines
5.1 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "topn_counter.h"
#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>
#include <algorithm>
#include "gen_cpp/olap_common.pb.h"
#include "slice.h"
namespace doris {
// Records `incrementCount` occurrences of `item` in `_counter_map`.
// An item that is already tracked has its count increased; an unseen item gets
// a new Counter seeded with `incrementCount`. In both cases `_ordered` is
// cleared afterwards.
void TopNCounter::add_item(const std::string& item, uint64_t incrementCount) {
    const auto existing = _counter_map->find(item);
    if (existing == _counter_map->end()) {
        // First sighting of this item: create its counter.
        _counter_map->emplace(item, Counter(item, incrementCount));
    } else {
        existing->second.add_count(incrementCount);
    }
    _ordered = false;
}
void TopNCounter::serialize(std::string* buffer) {
sort_retain(_capacity);
PTopNCounter topn_counter;
topn_counter.set_top_num(_top_num);
topn_counter.set_space_expand_rate(_space_expand_rate);
for (std::vector<Counter>::const_iterator it = _counter_vec->begin(); it != _counter_vec->end();
++it) {
PCounter* counter = topn_counter.add_counter();
counter->set_item(it->get_item());
counter->set_count(it->get_count());
}
topn_counter.SerializeToString(buffer);
}
// Restores this counter from a serialized PTopNCounter message held in `src`.
//
// Returns false (after logging a warning) when the bytes cannot be parsed; in
// that case the object is left untouched. On success `_space_expand_rate` and
// the top-N setting are taken from the message, the previous contents of
// `_counter_map` / `_counter_vec` are replaced by the decoded entries, and the
// counter is flagged as ordered.
bool TopNCounter::deserialize(const doris::Slice& src) {
    PTopNCounter topn_counter;
    if (!topn_counter.ParseFromArray(src.data, src.size)) {
        LOG(WARNING) << "topn counter deserialize failed";
        return false;
    }
    _space_expand_rate = topn_counter.space_expand_rate();
    set_top_num(topn_counter.top_num());

    // Start from a clean slate: `_ordered = true` below claims that the vector
    // mirrors the map, which would not hold if entries from an earlier use of
    // this object were still present.
    _counter_map->clear();
    _counter_vec->clear();
    _counter_vec->reserve(static_cast<size_t>(topn_counter.counter_size()));

    for (int i = 0; i < topn_counter.counter_size(); ++i) {
        const PCounter& counter = topn_counter.counter(i);
        const bool inserted =
                _counter_map
                        ->insert(std::make_pair(counter.item(),
                                                Counter(counter.item(), counter.count())))
                        .second;
        // Only mirror entries the map accepted, so a payload with a repeated
        // item cannot leave the vector with entries the map does not hold.
        if (inserted) {
            _counter_vec->emplace_back(counter.item(), counter.count());
        }
    }
    _ordered = true;
    return true;
}
void TopNCounter::sort_retain(uint32_t capacity) {
_counter_vec->clear();
sort_retain(capacity, _counter_vec);
_ordered = true;
}
void TopNCounter::sort_retain(uint32_t capacity, std::vector<Counter>* sort_vec) {
for (std::unordered_map<std::string, Counter>::const_iterator it = _counter_map->begin();
it != _counter_map->end(); ++it) {
sort_vec->emplace_back(it->second.get_item(), it->second.get_count());
}
std::sort(sort_vec->begin(), sort_vec->end(), TopNComparator());
if (sort_vec->size() > capacity) {
for (uint32_t i = 0, n = sort_vec->size() - capacity; i < n; ++i) {
auto& counter = sort_vec->back();
_counter_map->erase(counter.get_item());
sort_vec->pop_back();
}
}
}
// Based on the parallel version of the Space Saving algorithm as described in:
// "A parallel space saving algorithm for frequent items and the Hurwitz zeta
// distribution" by Massimo Cafaro, et al.
//
// Folds `other` into this counter. An empty `other` is ignored. Otherwise this
// counter adopts `other`'s space-expand rate and top-N setting, then:
//   * m1 / m2 are the counts at the tail of this / other's sorted vector when
//     the respective side holds at least `_capacity` entries, else 0;
//   * items present only here gain m2, items present only in `other` are
//     inserted with their count plus m1, items present in both are summed;
//   * the result is re-sorted and trimmed to `_capacity`.
void TopNCounter::merge(doris::TopNCounter&& other) {
    if (other._counter_map->empty()) {
        return;
    }
    _space_expand_rate = other._space_expand_rate;
    set_top_num(other._top_num);

    // m1 / m2 are read from the sorted vectors, which only mirror the maps
    // while `_ordered` is set (add_item() clears it). Refresh any stale side
    // first so the tail lookup below never reads an out-of-date or empty vector.
    if (!_ordered) {
        sort_retain(_capacity);
    }
    if (!other._ordered) {
        other.sort_retain(other._capacity);
    }

    const bool this_full = _counter_map->size() >= _capacity;
    const bool another_full = other._counter_map->size() >= other._capacity;
    // The empty() checks keep back() well-defined even when a capacity of zero
    // makes an empty summary compare as "full".
    const uint64_t m1 =
            (this_full && !_counter_vec->empty()) ? _counter_vec->back().get_count() : 0;
    const uint64_t m2 = (another_full && !other._counter_vec->empty())
                                ? other._counter_vec->back().get_count()
                                : 0;

    if (another_full) {
        // Every item tracked here first receives other's minimum count; items
        // that also exist in `other` get the remainder added in the loop below.
        for (auto& entry : *(this->_counter_map)) {
            entry.second.add_count(m2);
        }
    }
    for (auto& other_entry : *(other._counter_map)) {
        auto itr = this->_counter_map->find(other_entry.first);
        if (itr != _counter_map->end()) {
            // m2 was already added above, so add only the difference.
            itr->second.add_count(other_entry.second.get_count() - m2);
        } else {
            this->_counter_map->insert(std::make_pair(
                    other_entry.first,
                    Counter(other_entry.first, other_entry.second.get_count() + m1)));
        }
    }
    _ordered = false;
    sort_retain(_capacity);
}
void TopNCounter::finalize(std::string& finalize_str) {
if (!_ordered) {
sort_retain(_top_num);
}
// use json format print
rapidjson::StringBuffer buffer;
rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
uint32_t k = 0;
writer.StartObject();
for (std::vector<Counter>::const_iterator it = _counter_vec->begin();
it != _counter_vec->end() && k < _top_num; ++it, ++k) {
writer.Key(it->get_item().data());
writer.Uint64(it->get_count());
}
writer.EndObject();
finalize_str = buffer.GetString();
}
} // namespace doris