// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #pragma once #include #include #include #include "olap/olap_common.h" #include "olap/uint24.h" #include "util/bit_stream_utils.h" #include "util/bit_stream_utils.inline.h" #include "util/faststring.h" namespace doris { static inline uint8_t bits_less_than_64(const uint64_t v) { return v == 0 ? 0 : 64 - __builtin_clzll(v); } // See https://stackoverflow.com/questions/28423405/counting-the-number-of-leading-zeros-in-a-128-bit-integer static inline uint8_t bits_may_more_than_64(const uint128_t v) { uint64_t hi = v >> 64; uint64_t lo = v; int z[3] = {__builtin_clzll(hi), __builtin_clzll(lo) + 64, 128}; int idx = !hi + ((!lo) & (!hi)); return 128 - z[idx]; } template static inline uint8_t bits(const T v) { if (sizeof(T) <= 8) { return bits_less_than_64(v); } else { return bits_may_more_than_64(v); } } // The implementation for frame-of-reference coding // The detail of frame-of-reference coding, please refer to // https://lemire.me/blog/2012/02/08/effective-compression-using-frame-of-reference-and-delta-coding/ // and https://www.elastic.co/cn/blog/frame-of-reference-and-roaring-bitmaps // // The encoded data format is as follows: // // 1. Body: // BitPackingFrame * FrameCount // 2. Footer: // // (8 bit StorageFormat + 8 bit BitWidth) * FrameCount // 8 bit FrameValueNum // 32 bit ValuesNum // // There are currently three storage formats // (1) if the StorageFormat == 0: When input data order is not ascending and the BitPackingFrame format is: // MinValue, (Value[i] - MinVale) * FrameValueNum // // (2) if the StorageFormat == 1: When input data order is ascending and the BitPackingFrame format is: // MinValue, (Value[i] - Value[i - 1]) * FrameValueNum // // (3) if the StorageFormat == 2: When overflow occurs when using (1) or (2) and save original values: // MinValue, (Value[i]) * FrameValueNum // // len(MinValue) can be 32(uint32_t), 64(uint64_t), 128(uint128_t) // // The OrderFlag is 1 represents ascending order, 0 represents not ascending order // The last frame value num maybe less than 128 template class ForEncoder { public: explicit ForEncoder(faststring* buffer) : _buffer(buffer) {} void put(const T value) { return put_batch(&value, 1); } void put_batch(const T* value, size_t count); // Flushes any pending values to the underlying buffer. // Returns the total number of bytes written uint32_t flush(); // underlying buffer size + footer meta size. // Note: should call this method before flush. uint32_t len() { return _buffer->size() + _storage_formats.size() + _bit_widths.size() + 5; } // Resets all the state in the encoder. void clear() { _values_num = 0; _buffered_values_num = 0; _buffer->clear(); } private: void bit_pack(const T* input, uint8_t in_num, int bit_width, uint8_t* output); void bit_packing_one_frame_value(const T* input); const T* copy_value(const T* val, size_t count); const T numeric_limits_max(); uint32_t _values_num = 0; uint8_t _buffered_values_num = 0; static const uint8_t FRAME_VALUE_NUM = 128; T _buffered_values[FRAME_VALUE_NUM]; faststring* _buffer; std::vector _storage_formats; std::vector _bit_widths; }; template class ForDecoder { public: explicit ForDecoder(const uint8_t* in_buffer, size_t buffer_len) : _buffer(in_buffer), _buffer_len(buffer_len), _parsed(false) {} // read footer metadata bool init(); bool get(T* val) { return get_batch(val, 1); } // Gets the next batch value. Returns false if there are no more. bool get_batch(T* val, size_t count); // The skip_num is positive means move forwards // The skip_num is negative means move backwards bool skip(int32_t skip_num); bool seek_at_or_after_value(const void* value, bool* exact_match); uint32_t current_index() const { return _current_index; } uint32_t count() const { return _values_num; } private: void bit_unpack(const uint8_t* input, uint8_t in_num, int bit_width, T* output); uint32_t frame_size(uint32_t frame_index) { return (frame_index == _frame_count - 1) ? _last_frame_size : _max_frame_size; } void decode_current_frame(T* output); T decode_frame_min_value(uint32_t frame_index); // Return index of the last frame which contains value < target. // Return `_frame_count - 1` when all frames are < target. // Return `_frame_count` when not found (all frames are >= target). uint32_t seek_last_frame_before_value(T target); // Seek to the first value in frame that >= target. // Return true when found and update exact_match. // Return false otherwise. bool seek_lower_bound_inside_frame(uint32_t frame_index, T target, bool* exact_match); T* copy_value(T* val, size_t count); const uint8_t* _buffer = nullptr; size_t _buffer_len = 0; bool _parsed = false; uint8_t _max_frame_size = 0; uint8_t _last_frame_size = 0; uint32_t _values_num = 0; uint32_t _frame_count = 0; std::vector _frame_offsets; std::vector _bit_widths; std::vector _storage_formats; uint32_t _current_index = 0; uint32_t _current_decoded_frame = -1; std::vector _out_buffer; // store values of decoded frame }; } // namespace doris