Files
doris/be/src/olap/rowset/run_length_integer_reader.cpp
Mingyu Chen c0e59e59aa [fix][refactor] fix bugs and refactor some code by lint (#7871)
1. Fix some `passedByValue` issues.
2. Fix some `dereferenceBeforeCheck` issues.
3. Fix some `uninitMemberVar` issues.
4. Fix some iterator `eraseDereference` issues.
5. Fix compile issue introduced from #7923 #7905 #7848
2022-02-01 14:31:14 +08:00

432 lines
12 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "olap/rowset/run_length_integer_reader.h"
#include "olap/in_stream.h"
#include "olap/rowset/column_reader.h"
#include "olap/serialize.h"
namespace doris {
RunLengthIntegerReader::RunLengthIntegerReader(ReadOnlyFileStream* input, bool is_singed)
: _input(input), _signed(is_singed), _num_literals(0), _used(0) {}
// Reads the header byte of the next run from the input stream and hands the
// run to the decoder selected by the encoding id stored in the two
// most-significant bits of that byte.
//
// Returns the stream error if the header byte cannot be read, otherwise the
// status reported by the selected decoder.
OLAPStatus RunLengthIntegerReader::_read_values() {
    uint8_t header = 0;
    OLAPStatus res = _input->read(reinterpret_cast<char*>(&header));
    if (res != OLAP_SUCCESS) {
        OLAP_LOG_WARNING("fail to read first byte.[res=%d]", res);
        return res;
    }

    // top two bits of the header byte carry the encoding id
    const int encoding = (header >> 6) & 0x03;
    if (encoding == RunLengthIntegerWriter::SHORT_REPEAT) {
        return _read_short_repeat_values(header);
    }
    if (encoding == RunLengthIntegerWriter::DIRECT) {
        return _read_direct_values(header);
    }
    if (encoding == RunLengthIntegerWriter::PATCHED_BASE) {
        return _read_patched_base_values(header);
    }
    // every remaining id is decoded as a delta run
    return _read_delta_values(header);
}
// Decodes one delta-encoded run and appends its values to _literals,
// advancing _num_literals accordingly.
//
// Layout as consumed here:
//   - bits 1..5 of `first_byte`: encoded width of the packed deltas; 0 means
//     all deltas are identical (fixed-delta run)
//   - bit 0 of `first_byte` plus the next byte: 9-bit run length
//   - first value as a varint (signed or unsigned depending on _signed)
//   - a signed varint: the fixed delta, or the delta base (first delta)
//   - for non-fixed runs: (run length - 1) bit-packed delta magnitudes
//
// Returns OLAP_SUCCESS, or the first error reported by a stream helper.
OLAPStatus RunLengthIntegerReader::_read_delta_values(uint8_t first_byte) {
    // width of one packed delta; a raw 0 is kept and marks a fixed-delta run
    uint32_t delta_bits = (first_byte >> 1) & 0x1f;
    if (delta_bits != 0) {
        delta_bits = ser::decode_bit_width(delta_bits);
    }

    // low eight bits of the run length live in the byte after the header
    uint8_t low_len = 0;
    OLAPStatus res = _input->read((char*)&low_len);
    if (res != OLAP_SUCCESS) {
        OLAP_LOG_WARNING("fail to read byte from instream.[res=%d]", res);
        return res;
    }
    int32_t run_len = ((first_byte & 0x01) << 8) | low_len;

    // the first value of the run is stored as a vint
    int64_t head = 0;
    if (_signed) {
        res = ser::read_var_signed(_input, &head);
        if (res != OLAP_SUCCESS) {
            OLAP_LOG_WARNING("fail to read var signed.[res=%d]", res);
            return res;
        }
    } else {
        res = ser::read_var_unsigned(_input, &head);
        if (res != OLAP_SUCCESS) {
            OLAP_LOG_WARNING("fail to read var unsigned.[res=%d]", res);
            return res;
        }
    }
    _literals[_num_literals++] = head;

    if (delta_bits == 0) {
        // fixed-delta run: one signed step (it may be negative even when all
        // values are positive) is added repeatedly to the previous value
        int64_t step = 0;
        res = ser::read_var_signed(_input, &step);
        if (res != OLAP_SUCCESS) {
            OLAP_LOG_WARNING("fail to read var signed.[res=%d]", res);
            return res;
        }
        int64_t running = head;
        for (int32_t remaining = run_len; remaining > 0; --remaining) {
            running += step;
            _literals[_num_literals++] = running;
        }
        return res;
    }

    // variable-delta run: the delta base is the first delta and its sign
    // gives the direction of the whole sequence
    int64_t delta_base = 0;
    res = ser::read_var_signed(_input, &delta_base);
    if (res != OLAP_SUCCESS) {
        OLAP_LOG_WARNING("fail to read var signed.[res=%d]", res);
        return res;
    }
    int64_t running = head + delta_base;
    _literals[_num_literals++] = running;

    // the remaining deltas are bit-packed; unpack them in place into the free
    // tail of _literals and then turn each one into an absolute value
    // NOTE(review): a run length of 0 makes this count -1; how ser::read_ints
    // treats a negative count is not visible from here — confirm.
    int32_t packed_count = run_len - 1;
    res = ser::read_ints(_input, &_literals[_num_literals], packed_count, delta_bits);
    if (res != OLAP_SUCCESS) {
        OLAP_LOG_WARNING("fail to read ints.[res = %d]", res);
        return res;
    }
    const bool descending = delta_base < 0;
    for (int32_t k = 0; k < packed_count; ++k) {
        const int64_t delta = _literals[_num_literals];
        running = descending ? running - delta : running + delta;
        _literals[_num_literals++] = running;
    }
    return res;
}
// Decodes one patched-base run and appends its values to _literals,
// advancing _num_literals accordingly.
//
// Layout as consumed here:
//   - byte 1 (`first_byte`): bits 1..5 encoded value width, bit 0 is the high
//     bit of the run length
//   - byte 2: low eight bits of the run length (stored as length - 1)
//   - byte 3: bits 5..7 base width in bytes minus one, bits 0..4 encoded
//     patch width
//   - byte 4: bits 5..7 patch gap width minus one, bits 0..4 patch list length
//   - base value, big-endian, sign carried in its most-significant bit
//   - bit-packed values, then the bit-packed patch list (gap in the high
//     bits, patch value in the low `pw` bits of each entry)
//
// Each output value is base + unpacked value, where values at patched
// positions first get the patch bits OR-ed in above the low `fb` bits.
//
// Returns OLAP_SUCCESS, or the first error reported by a stream helper.
OLAPStatus RunLengthIntegerReader::_read_patched_base_values(uint8_t first_byte) {
    // Upper bounds that follow from the header field widths decoded below:
    // the run length is a 9-bit field plus one, the patch list length is a
    // 5-bit field. Fixed-size buffers replace the previous variable-length
    // arrays (a compiler extension, and zero-sized when the list is empty).
    constexpr int32_t kMaxRunLength = 512;
    constexpr int32_t kMaxPatchEntries = 31;

    OLAPStatus res = OLAP_SUCCESS;
    // extract the number of fixed bits
    int32_t fbo = (first_byte >> 1) & 0x1f;
    int32_t fb = ser::decode_bit_width(fbo);
    // extract the run length of data blob
    int32_t len = (first_byte & 0x01) << 8;
    uint8_t byte;
    res = _input->read((char*)&byte);
    if (OLAP_SUCCESS != res) {
        OLAP_LOG_WARNING("fail to read byte from in_straem.[res=%d]", res);
        return res;
    }
    len |= byte;
    // runs are always one off
    len += 1;
    // extract the number of bytes occupied by base
    char third_byte = '\0';
    res = _input->read(&third_byte);
    if (OLAP_SUCCESS != res) {
        OLAP_LOG_WARNING("fail to read byte from in_stream.[res=%d]", res);
        return res;
    }
    int32_t bw = ((uint8_t)third_byte >> 5) & 0x07;
    // base width is one off
    bw += 1;
    // extract patch width
    uint32_t pwo = third_byte & 0x1f;
    uint32_t pw = ser::decode_bit_width(pwo);
    // read fourth byte and extract patch gap width
    char four_byte = '\0';
    res = _input->read(&four_byte);
    if (OLAP_SUCCESS != res) {
        OLAP_LOG_WARNING("fail to read byte from in_straem.[res=%d]", res);
        return res;
    }
    int32_t pgw = ((uint8_t)four_byte >> 5) & 0x07;
    // patch gap width is one off
    pgw += 1;
    // extract the length of the patch list
    int32_t pl = four_byte & 0x1f;
    // read the next base width number of bytes to extract base value
    int64_t base = 0;
    res = ser::bytes_to_long_be(_input, bw, &base);
    if (OLAP_SUCCESS != res) {
        OLAP_LOG_WARNING("fail to bytes to long be.[res=%d]", res);
        return res;
    }
    int64_t mask = (1L << ((bw * 8) - 1));
    // if MSB of base value is 1 then base is negative value else positive
    // TODO(lijiao): Why is zig_zag not used here?
    if ((base & mask) != 0) {
        base = base & ~mask;
        base = -base;
    }
    // unpack the data blob
    int64_t unpacked[kMaxRunLength];
    res = ser::read_ints(_input, unpacked, len, fb);
    if (OLAP_SUCCESS != res) {
        return res;
    }
    // unpack the patch blob; zero-filled so that no entry is ever read
    // uninitialized
    int64_t unpacked_patch[kMaxPatchEntries + 1] = {0};
    uint32_t bit_width = ser::get_closet_fixed_bits(pw + pgw);
    res = ser::read_ints(_input, unpacked_patch, pl, bit_width);
    if (OLAP_SUCCESS != res) {
        return res;
    }

    // low `pw` bits of a patch entry hold the patch value
    const int64_t patch_mask = (1L << pw) - 1;
    int32_t patch_idx = 0;
    int64_t curr_patch = 0;
    // index into unpacked[] of the next value to patch; -1 when the patch
    // list is exhausted (or empty), so that no index can match
    int64_t next_patch_pos = -1;

    // Loads the patch entry at patch_idx and computes the position it applies
    // to, measured from `origin`. An entry with gap 255 and patch 0 only
    // extends the gap, so such entries are accumulated and skipped. Unlike
    // the previous inline loops, the walk never steps past the last entry of
    // the patch list.
    auto load_patch = [&](int64_t origin) {
        if (patch_idx >= pl) {
            next_patch_pos = -1;
            return;
        }
        int64_t gap_total = 0;
        int64_t gap = static_cast<uint64_t>(unpacked_patch[patch_idx]) >> pw;
        curr_patch = unpacked_patch[patch_idx] & patch_mask;
        while (gap == 255 && curr_patch == 0 && patch_idx + 1 < pl) {
            gap_total += 255;
            ++patch_idx;
            gap = static_cast<uint64_t>(unpacked_patch[patch_idx]) >> pw;
            curr_patch = unpacked_patch[patch_idx] & patch_mask;
        }
        // add the left over gap; gaps are relative to the previous patch
        next_patch_pos = origin + gap_total + gap;
    };
    load_patch(0);

    // unpack data blob, patch it (if required), add base to get final result
    for (int32_t i = 0; i < len; i++) {
        if (i == next_patch_pos) {
            // merge the patch bits above the low fb bits, then add the base
            int64_t patched_val = unpacked[i] | (curr_patch << fb);
            _literals[_num_literals++] = base + patched_val;
            // move on to the next entry in the patch list
            ++patch_idx;
            load_patch(i);
        } else {
            // no patching required. add base to unpacked value to get final value
            _literals[_num_literals++] = base + unpacked[i];
        }
    }
    return res;
}
// Decodes one direct-encoded run and appends its values to _literals,
// advancing _num_literals accordingly.
//
// Layout as consumed here: bits 1..5 of `first_byte` hold the encoded value
// width, bit 0 of `first_byte` plus the next byte form the run length
// (stored as length - 1), followed by the bit-packed values. When _signed is
// set every value is zig-zag decoded.
//
// Returns OLAP_SUCCESS, or the first error reported by a stream helper.
OLAPStatus RunLengthIntegerReader::_read_direct_values(uint8_t first_byte) {
    OLAPStatus res = OLAP_SUCCESS;
    // extract the number of fixed bits
    uint32_t fbo = (first_byte >> 1) & 0x1f;
    uint32_t fb = ser::decode_bit_width(fbo);
    // extract the run length
    int32_t len = (first_byte & 0x01) << 8;
    uint8_t byte;
    res = _input->read((char*)&byte);
    if (OLAP_SUCCESS != res) {
        return res;
    }
    len |= byte;
    // runs are one off
    len += 1;
    // Unpack into the free tail of the literal buffer. The values must land at
    // _num_literals because both the zig-zag pass and the counter update
    // below work from that offset; unpacking at index 0 was only correct when
    // the buffer happened to be empty.
    res = ser::read_ints(_input, &_literals[_num_literals], len, fb);
    if (OLAP_SUCCESS != res) {
        return res;
    }
    if (_signed) {
        // signed values are stored zig-zag encoded; decode them in place
        for (int32_t i = 0; i < len; ++i) {
            _literals[_num_literals] = ser::zig_zag_decode(_literals[_num_literals]);
            ++_num_literals;
        }
    } else {
        _num_literals += len;
    }
    return res;
}
// Decodes one short-repeat run: a single value stored once and repeated.
// The repeated value is appended to _literals once per repetition.
//
// Layout as consumed here: bits 3..5 of `first_byte` hold the byte width of
// the value minus one, bits 0..2 hold the repeat count minus
// RunLengthIntegerWriter::MIN_REPEAT; the value follows in big-endian bytes.
// When _signed is set the value is zig-zag decoded.
//
// Returns OLAP_SUCCESS, or the error from reading the value bytes.
OLAPStatus RunLengthIntegerReader::_read_short_repeat_values(uint8_t first_byte) {
    // number of bytes the value occupies (stored one off)
    int32_t value_bytes = ((first_byte >> 3) & 0x07) + 1;
    // run lengths are stored only after MIN_REPEAT has been met
    int32_t repeat = first_byte & 0x07;
    repeat += RunLengthIntegerWriter::MIN_REPEAT;

    // the repeated value itself, stored with a fixed number of bytes
    int64_t value = 0;
    OLAPStatus res = ser::bytes_to_long_be(_input, value_bytes, &value);
    if (res != OLAP_SUCCESS) {
        return res;
    }
    if (_signed) {
        value = ser::zig_zag_decode(value);
    }

    // emit the value once per repetition
    while (repeat > 0) {
        _literals[_num_literals++] = value;
        --repeat;
    }
    return res;
}
// Repositions the reader: seeks the underlying stream with `position`, then
// takes the next entry of `position` as the number of values already
// consumed past that stream position and decodes runs until that many
// values are accounted for.
//
// Returns OLAP_SUCCESS, or the error from the stream seek or from decoding.
OLAPStatus RunLengthIntegerReader::seek(PositionProvider* position) {
    OLAPStatus res = _input->seek(position);
    if (res != OLAP_SUCCESS) {
        return res;
    }

    int32_t remaining = static_cast<int32_t>(position->get_next());
    if (remaining == 0) {
        // positioned exactly at the start of a run: nothing is buffered yet
        _num_literals = 0;
        _used = 0;
        return res;
    }

    // more than one pass is needed when the consumed count spans several runs
    while (remaining > 0) {
        _num_literals = 0;
        res = _read_values();
        if (res != OLAP_SUCCESS) {
            return res;
        }
        _used = remaining;
        remaining -= _num_literals;
    }
    return res;
}
// Advances the reader by `num_values` values without returning them.
// Buffered values are consumed first; whenever the buffer is fully used the
// next run is decoded from the stream.
//
// Returns OLAP_SUCCESS, or the error from decoding a run.
OLAPStatus RunLengthIntegerReader::skip(uint64_t num_values) {
    OLAPStatus res = OLAP_SUCCESS;
    while (num_values != 0) {
        if (_num_literals == _used) {
            // buffer exhausted: decode the next run into an empty buffer
            _used = 0;
            _num_literals = 0;
            res = _read_values();
            if (res != OLAP_SUCCESS) {
                OLAP_LOG_WARNING("fail to read values.[res=%d]", res);
                return res;
            }
        }
        // take as many buffered values as are available and still wanted
        const uint64_t buffered = static_cast<uint64_t>(_num_literals - _used);
        int64_t step = std::min(num_values, buffered);
        _used += step;
        num_values -= step;
    }
    return res;
}
} // namespace doris