Files
doris/be/src/runtime/raw_value.h
Gabriel 3b46242483 [feature-wip] Optimize Decimal type (#10794)
* [feature-wip](decimalv3) support decimalv3

* [feature-wip] Optimize Decimal type

Co-authored-by: liaoxin <liaoxinbit@126.com>
2022-07-14 10:50:50 +08:00

532 lines
19 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/apache/impala/blob/branch-2.9.0/be/src/runtime/raw-value.h
// and modified by Doris
#pragma once
#include <string>
#include "common/consts.h"
#include "common/logging.h"
#include "runtime/string_value.h"
#include "runtime/types.h"
#include "util/hash_util.hpp"
#include "util/types.h"
namespace doris {
class MemPool;
class SlotDescriptor;
class Tuple;
// Useful utility functions for runtime values (which are passed around as void*).
class RawValue {
public:
// Ascii output precision for double/float
static const int ASCII_PRECISION;
// Convert 'value' into ascii and write to 'stream'. nullptr turns into NULL. 'scale'
// determines how many digits after the decimal are printed for floating point numbers,
// -1 indicates to use the stream's current formatting.
static void print_value(const void* value, const TypeDescriptor& type, int scale,
std::stringstream* stream);
// write ascii value to string instead of stringstream.
static void print_value(const void* value, const TypeDescriptor& type, int scale,
std::string* str);
// Writes the byte representation of a value to a stringstream character-by-character
static void print_value_as_bytes(const void* value, const TypeDescriptor& type,
std::stringstream* stream);
static uint32_t get_hash_value(const void* value, const PrimitiveType& type) {
return get_hash_value(value, type, 0);
}
static uint32_t get_hash_value(const void* value, const PrimitiveType& type, uint32_t seed);
// Returns hash value for 'value' interpreted as 'type'. The resulting hash value
// is combined with the seed value.
static uint32_t get_hash_value(const void* value, const TypeDescriptor& type, uint32_t seed) {
return get_hash_value(value, type.type, seed);
}
static uint32_t get_hash_value(const void* value, const TypeDescriptor& type) {
return get_hash_value(value, type.type, 0);
}
// Get the hash value using the fvn hash function. Using different seeds with FVN
// results in different hash functions. get_hash_value() does not have this property
// and cannot be safely used as the first step in data repartitioning.
// However, get_hash_value() can be significantly faster.
// TODO: fix get_hash_value
static uint32_t get_hash_value_fvn(const void* value, const PrimitiveType& type, uint32_t seed);
static uint32_t get_hash_value_fvn(const void* value, const TypeDescriptor& type,
uint32_t seed) {
return get_hash_value_fvn(value, type.type, seed);
}
// Get the hash value using the fvn hash function. Using different seeds with FVN
// results in different hash functions. get_hash_value() does not have this property
// and cannot be safely used as the first step in data repartitioning.
// However, get_hash_value() can be significantly faster.
// TODO: fix get_hash_value
static uint32_t zlib_crc32(const void* value, const TypeDescriptor& type, uint32_t seed);
// Same as the up function, only use in vec exec engine.
static uint32_t zlib_crc32(const void* value, size_t len, const TypeDescriptor& type,
uint32_t seed);
// Compares both values.
// Return value is < 0 if v1 < v2, 0 if v1 == v2, > 0 if v1 > v2.
static int compare(const void* v1, const void* v2, const TypeDescriptor& type);
// Writes the bytes of a given value into the slot of a tuple.
// For string values, the string data is copied into memory allocated from 'pool'
// only if pool is non-nullptr.
static void write(const void* value, Tuple* tuple, const SlotDescriptor* slot_desc,
MemPool* pool);
// Writes 'src' into 'dst' for type.
// For string values, the string data is copied into 'pool' if pool is non-nullptr.
// src must be non-nullptr.
static void write(const void* src, void* dst, const TypeDescriptor& type, MemPool* pool);
// Writes 'src' into 'dst' for type.
// String values are copied into *buffer and *buffer is updated by the length. *buf
// must be preallocated to be large enough.
static void write(const void* src, const TypeDescriptor& type, void* dst, uint8_t** buf);
// Returns true if v1 == v2.
// This is more performant than compare() == 0 for string equality, mostly because of
// the length comparison check.
static bool eq(const void* v1, const void* v2, const TypeDescriptor& type);
static bool lt(const void* v1, const void* v2, const TypeDescriptor& type);
};
inline bool RawValue::lt(const void* v1, const void* v2, const TypeDescriptor& type) {
const StringValue* string_value1;
const StringValue* string_value2;
switch (type.type) {
case TYPE_BOOLEAN:
return *reinterpret_cast<const bool*>(v1) < *reinterpret_cast<const bool*>(v2);
case TYPE_TINYINT:
return *reinterpret_cast<const int8_t*>(v1) < *reinterpret_cast<const int8_t*>(v2);
case TYPE_SMALLINT:
return *reinterpret_cast<const int16_t*>(v1) < *reinterpret_cast<const int16_t*>(v2);
case TYPE_INT:
return *reinterpret_cast<const int32_t*>(v1) < *reinterpret_cast<const int32_t*>(v2);
case TYPE_BIGINT:
return *reinterpret_cast<const int64_t*>(v1) < *reinterpret_cast<const int64_t*>(v2);
case TYPE_FLOAT:
return *reinterpret_cast<const float*>(v1) < *reinterpret_cast<const float*>(v2);
case TYPE_DOUBLE:
return *reinterpret_cast<const double*>(v1) < *reinterpret_cast<const double*>(v2);
case TYPE_CHAR:
case TYPE_VARCHAR:
case TYPE_HLL:
case TYPE_STRING:
string_value1 = reinterpret_cast<const StringValue*>(v1);
string_value2 = reinterpret_cast<const StringValue*>(v2);
return string_value1->lt(*string_value2);
case TYPE_DATE:
case TYPE_DATETIME:
return *reinterpret_cast<const DateTimeValue*>(v1) <
*reinterpret_cast<const DateTimeValue*>(v2);
case TYPE_DATEV2:
return *reinterpret_cast<const vectorized::DateV2Value*>(v1) <
*reinterpret_cast<const vectorized::DateV2Value*>(v2);
case TYPE_DECIMALV2:
return reinterpret_cast<const PackedInt128*>(v1)->value <
reinterpret_cast<const PackedInt128*>(v2)->value;
case TYPE_DECIMAL32:
return *reinterpret_cast<const int32_t*>(v1) < *reinterpret_cast<const int32_t*>(v2);
case TYPE_DECIMAL64:
return *reinterpret_cast<const int64_t*>(v1) < *reinterpret_cast<const int64_t*>(v2);
case TYPE_DECIMAL128:
return reinterpret_cast<const PackedInt128*>(v1)->value <
reinterpret_cast<const PackedInt128*>(v2)->value;
case TYPE_LARGEINT:
return reinterpret_cast<const PackedInt128*>(v1)->value <
reinterpret_cast<const PackedInt128*>(v2)->value;
default:
DCHECK(false) << "invalid type: " << type;
return 0;
};
}
inline bool RawValue::eq(const void* v1, const void* v2, const TypeDescriptor& type) {
const StringValue* string_value1;
const StringValue* string_value2;
switch (type.type) {
case TYPE_BOOLEAN:
return *reinterpret_cast<const bool*>(v1) == *reinterpret_cast<const bool*>(v2);
case TYPE_TINYINT:
return *reinterpret_cast<const int8_t*>(v1) == *reinterpret_cast<const int8_t*>(v2);
case TYPE_SMALLINT:
return *reinterpret_cast<const int16_t*>(v1) == *reinterpret_cast<const int16_t*>(v2);
case TYPE_INT:
return *reinterpret_cast<const int32_t*>(v1) == *reinterpret_cast<const int32_t*>(v2);
case TYPE_BIGINT:
return *reinterpret_cast<const int64_t*>(v1) == *reinterpret_cast<const int64_t*>(v2);
case TYPE_FLOAT:
return *reinterpret_cast<const float*>(v1) == *reinterpret_cast<const float*>(v2);
case TYPE_DOUBLE:
return *reinterpret_cast<const double*>(v1) == *reinterpret_cast<const double*>(v2);
case TYPE_CHAR:
case TYPE_VARCHAR:
case TYPE_HLL:
case TYPE_STRING:
string_value1 = reinterpret_cast<const StringValue*>(v1);
string_value2 = reinterpret_cast<const StringValue*>(v2);
return string_value1->eq(*string_value2);
case TYPE_DATE:
case TYPE_DATETIME:
return *reinterpret_cast<const DateTimeValue*>(v1) ==
*reinterpret_cast<const DateTimeValue*>(v2);
case TYPE_DATEV2:
return *reinterpret_cast<const vectorized::DateV2Value*>(v1) ==
*reinterpret_cast<const vectorized::DateV2Value*>(v2);
case TYPE_DECIMALV2:
return reinterpret_cast<const PackedInt128*>(v1)->value ==
reinterpret_cast<const PackedInt128*>(v2)->value;
case TYPE_DECIMAL32:
return *reinterpret_cast<const int32_t*>(v1) == *reinterpret_cast<const int32_t*>(v2);
case TYPE_DECIMAL64:
return *reinterpret_cast<const int64_t*>(v1) == *reinterpret_cast<const int64_t*>(v2);
case TYPE_DECIMAL128:
return reinterpret_cast<const PackedInt128*>(v1)->value ==
reinterpret_cast<const PackedInt128*>(v2)->value;
case TYPE_LARGEINT:
return reinterpret_cast<const PackedInt128*>(v1)->value ==
reinterpret_cast<const PackedInt128*>(v2)->value;
default:
DCHECK(false) << "invalid type: " << type;
return 0;
};
}
// Use boost::hash_combine for corner cases. boost::hash_combine is reimplemented
// here to use int32t's (instead of size_t)
// boost::hash_combine does:
// seed ^= v + 0x9e3779b9 + (seed << 6) + (seed >> 2);
inline uint32_t RawValue::get_hash_value(const void* v, const PrimitiveType& type, uint32_t seed) {
// Hash_combine with v = 0
if (v == nullptr) {
uint32_t value = 0x9e3779b9;
return seed ^ (value + (seed << 6) + (seed >> 2));
}
switch (type) {
case TYPE_VARCHAR:
case TYPE_CHAR:
case TYPE_HLL:
case TYPE_STRING: {
const StringValue* string_value = reinterpret_cast<const StringValue*>(v);
return HashUtil::hash(string_value->ptr, string_value->len, seed);
}
case TYPE_BOOLEAN: {
uint32_t value = *reinterpret_cast<const bool*>(v) + 0x9e3779b9;
return seed ^ (value + (seed << 6) + (seed >> 2));
}
case TYPE_TINYINT:
return HashUtil::hash(v, 1, seed);
case TYPE_SMALLINT:
return HashUtil::hash(v, 2, seed);
case TYPE_INT:
return HashUtil::hash(v, 4, seed);
case TYPE_BIGINT:
return HashUtil::hash(v, 8, seed);
case TYPE_FLOAT:
return HashUtil::hash(v, 4, seed);
case TYPE_DOUBLE:
return HashUtil::hash(v, 8, seed);
case TYPE_DATE:
case TYPE_DATETIME:
return HashUtil::hash(v, 16, seed);
case TYPE_DATEV2:
return HashUtil::hash(v, 4, seed);
case TYPE_DECIMALV2:
return HashUtil::hash(v, 16, seed);
case TYPE_DECIMAL32:
return HashUtil::hash(v, 4, seed);
case TYPE_DECIMAL64:
return HashUtil::hash(v, 8, seed);
case TYPE_DECIMAL128:
return HashUtil::hash(v, 16, seed);
case TYPE_LARGEINT:
return HashUtil::hash(v, 16, seed);
default:
DCHECK(false) << "invalid type: " << type;
return 0;
}
}
inline uint32_t RawValue::get_hash_value_fvn(const void* v, const PrimitiveType& type,
uint32_t seed) {
// Hash_combine with v = 0
if (v == nullptr) {
uint32_t value = 0x9e3779b9;
return seed ^ (value + (seed << 6) + (seed >> 2));
}
switch (type) {
case TYPE_VARCHAR:
case TYPE_CHAR:
case TYPE_HLL:
case TYPE_OBJECT:
case TYPE_STRING: {
const StringValue* string_value = reinterpret_cast<const StringValue*>(v);
return HashUtil::fnv_hash(string_value->ptr, string_value->len, seed);
}
case TYPE_BOOLEAN: {
uint32_t value = *reinterpret_cast<const bool*>(v) + 0x9e3779b9;
return seed ^ (value + (seed << 6) + (seed >> 2));
}
case TYPE_TINYINT:
return HashUtil::fnv_hash(v, 1, seed);
case TYPE_SMALLINT:
return HashUtil::fnv_hash(v, 2, seed);
case TYPE_INT:
return HashUtil::fnv_hash(v, 4, seed);
case TYPE_BIGINT:
return HashUtil::fnv_hash(v, 8, seed);
case TYPE_FLOAT:
return HashUtil::fnv_hash(v, 4, seed);
case TYPE_DOUBLE:
return HashUtil::fnv_hash(v, 8, seed);
case TYPE_DATE:
case TYPE_DATETIME:
return HashUtil::fnv_hash(v, 16, seed);
case TYPE_DATEV2:
return HashUtil::fnv_hash(v, 4, seed);
case TYPE_DECIMALV2:
return HashUtil::fnv_hash(v, 16, seed);
case TYPE_DECIMAL32:
return HashUtil::fnv_hash(v, 4, seed);
case TYPE_DECIMAL64:
return HashUtil::fnv_hash(v, 8, seed);
case TYPE_DECIMAL128:
return HashUtil::fnv_hash(v, 16, seed);
case TYPE_LARGEINT:
return HashUtil::fnv_hash(v, 16, seed);
default:
DCHECK(false) << "invalid type: " << type;
return 0;
}
}
// NOTE: this is just for split data, decimal use old doris hash function
// Because crc32 hardware is not equal with zlib crc32
inline uint32_t RawValue::zlib_crc32(const void* v, const TypeDescriptor& type, uint32_t seed) {
// Hash_combine with v = 0
if (v == nullptr) {
uint32_t value = 0x9e3779b9;
return seed ^ (value + (seed << 6) + (seed >> 2));
}
switch (type.type) {
case TYPE_VARCHAR:
case TYPE_HLL:
case TYPE_STRING: {
const StringValue* string_value = reinterpret_cast<const StringValue*>(v);
return HashUtil::zlib_crc_hash(string_value->ptr, string_value->len, seed);
}
case TYPE_CHAR: {
// TODO(zc): ugly, use actual value to compute hash value
const StringValue* string_value = reinterpret_cast<const StringValue*>(v);
int len = 0;
while (len < string_value->len) {
if (string_value->ptr[len] == '\0') {
break;
}
len++;
}
return HashUtil::zlib_crc_hash(string_value->ptr, len, seed);
}
case TYPE_BOOLEAN:
case TYPE_TINYINT:
return HashUtil::zlib_crc_hash(v, 1, seed);
case TYPE_SMALLINT:
return HashUtil::zlib_crc_hash(v, 2, seed);
case TYPE_INT:
return HashUtil::zlib_crc_hash(v, 4, seed);
case TYPE_BIGINT:
return HashUtil::zlib_crc_hash(v, 8, seed);
case TYPE_LARGEINT:
return HashUtil::zlib_crc_hash(v, 16, seed);
case TYPE_FLOAT:
return HashUtil::zlib_crc_hash(v, 4, seed);
case TYPE_DOUBLE:
return HashUtil::zlib_crc_hash(v, 8, seed);
case TYPE_DATE:
case TYPE_DATETIME: {
const DateTimeValue* date_val = (const DateTimeValue*)v;
char buf[64];
int len = date_val->to_buffer(buf);
return HashUtil::zlib_crc_hash(buf, len, seed);
}
case TYPE_DATEV2: {
const vectorized::DateV2Value* date_v2_val = (const vectorized::DateV2Value*)v;
char buf[64];
int len = date_v2_val->to_buffer(buf);
return HashUtil::zlib_crc_hash(buf, len, seed);
}
case TYPE_DECIMALV2: {
const DecimalV2Value* dec_val = (const DecimalV2Value*)v;
int64_t int_val = dec_val->int_value();
int32_t frac_val = dec_val->frac_value();
seed = HashUtil::zlib_crc_hash(&int_val, sizeof(int_val), seed);
return HashUtil::zlib_crc_hash(&frac_val, sizeof(frac_val), seed);
}
case TYPE_DECIMAL32:
return HashUtil::zlib_crc_hash(v, 4, seed);
case TYPE_DECIMAL64:
return HashUtil::zlib_crc_hash(v, 8, seed);
case TYPE_DECIMAL128:
return HashUtil::zlib_crc_hash(v, 16, seed);
default:
DCHECK(false) << "invalid type: " << type;
return 0;
}
}
// NOTE: this is just for split data, decimal use old doris hash function
// Because crc32 hardware is not equal with zlib crc32
inline uint32_t RawValue::zlib_crc32(const void* v, size_t len, const TypeDescriptor& type,
uint32_t seed) {
// Hash_combine with v = 0
if (v == nullptr) {
uint32_t value = 0x9e3779b9;
return seed ^ (value + (seed << 6) + (seed >> 2));
}
switch (type.type) {
case TYPE_VARCHAR:
case TYPE_HLL:
case TYPE_STRING:
case TYPE_CHAR: {
return HashUtil::zlib_crc_hash(v, len, seed);
}
case TYPE_BOOLEAN:
case TYPE_TINYINT:
return HashUtil::zlib_crc_hash(v, 1, seed);
case TYPE_SMALLINT:
return HashUtil::zlib_crc_hash(v, 2, seed);
case TYPE_INT:
return HashUtil::zlib_crc_hash(v, 4, seed);
case TYPE_BIGINT:
return HashUtil::zlib_crc_hash(v, 8, seed);
case TYPE_LARGEINT:
return HashUtil::zlib_crc_hash(v, 16, seed);
case TYPE_FLOAT:
return HashUtil::zlib_crc_hash(v, 4, seed);
case TYPE_DOUBLE:
return HashUtil::zlib_crc_hash(v, 8, seed);
case TYPE_DATE:
case TYPE_DATETIME: {
auto* date_val = (const vectorized::VecDateTimeValue*)v;
char buf[64];
int len = date_val->to_buffer(buf);
return HashUtil::zlib_crc_hash(buf, len, seed);
}
case TYPE_DATEV2: {
auto* date_v2_val = (const vectorized::DateV2Value*)v;
char buf[64];
int date_v2_len = date_v2_val->to_buffer(buf);
return HashUtil::zlib_crc_hash(buf, date_v2_len, seed);
}
case TYPE_DECIMALV2: {
const DecimalV2Value* dec_val = (const DecimalV2Value*)v;
int64_t int_val = dec_val->int_value();
int32_t frac_val = dec_val->frac_value();
seed = HashUtil::zlib_crc_hash(&int_val, sizeof(int_val), seed);
return HashUtil::zlib_crc_hash(&frac_val, sizeof(frac_val), seed);
}
case TYPE_DECIMAL32:
return HashUtil::zlib_crc_hash(v, 4, seed);
case TYPE_DECIMAL64:
return HashUtil::zlib_crc_hash(v, 8, seed);
case TYPE_DECIMAL128:
return HashUtil::zlib_crc_hash(v, 16, seed);
default:
DCHECK(false) << "invalid type: " << type;
return 0;
}
}
} // namespace doris