From 0ead048b93faeae95cf5b7387a72ef02ce87ae7a Mon Sep 17 00:00:00 2001 From: Pxl Date: Wed, 14 Sep 2022 21:25:22 +0800 Subject: [PATCH] [Enhancement](column) remove ColumnString terminating zero and add a data_version for pblock (#12456) 1. remove ColumnString terminating zero 2. add a data_version for pblock 3. change EncryptionMode to enum class --- be/src/common/config.h | 3 + be/src/exprs/encryption_functions.cpp | 18 +-- be/src/exprs/encryption_functions.h | 47 +++++--- be/src/runtime/fold_constant_executor.cpp | 1 - be/src/udf/udf.h | 2 + be/src/util/encryption_util.cpp | 70 ++++++------ be/src/util/encryption_util.h | 6 +- be/src/vec/CMakeLists.txt | 1 - .../aggregate_function_collect.h | 1 + be/src/vec/columns/column.h | 5 - be/src/vec/columns/column_array.cpp | 10 +- be/src/vec/columns/column_const.h | 5 - be/src/vec/columns/column_string.cpp | 91 +++++++++------ be/src/vec/columns/column_string.h | 81 +++++--------- be/src/vec/common/allocator_fwd.h | 1 + be/src/vec/common/columns_hashing.h | 3 +- be/src/vec/common/string_buffer.hpp | 5 +- be/src/vec/core/block.cpp | 19 +++- be/src/vec/core/block.h | 6 +- be/src/vec/core/block_info.cpp | 42 ------- be/src/vec/core/block_info.h | 74 ------------- be/src/vec/data_types/data_type.h | 8 +- be/src/vec/data_types/data_type_array.cpp | 18 +-- be/src/vec/data_types/data_type_array.h | 8 +- be/src/vec/data_types/data_type_bitmap.cpp | 7 +- be/src/vec/data_types/data_type_bitmap.h | 8 +- be/src/vec/data_types/data_type_decimal.cpp | 8 +- be/src/vec/data_types/data_type_decimal.h | 7 +- .../data_type_fixed_length_object.cpp | 8 +- .../data_type_fixed_length_object.h | 8 +- be/src/vec/data_types/data_type_hll.cpp | 7 +- be/src/vec/data_types/data_type_hll.h | 7 +- be/src/vec/data_types/data_type_nothing.cpp | 6 +- be/src/vec/data_types/data_type_nothing.h | 9 +- be/src/vec/data_types/data_type_nullable.cpp | 16 +-- be/src/vec/data_types/data_type_nullable.h | 7 +- .../vec/data_types/data_type_number_base.cpp | 8 +- be/src/vec/data_types/data_type_number_base.h | 9 +- be/src/vec/data_types/data_type_string.cpp | 65 +++++++++-- be/src/vec/data_types/data_type_string.h | 7 +- be/src/vec/exec/vaggregation_node.h | 1 + .../functions/array/function_array_distinct.h | 8 +- .../vec/functions/array/function_array_sort.h | 6 +- be/src/vec/functions/date_time_transforms.h | 35 ++---- be/src/vec/functions/function_bit.cpp | 2 +- be/src/vec/functions/function_bitmap.cpp | 8 +- be/src/vec/functions/function_cast.h | 6 +- .../function_datetime_string_to_string.h | 1 + be/src/vec/functions/function_encryption.cpp | 59 +++++----- be/src/vec/functions/function_hash.cpp | 7 +- be/src/vec/functions/function_hex.cpp | 16 +-- be/src/vec/functions/function_json.cpp | 12 +- be/src/vec/functions/function_string.cpp | 37 +++---- be/src/vec/functions/function_string.h | 44 ++++---- be/src/vec/functions/function_timestamp.cpp | 9 +- be/src/vec/functions/hll_hash.cpp | 6 +- be/src/vec/functions/like.cpp | 13 +-- be/src/vec/olap/olap_data_convertor.cpp | 5 +- be/src/vec/olap/olap_data_convertor.h | 12 +- be/src/vec/runtime/vdata_stream_recvr.cpp | 1 - be/src/vec/sink/vmysql_result_writer.cpp | 1 + .../utils/arrow_column_to_doris_column.cpp | 4 - be/test/util/encryption_util_test.cpp | 104 ++++++++++-------- be/test/vec/core/block_test.cpp | 3 +- be/test/vec/core/column_array_test.cpp | 2 +- be/test/vec/core/column_complex_test.cpp | 8 +- be/test/vec/function/function_string_test.cpp | 52 +++++---- .../arrow_column_to_doris_column_test.cpp | 1 + gensrc/proto/data.proto | 1 + 69 files changed, 552 insertions(+), 624 deletions(-) delete mode 100644 be/src/vec/core/block_info.cpp delete mode 100644 be/src/vec/core/block_info.h diff --git a/be/src/common/config.h b/be/src/common/config.h index 68050bd120..3b8edf8daf 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -257,6 +257,9 @@ CONF_Bool(enable_vectorized_compaction, "true"); // whether enable vectorized schema change/material-view/rollup task. CONF_Bool(enable_vectorized_alter_table, "true"); +// serialize data version +CONF_mInt32(block_data_version, "-1"); + // check the configuration of auto compaction in seconds when auto compaction disabled CONF_mInt32(check_auto_compaction_interval_seconds, "5"); diff --git a/be/src/exprs/encryption_functions.cpp b/be/src/exprs/encryption_functions.cpp index 9b462592c0..52eb572698 100644 --- a/be/src/exprs/encryption_functions.cpp +++ b/be/src/exprs/encryption_functions.cpp @@ -41,8 +41,8 @@ StringVal encrypt(FunctionContext* ctx, const StringVal& src, const StringVal& k cipher_text.reset(new char[cipher_len]); int cipher_text_len = 0; cipher_text_len = EncryptionUtil::encrypt(mode, (unsigned char*)src.ptr, src.len, - (unsigned char*)key.ptr, key.len, (char*)iv.ptr, true, - (unsigned char*)cipher_text.get()); + (unsigned char*)key.ptr, key.len, (char*)iv.ptr, + iv.len, true, (unsigned char*)cipher_text.get()); if (cipher_text_len < 0) { return StringVal::null(); } @@ -58,9 +58,9 @@ StringVal decrypt(FunctionContext* ctx, const StringVal& src, const StringVal& k std::unique_ptr plain_text; plain_text.reset(new char[cipher_len]); int plain_text_len = 0; - plain_text_len = - EncryptionUtil::decrypt(mode, (unsigned char*)src.ptr, src.len, (unsigned char*)key.ptr, - key.len, (char*)iv.ptr, true, (unsigned char*)plain_text.get()); + plain_text_len = EncryptionUtil::decrypt(mode, (unsigned char*)src.ptr, src.len, + (unsigned char*)key.ptr, key.len, (char*)iv.ptr, + iv.len, true, (unsigned char*)plain_text.get()); if (plain_text_len < 0) { return StringVal::null(); } @@ -80,7 +80,7 @@ StringVal EncryptionFunctions::aes_decrypt(FunctionContext* ctx, const StringVal StringVal EncryptionFunctions::aes_encrypt(FunctionContext* ctx, const StringVal& src, const StringVal& key, const StringVal& iv, const StringVal& mode) { - EncryptionMode encryption_mode = AES_128_ECB; + EncryptionMode encryption_mode = EncryptionMode::AES_128_ECB; if (mode.len != 0 && !mode.is_null) { std::string mode_str(reinterpret_cast(mode.ptr), mode.len); if (aes_mode_map.count(mode_str) == 0) { @@ -94,7 +94,7 @@ StringVal EncryptionFunctions::aes_encrypt(FunctionContext* ctx, const StringVal StringVal EncryptionFunctions::aes_decrypt(FunctionContext* ctx, const StringVal& src, const StringVal& key, const StringVal& iv, const StringVal& mode) { - EncryptionMode encryption_mode = AES_128_ECB; + EncryptionMode encryption_mode = EncryptionMode::AES_128_ECB; if (mode.len != 0 && !mode.is_null) { std::string mode_str(reinterpret_cast(mode.ptr), mode.len); if (aes_mode_map.count(mode_str) == 0) { @@ -118,7 +118,7 @@ StringVal EncryptionFunctions::sm4_decrypt(FunctionContext* ctx, const StringVal StringVal EncryptionFunctions::sm4_encrypt(FunctionContext* ctx, const StringVal& src, const StringVal& key, const StringVal& iv, const StringVal& mode) { - EncryptionMode encryption_mode = SM4_128_ECB; + EncryptionMode encryption_mode = EncryptionMode::SM4_128_ECB; if (mode.len != 0 && !mode.is_null) { std::string mode_str(reinterpret_cast(mode.ptr), mode.len); if (sm4_mode_map.count(mode_str) == 0) { @@ -132,7 +132,7 @@ StringVal EncryptionFunctions::sm4_encrypt(FunctionContext* ctx, const StringVal StringVal EncryptionFunctions::sm4_decrypt(FunctionContext* ctx, const StringVal& src, const StringVal& key, const StringVal& iv, const StringVal& mode) { - EncryptionMode encryption_mode = SM4_128_ECB; + EncryptionMode encryption_mode = EncryptionMode::SM4_128_ECB; if (mode.len != 0 && !mode.is_null) { std::string mode_str(reinterpret_cast(mode.ptr), mode.len); if (sm4_mode_map.count(mode_str) == 0) { diff --git a/be/src/exprs/encryption_functions.h b/be/src/exprs/encryption_functions.h index 0083cb4c81..e4976ab519 100644 --- a/be/src/exprs/encryption_functions.h +++ b/be/src/exprs/encryption_functions.h @@ -29,23 +29,36 @@ class Expr; struct ExprValue; class TupleRow; static StringCaseUnorderedMap aes_mode_map { - {"AES_128_ECB", AES_128_ECB}, {"AES_192_ECB", AES_192_ECB}, - {"AES_256_ECB", AES_256_ECB}, {"AES_128_CBC", AES_128_CBC}, - {"AES_192_CBC", AES_192_CBC}, {"AES_256_CBC", AES_256_CBC}, - {"AES_128_CFB", AES_128_CFB}, {"AES_192_CFB", AES_192_CFB}, - {"AES_256_CFB", AES_256_CFB}, {"AES_128_CFB1", AES_128_CFB1}, - {"AES_192_CFB1", AES_192_CFB1}, {"AES_256_CFB1", AES_256_CFB1}, - {"AES_128_CFB8", AES_128_CFB8}, {"AES_192_CFB8", AES_192_CFB8}, - {"AES_256_CFB8", AES_256_CFB8}, {"AES_128_CFB128", AES_128_CFB128}, - {"AES_192_CFB128", AES_192_CFB128}, {"AES_256_CFB128", AES_256_CFB128}, - {"AES_128_CTR", AES_128_CTR}, {"AES_192_CTR", AES_192_CTR}, - {"AES_256_CTR", AES_256_CTR}, {"AES_128_OFB", AES_128_OFB}, - {"AES_192_OFB", AES_192_OFB}, {"AES_256_OFB", AES_256_OFB}}; -static StringCaseUnorderedMap sm4_mode_map {{"SM4_128_ECB", SM4_128_ECB}, - {"SM4_128_CBC", SM4_128_CBC}, - {"SM4_128_CFB128", SM4_128_CFB128}, - {"SM4_128_OFB", SM4_128_OFB}, - {"SM4_128_CTR", SM4_128_CTR}}; + {"AES_128_ECB", EncryptionMode::AES_128_ECB}, + {"AES_192_ECB", EncryptionMode::AES_192_ECB}, + {"AES_256_ECB", EncryptionMode::AES_256_ECB}, + {"AES_128_CBC", EncryptionMode::AES_128_CBC}, + {"AES_192_CBC", EncryptionMode::AES_192_CBC}, + {"AES_256_CBC", EncryptionMode::AES_256_CBC}, + {"AES_128_CFB", EncryptionMode::AES_128_CFB}, + {"AES_192_CFB", EncryptionMode::AES_192_CFB}, + {"AES_256_CFB", EncryptionMode::AES_256_CFB}, + {"AES_128_CFB1", EncryptionMode::AES_128_CFB1}, + {"AES_192_CFB1", EncryptionMode::AES_192_CFB1}, + {"AES_256_CFB1", EncryptionMode::AES_256_CFB1}, + {"AES_128_CFB8", EncryptionMode::AES_128_CFB8}, + {"AES_192_CFB8", EncryptionMode::AES_192_CFB8}, + {"AES_256_CFB8", EncryptionMode::AES_256_CFB8}, + {"AES_128_CFB128", EncryptionMode::AES_128_CFB128}, + {"AES_192_CFB128", EncryptionMode::AES_192_CFB128}, + {"AES_256_CFB128", EncryptionMode::AES_256_CFB128}, + {"AES_128_CTR", EncryptionMode::AES_128_CTR}, + {"AES_192_CTR", EncryptionMode::AES_192_CTR}, + {"AES_256_CTR", EncryptionMode::AES_256_CTR}, + {"AES_128_OFB", EncryptionMode::AES_128_OFB}, + {"AES_192_OFB", EncryptionMode::AES_192_OFB}, + {"AES_256_OFB", EncryptionMode::AES_256_OFB}}; +static StringCaseUnorderedMap sm4_mode_map { + {"SM4_128_ECB", EncryptionMode::SM4_128_ECB}, + {"SM4_128_CBC", EncryptionMode::SM4_128_CBC}, + {"SM4_128_CFB128", EncryptionMode::SM4_128_CFB128}, + {"SM4_128_OFB", EncryptionMode::SM4_128_OFB}, + {"SM4_128_CTR", EncryptionMode::SM4_128_CTR}}; class EncryptionFunctions { public: static void init(); diff --git a/be/src/runtime/fold_constant_executor.cpp b/be/src/runtime/fold_constant_executor.cpp index 2ee1b7cde9..59d820c1ad 100644 --- a/be/src/runtime/fold_constant_executor.cpp +++ b/be/src/runtime/fold_constant_executor.cpp @@ -179,7 +179,6 @@ Status FoldConstantExecutor::_init(const TQueryGlobals& query_globals) { _runtime_profile = _runtime_state->runtime_profile(); _runtime_profile->set_name("FoldConstantExpr"); - SCOPED_ATTACH_TASK(_runtime_state.get()); _mem_tracker = std::make_unique("FoldConstantExpr"); _mem_pool.reset(new MemPool(_mem_tracker.get())); diff --git a/be/src/udf/udf.h b/be/src/udf/udf.h index 907ee2dd3d..deae429f31 100644 --- a/be/src/udf/udf.h +++ b/be/src/udf/udf.h @@ -762,6 +762,8 @@ struct StringVal : public AnyVal { // copy of ptr so the underlying string must exist as long as this StringVal does. StringVal(const char* ptr) : len(strlen(ptr)), ptr((uint8_t*)ptr) {} + StringVal(const char* ptr, int64_t len) : len(len), ptr((uint8_t*)ptr) {} + static StringVal null() { StringVal sv; sv.is_null = true; diff --git a/be/src/util/encryption_util.cpp b/be/src/util/encryption_util.cpp index b9396e968f..b490a30fc3 100644 --- a/be/src/util/encryption_util.cpp +++ b/be/src/util/encryption_util.cpp @@ -31,63 +31,63 @@ static const int ENCRYPTION_MAX_KEY_LENGTH = 256; const EVP_CIPHER* get_evp_type(const EncryptionMode mode) { switch (mode) { - case AES_128_ECB: + case EncryptionMode::AES_128_ECB: return EVP_aes_128_ecb(); - case AES_128_CBC: + case EncryptionMode::AES_128_CBC: return EVP_aes_128_cbc(); - case AES_128_CFB: + case EncryptionMode::AES_128_CFB: return EVP_aes_128_cfb(); - case AES_128_CFB1: + case EncryptionMode::AES_128_CFB1: return EVP_aes_128_cfb1(); - case AES_128_CFB8: + case EncryptionMode::AES_128_CFB8: return EVP_aes_128_cfb8(); - case AES_128_CFB128: + case EncryptionMode::AES_128_CFB128: return EVP_aes_128_cfb128(); - case AES_128_CTR: + case EncryptionMode::AES_128_CTR: return EVP_aes_128_ctr(); - case AES_128_OFB: + case EncryptionMode::AES_128_OFB: return EVP_aes_128_ofb(); - case AES_192_ECB: + case EncryptionMode::AES_192_ECB: return EVP_aes_192_ecb(); - case AES_192_CBC: + case EncryptionMode::AES_192_CBC: return EVP_aes_192_cbc(); - case AES_192_CFB: + case EncryptionMode::AES_192_CFB: return EVP_aes_192_cfb(); - case AES_192_CFB1: + case EncryptionMode::AES_192_CFB1: return EVP_aes_192_cfb1(); - case AES_192_CFB8: + case EncryptionMode::AES_192_CFB8: return EVP_aes_192_cfb8(); - case AES_192_CFB128: + case EncryptionMode::AES_192_CFB128: return EVP_aes_192_cfb128(); - case AES_192_CTR: + case EncryptionMode::AES_192_CTR: return EVP_aes_192_ctr(); - case AES_192_OFB: + case EncryptionMode::AES_192_OFB: return EVP_aes_192_ofb(); - case AES_256_ECB: + case EncryptionMode::AES_256_ECB: return EVP_aes_256_ecb(); - case AES_256_CBC: + case EncryptionMode::AES_256_CBC: return EVP_aes_256_cbc(); - case AES_256_CFB: + case EncryptionMode::AES_256_CFB: return EVP_aes_256_cfb(); - case AES_256_CFB1: + case EncryptionMode::AES_256_CFB1: return EVP_aes_256_cfb1(); - case AES_256_CFB8: + case EncryptionMode::AES_256_CFB8: return EVP_aes_256_cfb8(); - case AES_256_CFB128: + case EncryptionMode::AES_256_CFB128: return EVP_aes_256_cfb128(); - case AES_256_CTR: + case EncryptionMode::AES_256_CTR: return EVP_aes_256_ctr(); - case AES_256_OFB: + case EncryptionMode::AES_256_OFB: return EVP_aes_256_ofb(); - case SM4_128_CBC: + case EncryptionMode::SM4_128_CBC: return EVP_sm4_cbc(); - case SM4_128_ECB: + case EncryptionMode::SM4_128_ECB: return EVP_sm4_ecb(); - case SM4_128_CFB128: + case EncryptionMode::SM4_128_CFB128: return EVP_sm4_cfb128(); - case SM4_128_OFB: + case EncryptionMode::SM4_128_OFB: return EVP_sm4_ofb(); - case SM4_128_CTR: + case EncryptionMode::SM4_128_CTR: return EVP_sm4_ctr(); default: return nullptr; @@ -128,7 +128,7 @@ static uint mode_key_sizes[] = { static void create_key(const unsigned char* origin_key, uint32_t key_length, uint8_t* encrypt_key, EncryptionMode mode) { - const uint key_size = mode_key_sizes[mode] / 8; + const uint key_size = mode_key_sizes[int(mode)] / 8; uint8_t* origin_key_end = ((uint8_t*)origin_key) + key_length; /* origin key boundary*/ uint8_t* encrypt_key_end; /* encrypt key boundary */ @@ -173,7 +173,8 @@ static int do_encrypt(EVP_CIPHER_CTX* cipher_ctx, const EVP_CIPHER* cipher, int EncryptionUtil::encrypt(EncryptionMode mode, const unsigned char* source, uint32_t source_length, const unsigned char* key, uint32_t key_length, - const char* iv_str, bool padding, unsigned char* encrypt) { + const char* iv_str, int iv_input_length, bool padding, + unsigned char* encrypt) { const EVP_CIPHER* cipher = get_evp_type(mode); /* The encrypt key to be used for encryption */ unsigned char encrypt_key[ENCRYPTION_MAX_KEY_LENGTH / 8]; @@ -188,7 +189,7 @@ int EncryptionUtil::encrypt(EncryptionMode mode, const unsigned char* source, if (iv_str) { init_vec = &iv_default[0]; - memcpy(init_vec, iv_str, strnlen(iv_str, EVP_MAX_IV_LENGTH)); + memcpy(init_vec, iv_str, std::min(iv_input_length, EVP_MAX_IV_LENGTH)); init_vec[iv_length] = '\0'; } EVP_CIPHER_CTX* cipher_ctx = EVP_CIPHER_CTX_new(); @@ -230,7 +231,8 @@ static int do_decrypt(EVP_CIPHER_CTX* cipher_ctx, const EVP_CIPHER* cipher, int EncryptionUtil::decrypt(EncryptionMode mode, const unsigned char* encrypt, uint32_t encrypt_length, const unsigned char* key, uint32_t key_length, - const char* iv_str, bool padding, unsigned char* decrypt_content) { + const char* iv_str, int iv_input_length, bool padding, + unsigned char* decrypt_content) { const EVP_CIPHER* cipher = get_evp_type(mode); /* The encrypt key to be used for decryption */ @@ -246,7 +248,7 @@ int EncryptionUtil::decrypt(EncryptionMode mode, const unsigned char* encrypt, if (iv_str) { init_vec = &iv_default[0]; - memcpy(init_vec, iv_str, strnlen(iv_str, EVP_MAX_IV_LENGTH)); + memcpy(init_vec, iv_str, std::min(iv_input_length, EVP_MAX_IV_LENGTH)); init_vec[iv_length] = '\0'; } EVP_CIPHER_CTX* cipher_ctx = EVP_CIPHER_CTX_new(); diff --git a/be/src/util/encryption_util.h b/be/src/util/encryption_util.h index 711a817a17..8e61a11995 100644 --- a/be/src/util/encryption_util.h +++ b/be/src/util/encryption_util.h @@ -21,7 +21,7 @@ namespace doris { -enum EncryptionMode { +enum class EncryptionMode { AES_128_ECB, AES_192_ECB, AES_256_ECB, @@ -59,11 +59,11 @@ class EncryptionUtil { public: static int encrypt(EncryptionMode mode, const unsigned char* source, uint32_t source_length, const unsigned char* key, uint32_t key_length, const char* iv_str, - bool padding, unsigned char* encrypt); + int iv_input_length, bool padding, unsigned char* encrypt); static int decrypt(EncryptionMode mode, const unsigned char* encrypt, uint32_t encrypt_length, const unsigned char* key, uint32_t key_length, const char* iv_str, - bool padding, unsigned char* decrypt_content); + int iv_input_length, bool padding, unsigned char* decrypt_content); }; } // namespace doris diff --git a/be/src/vec/CMakeLists.txt b/be/src/vec/CMakeLists.txt index f6214a0501..d8fc97e6b8 100644 --- a/be/src/vec/CMakeLists.txt +++ b/be/src/vec/CMakeLists.txt @@ -56,7 +56,6 @@ set(VEC_FILES common/pod_array.cpp common/string_utils/string_utils.cpp core/block.cpp - core/block_info.cpp core/column_with_type_and_name.cpp core/field.cpp core/field.cpp diff --git a/be/src/vec/aggregate_functions/aggregate_function_collect.h b/be/src/vec/aggregate_functions/aggregate_function_collect.h index 5df33ab6f2..caedd5131f 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_collect.h +++ b/be/src/vec/aggregate_functions/aggregate_function_collect.h @@ -20,6 +20,7 @@ #include "common/status.h" #include "vec/aggregate_functions/aggregate_function.h" #include "vec/aggregate_functions/key_holder_helpers.h" +#include "vec/columns/column_array.h" #include "vec/common/aggregation_common.h" #include "vec/common/hash_table/hash_set.h" #include "vec/common/pod_array_fwd.h" diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h index 3e4bc62cc0..9334c97476 100644 --- a/be/src/vec/columns/column.h +++ b/be/src/vec/columns/column.h @@ -22,7 +22,6 @@ #include "runtime/define_primitive_type.h" #include "vec/common/cow.h" -#include "vec/common/exception.h" #include "vec/common/pod_array_fwd.h" #include "vec/common/string_ref.h" #include "vec/common/typeid_cast.h" @@ -147,10 +146,6 @@ public: /// Is used to optimize some computations (in aggregation, for example). virtual StringRef get_data_at(size_t n) const = 0; - /// Like getData, but has special behavior for columns that contain variable-length strings. - /// Returns zero-ending memory chunk (i.e. its size is 1 byte longer). - virtual StringRef get_data_at_with_terminating_zero(size_t n) const { return get_data_at(n); } - /// If column stores integers, it returns n-th element transformed to UInt64 using static_cast. /// If column stores floating point numbers, bits of n-th elements are copied to lower bits of UInt64, the remaining bits are zeros. /// Is used to optimize some computations (in aggregation, for example). diff --git a/be/src/vec/columns/column_array.cpp b/be/src/vec/columns/column_array.cpp index 473cde0982..f9b1c53467 100644 --- a/be/src/vec/columns/column_array.cpp +++ b/be/src/vec/columns/column_array.cpp @@ -152,13 +152,15 @@ StringRef ColumnArray::get_data_at(size_t n) const { */ size_t offset_of_first_elem = offset_at(n); - StringRef first = get_data().get_data_at_with_terminating_zero(offset_of_first_elem); + StringRef first = get_data().get_data_at(offset_of_first_elem); size_t array_size = size_at(n); - if (array_size == 0) return StringRef(first.data, 0); + if (array_size == 0) { + return StringRef(first.data, 0); + } - size_t offset_of_last_elem = get_offsets()[n] - 1; - StringRef last = get_data().get_data_at_with_terminating_zero(offset_of_last_elem); + size_t offset_of_last_elem = offset_at(n + 1) - 1; + StringRef last = get_data().get_data_at(offset_of_last_elem); return StringRef(first.data, last.data + last.size - first.data); } diff --git a/be/src/vec/columns/column_const.h b/be/src/vec/columns/column_const.h index 422316075b..3f4e735780 100644 --- a/be/src/vec/columns/column_const.h +++ b/be/src/vec/columns/column_const.h @@ -23,7 +23,6 @@ #include "vec/columns/column.h" #include "vec/columns/column_nullable.h" #include "vec/common/assert_cast.h" -#include "vec/common/exception.h" #include "vec/common/typeid_cast.h" #include "vec/core/field.h" @@ -67,10 +66,6 @@ public: StringRef get_data_at(size_t) const override { return data->get_data_at(0); } - StringRef get_data_at_with_terminating_zero(size_t) const override { - return data->get_data_at_with_terminating_zero(0); - } - UInt64 get64(size_t) const override { return data->get64(0); } UInt64 get_uint(size_t) const override { return data->get_uint(0); } diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index ccc1074e29..9d2d43b48a 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -31,7 +31,9 @@ namespace doris::vectorized { MutableColumnPtr ColumnString::clone_resized(size_t to_size) const { auto res = ColumnString::create(); - if (to_size == 0) return res; + if (to_size == 0) { + return res; + } size_t from_size = size(); @@ -43,22 +45,12 @@ MutableColumnPtr ColumnString::clone_resized(size_t to_size) const { } else { /// Copy column and append empty strings for extra elements. - Offset offset = 0; if (from_size > 0) { res->offsets.assign(offsets.begin(), offsets.end()); res->chars.assign(chars.begin(), chars.end()); - offset = offsets.back(); } - /// Empty strings are just zero terminating bytes. - - res->chars.resize_fill(res->chars.size() + to_size - from_size); - - res->offsets.resize(to_size); - for (size_t i = from_size; i < to_size; ++i) { - ++offset; - res->offsets[i] = offset; - } + res->offsets.resize_fill(to_size, chars.size()); } return res; @@ -77,7 +69,9 @@ MutableColumnPtr ColumnString::get_shinked_column() { } void ColumnString::insert_range_from(const IColumn& src, size_t start, size_t length) { - if (length == 0) return; + if (length == 0) { + return; + } const ColumnString& src_concrete = assert_cast(src); @@ -99,9 +93,10 @@ void ColumnString::insert_range_from(const IColumn& src, size_t start, size_t le size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see PaddedPODArray offsets.resize(old_size + length); - for (size_t i = 0; i < length; ++i) + for (size_t i = 0; i < length; ++i) { offsets[old_size + i] = src_concrete.offsets[start + i] - nested_offset + prev_max_offset; + } } } @@ -137,7 +132,9 @@ void ColumnString::update_crcs_with_value(std::vector& hashes, doris:: } ColumnPtr ColumnString::filter(const Filter& filt, ssize_t result_size_hint) const { - if (offsets.size() == 0) return ColumnString::create(); + if (offsets.size() == 0) { + return ColumnString::create(); + } auto res = ColumnString::create(); @@ -152,27 +149,32 @@ ColumnPtr ColumnString::filter(const Filter& filt, ssize_t result_size_hint) con ColumnPtr ColumnString::permute(const Permutation& perm, size_t limit) const { size_t size = offsets.size(); - if (limit == 0) + if (limit == 0) { limit = size; - else + } else { limit = std::min(size, limit); + } if (perm.size() < limit) { LOG(FATAL) << "Size of permutation is less than required."; } - if (limit == 0) return ColumnString::create(); + if (limit == 0) { + return ColumnString::create(); + } auto res = ColumnString::create(); Chars& res_chars = res->chars; Offsets& res_offsets = res->offsets; - if (limit == size) + if (limit == size) { res_chars.resize(chars.size()); - else { + } else { size_t new_chars_size = 0; - for (size_t i = 0; i < limit; ++i) new_chars_size += size_at(perm[i]); + for (size_t i = 0; i < limit; ++i) { + new_chars_size += size_at(perm[i]); + } res_chars.resize(new_chars_size); } @@ -285,7 +287,9 @@ void ColumnString::deserialize_vec_with_null_map(std::vector& keys, template ColumnPtr ColumnString::index_impl(const PaddedPODArray& indexes, size_t limit) const { - if (limit == 0) return ColumnString::create(); + if (limit == 0) { + return ColumnString::create(); + } auto res = ColumnString::create(); @@ -293,7 +297,9 @@ ColumnPtr ColumnString::index_impl(const PaddedPODArray& indexes, size_t l Offsets& res_offsets = res->offsets; size_t new_chars_size = 0; - for (size_t i = 0; i < limit; ++i) new_chars_size += size_at(indexes[i]); + for (size_t i = 0; i < limit; ++i) { + new_chars_size += size_at(indexes[i]); + } res_chars.resize(new_chars_size); res_offsets.resize(limit); @@ -321,8 +327,8 @@ struct ColumnString::less { explicit less(const ColumnString& parent_) : parent(parent_) {} bool operator()(size_t lhs, size_t rhs) const { int res = memcmp_small_allow_overflow15( - parent.chars.data() + parent.offset_at(lhs), parent.size_at(lhs) - 1, - parent.chars.data() + parent.offset_at(rhs), parent.size_at(rhs) - 1); + parent.chars.data() + parent.offset_at(lhs), parent.size_at(lhs), + parent.chars.data() + parent.offset_at(rhs), parent.size_at(rhs)); return positive ? (res < 0) : (res > 0); } @@ -332,20 +338,26 @@ void ColumnString::get_permutation(bool reverse, size_t limit, int /*nan_directi Permutation& res) const { size_t s = offsets.size(); res.resize(s); - for (size_t i = 0; i < s; ++i) res[i] = i; + for (size_t i = 0; i < s; ++i) { + res[i] = i; + } - if (limit >= s) limit = 0; + if (limit >= s) { + limit = 0; + } if (limit) { - if (reverse) + if (reverse) { std::partial_sort(res.begin(), res.begin() + limit, res.end(), less(*this)); - else + } else { std::partial_sort(res.begin(), res.begin() + limit, res.end(), less(*this)); + } } else { - if (reverse) + if (reverse) { std::sort(res.begin(), res.end(), less(*this)); - else + } else { std::sort(res.begin(), res.end(), less(*this)); + } } } @@ -357,7 +369,9 @@ ColumnPtr ColumnString::replicate(const Offsets& replicate_offsets) const { auto res = ColumnString::create(); - if (0 == col_size) return res; + if (0 == col_size) { + return res; + } Chars& res_chars = res->chars; Offsets& res_offsets = res->offsets; @@ -390,7 +404,9 @@ ColumnPtr ColumnString::replicate(const Offsets& replicate_offsets) const { void ColumnString::replicate(const uint32_t* counts, size_t target_size, IColumn& column) const { size_t col_size = size(); - if (0 == col_size) return; + if (0 == col_size) { + return; + } auto& res = reinterpret_cast(column); @@ -439,7 +455,9 @@ void ColumnString::get_extremes(Field& min, Field& max) const { size_t col_size = size(); - if (col_size == 0) return; + if (col_size == 0) { + return; + } size_t min_idx = 0; size_t max_idx = 0; @@ -447,10 +465,11 @@ void ColumnString::get_extremes(Field& min, Field& max) const { less less_op(*this); for (size_t i = 1; i < col_size; ++i) { - if (less_op(i, min_idx)) + if (less_op(i, min_idx)) { min_idx = i; - else if (less_op(max_idx, i)) + } else if (less_op(max_idx, i)) { max_idx = i; + } } get(min_idx, min); diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index 2d972ac980..9cae0509cc 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -88,20 +88,15 @@ public: Field operator[](size_t n) const override { assert(n < size()); - return Field(&chars[offset_at(n)], size_at(n) - 1); + return Field(&chars[offset_at(n)], size_at(n)); } void get(size_t n, Field& res) const override { assert(n < size()); - res.assign_string(&chars[offset_at(n)], size_at(n) - 1); + res.assign_string(&chars[offset_at(n)], size_at(n)); } StringRef get_data_at(size_t n) const override { - assert(n < size()); - return StringRef(&chars[offset_at(n)], size_at(n) - 1); - } - - StringRef get_data_at_with_terminating_zero(size_t n) const override { assert(n < size()); return StringRef(&chars[offset_at(n)], size_at(n)); } @@ -115,7 +110,7 @@ public: void insert(const Field& x) override { const String& s = doris::vectorized::get(x); const size_t old_size = chars.size(); - const size_t size_to_append = s.size() + 1; + const size_t size_to_append = s.size(); const size_t new_size = old_size + size_to_append; chars.resize(new_size); @@ -132,9 +127,8 @@ public: const size_t size_to_append = src.offsets[n] - src.offsets[n - 1]; /// -1th index is Ok, see PaddedPODArray. - if (size_to_append == 1) { + if (!size_to_append) { /// shortcut for empty string - chars.push_back(0); offsets.push_back(chars.size()); } else { const size_t old_size = chars.size(); @@ -150,13 +144,12 @@ public: void insert_data(const char* pos, size_t length) override { const size_t old_size = chars.size(); - const size_t new_size = old_size + length + 1; + const size_t new_size = old_size + length; - chars.resize(new_size); if (length) { + chars.resize(new_size); memcpy(chars.data() + old_size, pos, length); } - chars[old_size + length] = 0; offsets.push_back(new_size); } @@ -164,7 +157,7 @@ public: uint32_t* start_offset_array, size_t num) override { size_t new_size = 0; for (size_t i = 0; i < num; i++) { - new_size += len_array[i] + 1; + new_size += len_array[i]; } const size_t old_size = chars.size(); @@ -175,9 +168,10 @@ public: for (size_t i = 0; i < num; i++) { uint32_t len = len_array[i]; uint32_t start_offset = start_offset_array[i]; - if (len) memcpy(data + offset, data_array + start_offset, len); - data[offset + len] = 0; - offset += len + 1; + if (len) { + memcpy(data + offset, data_array + start_offset, len); + } + offset += len; offsets.push_back(offset); } }; @@ -185,7 +179,7 @@ public: void insert_many_strings(const StringRef* strings, size_t num) override { size_t new_size = 0; for (size_t i = 0; i < num; i++) { - new_size += strings[i].size + 1; + new_size += strings[i].size; } const size_t old_size = chars.size(); @@ -195,9 +189,10 @@ public: size_t offset = old_size; for (size_t i = 0; i < num; i++) { uint32_t len = strings[i].size; - if (len) memcpy(data + offset, strings[i].data, len); - data[offset + len] = 0; - offset += len + 1; + if (len) { + memcpy(data + offset, strings[i].data, len); + } + offset += len; offsets.push_back(offset); } } @@ -210,16 +205,6 @@ public: } } - /// Like getData, but inserting data should be zero-ending (i.e. length is 1 byte greater than real string size). - void insert_data_with_terminating_zero(const char* pos, size_t length) { - const size_t old_size = chars.size(); - const size_t new_size = old_size + length; - - chars.resize(new_size); - memcpy(chars.data() + old_size, pos, length); - offsets.push_back(new_size); - } - void pop_back(size_t n) override { size_t nested_n = offsets.back() - offset_at(offsets.size() - n); chars.resize(chars.size() - nested_n); @@ -278,31 +263,17 @@ public: template ColumnPtr index_impl(const PaddedPODArray& indexes, size_t limit) const; - void insert_default() override { - chars.push_back(0); - offsets.push_back(offsets.back() + 1); - } + void insert_default() override { offsets.push_back(chars.size()); } void insert_many_defaults(size_t length) override { - size_t chars_old_size = chars.size(); - chars.resize(chars_old_size + length); - memset(chars.data() + chars_old_size, 0, length); - - const size_t old_size = offsets.size(); - const size_t new_size = old_size + length; - const auto num = offsets.back() + 1; - offsets.resize_fill(new_size, num); - for (size_t i = old_size, j = 0; i < new_size; i++, j++) { - offsets[i] += j; - } + offsets.resize_fill(offsets.size() + length, chars.size()); } int compare_at(size_t n, size_t m, const IColumn& rhs_, int /*nan_direction_hint*/) const override { const ColumnString& rhs = assert_cast(rhs_); - return memcmp_small_allow_overflow15(chars.data() + offset_at(n), size_at(n) - 1, - rhs.chars.data() + rhs.offset_at(m), - rhs.size_at(m) - 1); + return memcmp_small_allow_overflow15(chars.data() + offset_at(n), size_at(n), + rhs.chars.data() + rhs.offset_at(m), rhs.size_at(m)); } void get_permutation(bool reverse, size_t limit, int nan_direction_hint, @@ -355,12 +326,12 @@ public: if (!self_row) { chars.clear(); - offsets[self_row] = data.size + 1; + offsets[self_row] = data.size; } else { - offsets[self_row] = offsets[self_row - 1] + data.size + 1; + offsets[self_row] = offsets[self_row - 1] + data.size; } - chars.insert(data.data, data.data + data.size + 1); + chars.insert(data.data, data.data + data.size); } // should replace according to 0,1,2... ,size,0,1,2... @@ -369,12 +340,10 @@ public: if (!self_row) { chars.clear(); - offsets[self_row] = 1; + offsets[self_row] = 0; } else { - offsets[self_row] = offsets[self_row - 1] + 1; + offsets[self_row] = offsets[self_row - 1]; } - - chars.emplace_back(0); } }; diff --git a/be/src/vec/common/allocator_fwd.h b/be/src/vec/common/allocator_fwd.h index a92665d12c..eb1b7ede6c 100644 --- a/be/src/vec/common/allocator_fwd.h +++ b/be/src/vec/common/allocator_fwd.h @@ -23,6 +23,7 @@ */ #pragma once +#include template class Allocator; diff --git a/be/src/vec/common/columns_hashing.h b/be/src/vec/common/columns_hashing.h index 844fc26ca7..99ce1105fe 100644 --- a/be/src/vec/common/columns_hashing.h +++ b/be/src/vec/common/columns_hashing.h @@ -29,7 +29,6 @@ #include "vec/common/hash_table/hash_table.h" #include "vec/common/hash_table/hash_table_key_holder.h" #include "vec/common/hash_table/ph_hash_map.h" -#include "vec/common/hash_table/string_hash_map.h" #include "vec/common/unaligned.h" namespace doris::vectorized { @@ -95,7 +94,7 @@ struct HashMethodString : public columns_hashing_impl::HashMethodBase< } auto get_key_holder(ssize_t row, [[maybe_unused]] Arena& pool) const { - StringRef key(chars + offsets[row - 1], offsets[row] - offsets[row - 1] - 1); + StringRef key(chars + offsets[row - 1], offsets[row] - offsets[row - 1]); if constexpr (place_string_to_arena) { return ArenaKeyHolder {key, pool}; diff --git a/be/src/vec/common/string_buffer.hpp b/be/src/vec/common/string_buffer.hpp index 68ea3aca18..fe15035bb1 100644 --- a/be/src/vec/common/string_buffer.hpp +++ b/be/src/vec/common/string_buffer.hpp @@ -49,12 +49,11 @@ public: } void commit() override { - _data.push_back(0); - _offsets.push_back(_offsets.back() + _now_offset + 1); + _offsets.push_back(_offsets.back() + _now_offset); _now_offset = 0; } - ~VectorBufferWriter() { DCHECK(_now_offset == 0); } + ~VectorBufferWriter() override { DCHECK(_now_offset == 0); } private: ColumnString::Chars& _data; diff --git a/be/src/vec/core/block.cpp b/be/src/vec/core/block.cpp index af90143955..7da0f0e525 100644 --- a/be/src/vec/core/block.cpp +++ b/be/src/vec/core/block.cpp @@ -32,6 +32,7 @@ #include "util/block_compression.h" #include "util/simd/bits.h" #include "vec/columns/column.h" +#include "vec/columns/column_array.h" #include "vec/columns/column_const.h" #include "vec/columns/column_nullable.h" #include "vec/columns/column_string.h" @@ -62,6 +63,10 @@ Block::Block(const std::vector& slots, size_t block_size) { } Block::Block(const PBlock& pblock) { + CHECK(pblock.data_version() <= Block::max_data_version) + << "invalid pblock.data_version()=" << pblock.data_version() + << ", max_data_version=" << max_data_version; + const char* buf = nullptr; std::string compression_scratch; if (pblock.compressed()) { @@ -96,7 +101,7 @@ Block::Block(const PBlock& pblock) { for (const auto& pcol_meta : pblock.column_metas()) { DataTypePtr type = DataTypeFactory::instance().create_data_type(pcol_meta); MutableColumnPtr data_column = type->create_column(); - buf = type->deserialize(buf, data_column.get()); + buf = type->deserialize(buf, data_column.get(), pblock.data_version()); data.emplace_back(data_column->get_ptr(), type, pcol_meta.name()); } initialize_index_by_name(); @@ -573,7 +578,6 @@ DataTypes Block::get_data_types() const { } void Block::clear() { - info = BlockInfo(); data.clear(); index_by_name.clear(); } @@ -593,7 +597,6 @@ void Block::clear_column_data(int column_size) noexcept { } void Block::swap(Block& other) noexcept { - std::swap(info, other.info); data.swap(other.data); index_by_name.swap(other.index_by_name); } @@ -700,13 +703,19 @@ Status Block::serialize(PBlock* pblock, size_t* uncompressed_bytes, size_t* comp Status Block::serialize(PBlock* pblock, std::string* compressed_buffer, size_t* uncompressed_bytes, size_t* compressed_bytes, segment_v2::CompressionTypePB compression_type, bool allow_transfer_large_data) const { + CHECK(config::block_data_version <= Block::max_data_version) + << "invalid config::block_data_version=" << config::block_data_version + << ", max_data_version=" << max_data_version; + pblock->set_data_version(config::block_data_version); + // calc uncompressed size for allocation size_t content_uncompressed_size = 0; for (const auto& c : *this) { PColumnMeta* pcm = pblock->add_column_metas(); c.to_pb_column_meta(pcm); // get serialized size - content_uncompressed_size += c.type->get_uncompressed_serialized_bytes(*(c.column)); + content_uncompressed_size += + c.type->get_uncompressed_serialized_bytes(*(c.column), config::block_data_version); } // serialize data values @@ -726,7 +735,7 @@ Status Block::serialize(PBlock* pblock, std::string* compressed_buffer, size_t* char* buf = column_values->data(); for (const auto& c : *this) { - buf = c.type->serialize(*(c.column), buf); + buf = c.type->serialize(*(c.column), buf, config::block_data_version); } *uncompressed_bytes = content_uncompressed_size; diff --git a/be/src/vec/core/block.h b/be/src/vec/core/block.h index 75813df153..b9a147bcf7 100644 --- a/be/src/vec/core/block.h +++ b/be/src/vec/core/block.h @@ -32,7 +32,6 @@ #include "runtime/descriptors.h" #include "vec/columns/column.h" #include "vec/columns/column_nullable.h" -#include "vec/core/block_info.h" #include "vec/core/column_with_type_and_name.h" #include "vec/core/columns_with_type_and_name.h" #include "vec/core/names.h" @@ -71,7 +70,10 @@ private: mutable int64_t _compress_time_ns = 0; public: - BlockInfo info; + // When we have some breaking change for serialize/deserialize, we should update data_version. + constexpr static int max_data_version = 0; + // -1: not contain data_version. + // 0: remove ColumnString's terminating zero. Block() = default; Block(std::initializer_list il); diff --git a/be/src/vec/core/block_info.cpp b/be/src/vec/core/block_info.cpp deleted file mode 100644 index 3672a25b7b..0000000000 --- a/be/src/vec/core/block_info.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/ClickHouse/ClickHouse/blob/master/src/Core/BlockInfo.cpp -// and modified by Doris - -#include "vec/core/block_info.h" - -#include "vec/common/exception.h" -#include "vec/core/types.h" - -namespace doris::vectorized { - -void BlockMissingValues::set_bit(size_t column_idx, size_t row_idx) { - RowsBitMask& mask = rows_mask_by_column_id[column_idx]; - mask.resize(row_idx + 1); - mask[row_idx] = true; -} - -const BlockMissingValues::RowsBitMask& BlockMissingValues::get_defaults_bitmask( - size_t column_idx) const { - static RowsBitMask none; - auto it = rows_mask_by_column_id.find(column_idx); - if (it != rows_mask_by_column_id.end()) return it->second; - return none; -} - -} // namespace doris::vectorized diff --git a/be/src/vec/core/block_info.h b/be/src/vec/core/block_info.h deleted file mode 100644 index 2ab7e2d5ef..0000000000 --- a/be/src/vec/core/block_info.h +++ /dev/null @@ -1,74 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/ClickHouse/ClickHouse/blob/master/src/Core/BlockInfo.h -// and modified by Doris - -#pragma once - -#include - -#include "vec/core/types.h" - -namespace doris::vectorized { - -/** More information about the block. - */ -struct BlockInfo { - /** is_overflows: - * After running GROUP BY ... WITH TOTALS with the max_rows_to_group_by and group_by_overflow_mode = 'any' settings, - * a row is inserted in the separate block with aggregated values that have not passed max_rows_to_group_by. - * If it is such a block, then is_overflows is set to true for it. - */ - - /** bucket_num: - * When using the two-level aggregation method, data with different key groups are scattered across different buckets. - * In this case, the bucket number is indicated here. It is used to optimize the merge for distributed aggregation. - * Otherwise -1. - */ - -#define APPLY_FOR_BLOCK_INFO_FIELDS(M) \ - M(bool, is_overflows, false, 1) \ - M(Int32, bucket_num, -1, 2) - -#define DECLARE_FIELD_VEC(TYPE, NAME, DEFAULT, FIELD_NUM) TYPE NAME = DEFAULT; - - APPLY_FOR_BLOCK_INFO_FIELDS(DECLARE_FIELD_VEC) - -#undef DECLARE_FIELD_VEC -}; - -/// Block extention to support delayed defaults. AddingDefaultsBlockInputStream uses it to replace missing values with column defaults. -class BlockMissingValues { -public: - using RowsBitMask = std::vector; /// a bit per row for a column - - const RowsBitMask& get_defaults_bitmask(size_t column_idx) const; - void set_bit(size_t column_idx, size_t row_idx); - bool empty() const { return rows_mask_by_column_id.empty(); } - size_t size() const { return rows_mask_by_column_id.size(); } - void clear() { rows_mask_by_column_id.clear(); } - -private: - using RowsMaskByColumnId = std::unordered_map; - - /// If rows_mask_by_column_id[column_id][row_id] is true related value in Block should be replaced with column default. - /// It could contain less columns and rows then related block. - RowsMaskByColumnId rows_mask_by_column_id; -}; - -} // namespace doris::vectorized diff --git a/be/src/vec/data_types/data_type.h b/be/src/vec/data_types/data_type.h index ccb3d1fce0..979112cd17 100644 --- a/be/src/vec/data_types/data_type.h +++ b/be/src/vec/data_types/data_type.h @@ -24,7 +24,6 @@ #include #include "gen_cpp/data.pb.h" -#include "runtime/primitive_type.h" #include "vec/common/cow.h" #include "vec/common/string_buffer.hpp" #include "vec/core/types.h" @@ -236,9 +235,10 @@ public: /// Updates avg_value_size_hint for newly read column. Uses to optimize deserialization. Zero expected for first column. static void update_avg_value_size_hint(const IColumn& column, double& avg_value_size_hint); - virtual int64_t get_uncompressed_serialized_bytes(const IColumn& column) const = 0; - virtual char* serialize(const IColumn& column, char* buf) const = 0; - virtual const char* deserialize(const char* buf, IColumn* column) const = 0; + virtual int64_t get_uncompressed_serialized_bytes(const IColumn& column, + int data_version) const = 0; + virtual char* serialize(const IColumn& column, char* buf, int data_version) const = 0; + virtual const char* deserialize(const char* buf, IColumn* column, int data_version) const = 0; virtual void to_pb_column_meta(PColumnMeta* col_meta) const; diff --git a/be/src/vec/data_types/data_type_array.cpp b/be/src/vec/data_types/data_type_array.cpp index 851e957d7e..a656e0fc81 100644 --- a/be/src/vec/data_types/data_type_array.cpp +++ b/be/src/vec/data_types/data_type_array.cpp @@ -21,10 +21,9 @@ #include "vec/data_types/data_type_array.h" #include "gen_cpp/data.pb.h" +#include "vec/columns/column_array.h" #include "vec/columns/column_nullable.h" -#include "vec/common/string_utils/string_utils.h" #include "vec/data_types/data_type_nullable.h" -#include "vec/io/io_helper.h" namespace doris::vectorized { @@ -55,14 +54,16 @@ size_t DataTypeArray::get_number_of_dimensions() const { ->get_number_of_dimensions(); /// Every modern C++ compiler optimizes tail recursion. } -int64_t DataTypeArray::get_uncompressed_serialized_bytes(const IColumn& column) const { +int64_t DataTypeArray::get_uncompressed_serialized_bytes(const IColumn& column, + int data_version) const { auto ptr = column.convert_to_full_column_if_const(); const auto& data_column = assert_cast(*ptr.get()); return sizeof(ColumnArray::Offset64) * (column.size() + 1) + - get_nested_type()->get_uncompressed_serialized_bytes(data_column.get_data()); + get_nested_type()->get_uncompressed_serialized_bytes(data_column.get_data(), + data_version); } -char* DataTypeArray::serialize(const IColumn& column, char* buf) const { +char* DataTypeArray::serialize(const IColumn& column, char* buf, int data_version) const { auto ptr = column.convert_to_full_column_if_const(); const auto& data_column = assert_cast(*ptr.get()); @@ -73,10 +74,10 @@ char* DataTypeArray::serialize(const IColumn& column, char* buf) const { memcpy(buf, data_column.get_offsets().data(), column.size() * sizeof(ColumnArray::Offset64)); buf += column.size() * sizeof(ColumnArray::Offset64); // children - return get_nested_type()->serialize(data_column.get_data(), buf); + return get_nested_type()->serialize(data_column.get_data(), buf, data_version); } -const char* DataTypeArray::deserialize(const char* buf, IColumn* column) const { +const char* DataTypeArray::deserialize(const char* buf, IColumn* column, int data_version) const { auto* data_column = assert_cast(column); auto& offsets = data_column->get_offsets(); @@ -88,7 +89,8 @@ const char* DataTypeArray::deserialize(const char* buf, IColumn* column) const { memcpy(offsets.data(), buf, sizeof(ColumnArray::Offset64) * row_num); buf += sizeof(ColumnArray::Offset64) * row_num; // children - return get_nested_type()->deserialize(buf, data_column->get_data_ptr()->assume_mutable()); + return get_nested_type()->deserialize(buf, data_column->get_data_ptr()->assume_mutable(), + data_version); } void DataTypeArray::to_pb_column_meta(PColumnMeta* col_meta) const { diff --git a/be/src/vec/data_types/data_type_array.h b/be/src/vec/data_types/data_type_array.h index c87298b1f0..fc122f2636 100644 --- a/be/src/vec/data_types/data_type_array.h +++ b/be/src/vec/data_types/data_type_array.h @@ -20,7 +20,6 @@ #pragma once -#include "vec/columns/column_array.h" #include "vec/data_types/data_type.h" namespace doris::vectorized { @@ -73,9 +72,10 @@ public: /// 1 for plain array, 2 for array of arrays and so on. size_t get_number_of_dimensions() const; - int64_t get_uncompressed_serialized_bytes(const IColumn& column) const override; - char* serialize(const IColumn& column, char* buf) const override; - const char* deserialize(const char* buf, IColumn* column) const override; + int64_t get_uncompressed_serialized_bytes(const IColumn& column, + int data_version) const override; + char* serialize(const IColumn& column, char* buf, int data_version) const override; + const char* deserialize(const char* buf, IColumn* column, int data_version) const override; void to_pb_column_meta(PColumnMeta* col_meta) const override; diff --git a/be/src/vec/data_types/data_type_bitmap.cpp b/be/src/vec/data_types/data_type_bitmap.cpp index 5c49f9c65c..2ffb37ebbd 100644 --- a/be/src/vec/data_types/data_type_bitmap.cpp +++ b/be/src/vec/data_types/data_type_bitmap.cpp @@ -26,7 +26,8 @@ namespace doris::vectorized { // binary: | // : row num | bitmap1 size | bitmap2 size | ... // : bitmap1 | bitmap2 | ... -int64_t DataTypeBitMap::get_uncompressed_serialized_bytes(const IColumn& column) const { +int64_t DataTypeBitMap::get_uncompressed_serialized_bytes(const IColumn& column, + int data_version) const { auto ptr = column.convert_to_full_column_if_const(); auto& data_column = assert_cast(*ptr); @@ -40,7 +41,7 @@ int64_t DataTypeBitMap::get_uncompressed_serialized_bytes(const IColumn& column) return allocate_len_size + allocate_content_size; } -char* DataTypeBitMap::serialize(const IColumn& column, char* buf) const { +char* DataTypeBitMap::serialize(const IColumn& column, char* buf, int data_version) const { auto ptr = column.convert_to_full_column_if_const(); auto& data_column = assert_cast(*ptr); @@ -63,7 +64,7 @@ char* DataTypeBitMap::serialize(const IColumn& column, char* buf) const { return data_ptr; } -const char* DataTypeBitMap::deserialize(const char* buf, IColumn* column) const { +const char* DataTypeBitMap::deserialize(const char* buf, IColumn* column, int data_version) const { auto& data_column = assert_cast(*column); auto& data = data_column.get_data(); diff --git a/be/src/vec/data_types/data_type_bitmap.h b/be/src/vec/data_types/data_type_bitmap.h index ba8e96be75..5292604bab 100644 --- a/be/src/vec/data_types/data_type_bitmap.h +++ b/be/src/vec/data_types/data_type_bitmap.h @@ -21,7 +21,6 @@ #include "vec/columns/column_complex.h" #include "vec/core/types.h" #include "vec/data_types/data_type.h" -#include "vec/data_types/data_type_hll.h" namespace doris::vectorized { class DataTypeBitMap : public IDataType { @@ -37,9 +36,10 @@ public: TypeIndex get_type_id() const override { return TypeIndex::BitMap; } - int64_t get_uncompressed_serialized_bytes(const IColumn& column) const override; - char* serialize(const IColumn& column, char* buf) const override; - const char* deserialize(const char* buf, IColumn* column) const override; + int64_t get_uncompressed_serialized_bytes(const IColumn& column, + int data_version) const override; + char* serialize(const IColumn& column, char* buf, int data_version) const override; + const char* deserialize(const char* buf, IColumn* column, int data_version) const override; MutableColumnPtr create_column() const override; diff --git a/be/src/vec/data_types/data_type_decimal.cpp b/be/src/vec/data_types/data_type_decimal.cpp index 7a50eb487e..e41825ccef 100644 --- a/be/src/vec/data_types/data_type_decimal.cpp +++ b/be/src/vec/data_types/data_type_decimal.cpp @@ -86,12 +86,13 @@ Status DataTypeDecimal::from_string(ReadBuffer& rb, IColumn* column) const { // binary: row_num | value1 | value2 | ... template -int64_t DataTypeDecimal::get_uncompressed_serialized_bytes(const IColumn& column) const { +int64_t DataTypeDecimal::get_uncompressed_serialized_bytes(const IColumn& column, + int data_version) const { return sizeof(uint32_t) + column.size() * sizeof(FieldType); } template -char* DataTypeDecimal::serialize(const IColumn& column, char* buf) const { +char* DataTypeDecimal::serialize(const IColumn& column, char* buf, int data_version) const { // row num const auto row_num = column.size(); *reinterpret_cast(buf) = row_num; @@ -105,7 +106,8 @@ char* DataTypeDecimal::serialize(const IColumn& column, char* buf) const { } template -const char* DataTypeDecimal::deserialize(const char* buf, IColumn* column) const { +const char* DataTypeDecimal::deserialize(const char* buf, IColumn* column, + int data_version) const { // row num uint32_t row_num = *reinterpret_cast(buf); buf += sizeof(uint32_t); diff --git a/be/src/vec/data_types/data_type_decimal.h b/be/src/vec/data_types/data_type_decimal.h index 3aa85aabeb..a6c9696dfc 100644 --- a/be/src/vec/data_types/data_type_decimal.h +++ b/be/src/vec/data_types/data_type_decimal.h @@ -131,9 +131,10 @@ public: std::string do_get_name() const override; TypeIndex get_type_id() const override { return TypeId::value; } - int64_t get_uncompressed_serialized_bytes(const IColumn& column) const override; - char* serialize(const IColumn& column, char* buf) const override; - const char* deserialize(const char* buf, IColumn* column) const override; + int64_t get_uncompressed_serialized_bytes(const IColumn& column, + int data_version) const override; + char* serialize(const IColumn& column, char* buf, int data_version) const override; + const char* deserialize(const char* buf, IColumn* column, int data_version) const override; void to_pb_column_meta(PColumnMeta* col_meta) const override; diff --git a/be/src/vec/data_types/data_type_fixed_length_object.cpp b/be/src/vec/data_types/data_type_fixed_length_object.cpp index d96945b2cd..562c98d02d 100644 --- a/be/src/vec/data_types/data_type_fixed_length_object.cpp +++ b/be/src/vec/data_types/data_type_fixed_length_object.cpp @@ -17,11 +17,10 @@ #include "vec/data_types/data_type_fixed_length_object.h" -#include "vec/aggregate_functions/aggregate_function_avg.h" - namespace doris::vectorized { -char* DataTypeFixedLengthObject::serialize(const IColumn& column, char* buf) const { +char* DataTypeFixedLengthObject::serialize(const IColumn& column, char* buf, + int data_version) const { // row num const auto row_num = column.size(); *reinterpret_cast(buf) = row_num; @@ -40,7 +39,8 @@ char* DataTypeFixedLengthObject::serialize(const IColumn& column, char* buf) con return buf; } -const char* DataTypeFixedLengthObject::deserialize(const char* buf, IColumn* column) const { +const char* DataTypeFixedLengthObject::deserialize(const char* buf, IColumn* column, + int data_version) const { // row num uint32_t row_num = *reinterpret_cast(buf); buf += sizeof(uint32_t); diff --git a/be/src/vec/data_types/data_type_fixed_length_object.h b/be/src/vec/data_types/data_type_fixed_length_object.h index 5d346b297d..fe581584ba 100644 --- a/be/src/vec/data_types/data_type_fixed_length_object.h +++ b/be/src/vec/data_types/data_type_fixed_length_object.h @@ -18,7 +18,6 @@ #pragma once #include "vec/columns/column_fixed_length_object.h" -#include "vec/common/typeid_cast.h" #include "vec/data_types/data_type.h" namespace doris::vectorized { @@ -39,13 +38,14 @@ public: bool equals(const IDataType& rhs) const override { return typeid(rhs) == typeid(*this); } - int64_t get_uncompressed_serialized_bytes(const IColumn& column) const override { + int64_t get_uncompressed_serialized_bytes(const IColumn& column, + int data_version) const override { return static_cast(column).byte_size() + sizeof(uint32_t) + sizeof(size_t); } - char* serialize(const IColumn& column, char* buf) const override; - const char* deserialize(const char* buf, IColumn* column) const override; + char* serialize(const IColumn& column, char* buf, int data_version) const override; + const char* deserialize(const char* buf, IColumn* column, int data_version) const override; MutableColumnPtr create_column() const override; diff --git a/be/src/vec/data_types/data_type_hll.cpp b/be/src/vec/data_types/data_type_hll.cpp index fde7a2a23b..55bfd3a463 100644 --- a/be/src/vec/data_types/data_type_hll.cpp +++ b/be/src/vec/data_types/data_type_hll.cpp @@ -26,7 +26,7 @@ namespace doris::vectorized { // Two part of binary: + | // first: row num | hll1 size | hll2 size | ... // second: hll1 | hll2 | ... -char* DataTypeHLL::serialize(const IColumn& column, char* buf) const { +char* DataTypeHLL::serialize(const IColumn& column, char* buf, int data_version) const { auto ptr = column.convert_to_full_column_if_const(); auto& data_column = assert_cast(*ptr); @@ -52,7 +52,7 @@ char* DataTypeHLL::serialize(const IColumn& column, char* buf) const { // Two part of binary: + | // first: row num | hll1 size | hll2 size | ... // second: hll1 | hll2 | ... -const char* DataTypeHLL::deserialize(const char* buf, IColumn* column) const { +const char* DataTypeHLL::deserialize(const char* buf, IColumn* column, int data_version) const { auto& data_column = assert_cast(*column); auto& data = data_column.get_data(); @@ -71,7 +71,8 @@ const char* DataTypeHLL::deserialize(const char* buf, IColumn* column) const { return buf; } -int64_t DataTypeHLL::get_uncompressed_serialized_bytes(const IColumn& column) const { +int64_t DataTypeHLL::get_uncompressed_serialized_bytes(const IColumn& column, + int data_version) const { auto ptr = column.convert_to_full_column_if_const(); auto& data_column = assert_cast(*ptr); diff --git a/be/src/vec/data_types/data_type_hll.h b/be/src/vec/data_types/data_type_hll.h index f63890e754..15f56dc575 100644 --- a/be/src/vec/data_types/data_type_hll.h +++ b/be/src/vec/data_types/data_type_hll.h @@ -36,9 +36,10 @@ public: TypeIndex get_type_id() const override { return TypeIndex::HLL; } - int64_t get_uncompressed_serialized_bytes(const IColumn& column) const override; - char* serialize(const IColumn& column, char* buf) const override; - const char* deserialize(const char* buf, IColumn* column) const override; + int64_t get_uncompressed_serialized_bytes(const IColumn& column, + int data_version) const override; + char* serialize(const IColumn& column, char* buf, int data_version) const override; + const char* deserialize(const char* buf, IColumn* column, int data_version) const override; MutableColumnPtr create_column() const override; bool get_is_parametric() const override { return false; } diff --git a/be/src/vec/data_types/data_type_nothing.cpp b/be/src/vec/data_types/data_type_nothing.cpp index 9cd1bc84c1..88aa777ae3 100644 --- a/be/src/vec/data_types/data_type_nothing.cpp +++ b/be/src/vec/data_types/data_type_nothing.cpp @@ -20,9 +20,7 @@ #include "vec/data_types/data_type_nothing.h" -#include "gen_cpp/data.pb.h" #include "vec/columns/column_nothing.h" -#include "vec/common/typeid_cast.h" namespace doris::vectorized { @@ -30,11 +28,11 @@ MutableColumnPtr DataTypeNothing::create_column() const { return ColumnNothing::create(0); } -char* DataTypeNothing::serialize(const IColumn& column, char* buf) const { +char* DataTypeNothing::serialize(const IColumn& column, char* buf, int data_version) const { return buf; } -const char* DataTypeNothing::deserialize(const char* buf, IColumn* column) const { +const char* DataTypeNothing::deserialize(const char* buf, IColumn* column, int data_version) const { return buf; } diff --git a/be/src/vec/data_types/data_type_nothing.h b/be/src/vec/data_types/data_type_nothing.h index 8a7d1e63f3..6e168fd55d 100644 --- a/be/src/vec/data_types/data_type_nothing.h +++ b/be/src/vec/data_types/data_type_nothing.h @@ -48,9 +48,12 @@ public: size_t get_size_of_value_in_memory() const override { return 0; } bool can_be_inside_nullable() const override { return true; } - int64_t get_uncompressed_serialized_bytes(const IColumn& column) const override { return 0; } - char* serialize(const IColumn& column, char* buf) const override; - const char* deserialize(const char* buf, IColumn* column) const override; + int64_t get_uncompressed_serialized_bytes(const IColumn& column, + int data_version) const override { + return 0; + } + char* serialize(const IColumn& column, char* buf, int data_version) const override; + const char* deserialize(const char* buf, IColumn* column, int data_version) const override; [[noreturn]] Field get_default() const override { LOG(FATAL) << "Method get_default() is not implemented for data type " << get_name(); diff --git a/be/src/vec/data_types/data_type_nullable.cpp b/be/src/vec/data_types/data_type_nullable.cpp index 41a1abc0ce..ff3034d25c 100644 --- a/be/src/vec/data_types/data_type_nullable.cpp +++ b/be/src/vec/data_types/data_type_nullable.cpp @@ -20,7 +20,6 @@ #include "vec/data_types/data_type_nullable.h" -#include "common/logging.h" #include "gen_cpp/data.pb.h" #include "vec/columns/column_nullable.h" #include "vec/common/assert_cast.h" @@ -86,16 +85,18 @@ Status DataTypeNullable::from_string(ReadBuffer& rb, IColumn* column) const { // binary: row num | | // : is_null1 | is_null2 | ... // : value1 | value2 | ...> -int64_t DataTypeNullable::get_uncompressed_serialized_bytes(const IColumn& column) const { +int64_t DataTypeNullable::get_uncompressed_serialized_bytes(const IColumn& column, + int data_version) const { int64_t size = sizeof(uint32_t); size += sizeof(bool) * column.size(); size += nested_data_type->get_uncompressed_serialized_bytes( assert_cast(*column.convert_to_full_column_if_const()) - .get_nested_column()); + .get_nested_column(), + data_version); return size; } -char* DataTypeNullable::serialize(const IColumn& column, char* buf) const { +char* DataTypeNullable::serialize(const IColumn& column, char* buf, int data_version) const { auto ptr = column.convert_to_full_column_if_const(); const ColumnNullable& col = assert_cast(*ptr.get()); @@ -106,10 +107,11 @@ char* DataTypeNullable::serialize(const IColumn& column, char* buf) const { memcpy(buf, col.get_null_map_data().data(), column.size() * sizeof(bool)); buf += column.size() * sizeof(bool); // data values - return nested_data_type->serialize(col.get_nested_column(), buf); + return nested_data_type->serialize(col.get_nested_column(), buf, data_version); } -const char* DataTypeNullable::deserialize(const char* buf, IColumn* column) const { +const char* DataTypeNullable::deserialize(const char* buf, IColumn* column, + int data_version) const { ColumnNullable* col = assert_cast(column); // row num uint32_t row_num = *reinterpret_cast(buf); @@ -120,7 +122,7 @@ const char* DataTypeNullable::deserialize(const char* buf, IColumn* column) cons buf += row_num * sizeof(bool); // data values IColumn& nested = col->get_nested_column(); - return nested_data_type->deserialize(buf, &nested); + return nested_data_type->deserialize(buf, &nested, data_version); } void DataTypeNullable::to_pb_column_meta(PColumnMeta* col_meta) const { diff --git a/be/src/vec/data_types/data_type_nullable.h b/be/src/vec/data_types/data_type_nullable.h index 1f51fd1789..4f96ab6fab 100644 --- a/be/src/vec/data_types/data_type_nullable.h +++ b/be/src/vec/data_types/data_type_nullable.h @@ -36,9 +36,10 @@ public: const char* get_family_name() const override { return "Nullable"; } TypeIndex get_type_id() const override { return TypeIndex::Nullable; } - int64_t get_uncompressed_serialized_bytes(const IColumn& column) const override; - char* serialize(const IColumn& column, char* buf) const override; - const char* deserialize(const char* buf, IColumn* column) const override; + int64_t get_uncompressed_serialized_bytes(const IColumn& column, + int data_version) const override; + char* serialize(const IColumn& column, char* buf, int data_version) const override; + const char* deserialize(const char* buf, IColumn* column, int data_version) const override; void to_pb_column_meta(PColumnMeta* col_meta) const override; diff --git a/be/src/vec/data_types/data_type_number_base.cpp b/be/src/vec/data_types/data_type_number_base.cpp index b3052e9043..28d1cf3886 100644 --- a/be/src/vec/data_types/data_type_number_base.cpp +++ b/be/src/vec/data_types/data_type_number_base.cpp @@ -101,12 +101,13 @@ std::string DataTypeNumberBase::to_string(const IColumn& column, size_t row_n // binary: row num | value1 | value2 | ... template -int64_t DataTypeNumberBase::get_uncompressed_serialized_bytes(const IColumn& column) const { +int64_t DataTypeNumberBase::get_uncompressed_serialized_bytes(const IColumn& column, + int data_version) const { return sizeof(uint32_t) + column.size() * sizeof(FieldType); } template -char* DataTypeNumberBase::serialize(const IColumn& column, char* buf) const { +char* DataTypeNumberBase::serialize(const IColumn& column, char* buf, int data_version) const { // row num const auto row_num = column.size(); *reinterpret_cast(buf) = row_num; @@ -121,7 +122,8 @@ char* DataTypeNumberBase::serialize(const IColumn& column, char* buf) const { } template -const char* DataTypeNumberBase::deserialize(const char* buf, IColumn* column) const { +const char* DataTypeNumberBase::deserialize(const char* buf, IColumn* column, + int data_version) const { // row num uint32_t row_num = *reinterpret_cast(buf); buf += sizeof(uint32_t); diff --git a/be/src/vec/data_types/data_type_number_base.h b/be/src/vec/data_types/data_type_number_base.h index 14fbfb3d07..356f107174 100644 --- a/be/src/vec/data_types/data_type_number_base.h +++ b/be/src/vec/data_types/data_type_number_base.h @@ -21,8 +21,6 @@ #pragma once #include "vec/columns/column_vector.h" -#include "vec/common/assert_cast.h" -#include "vec/common/string_ref.h" #include "vec/core/types.h" #include "vec/data_types/data_type.h" @@ -43,9 +41,10 @@ public: TypeIndex get_type_id() const override { return TypeId::value; } Field get_default() const override; - int64_t get_uncompressed_serialized_bytes(const IColumn& column) const override; - char* serialize(const IColumn& column, char* buf) const override; - const char* deserialize(const char* buf, IColumn* column) const override; + int64_t get_uncompressed_serialized_bytes(const IColumn& column, + int data_version) const override; + char* serialize(const IColumn& column, char* buf, int data_version) const override; + const char* deserialize(const char* buf, IColumn* column, int data_version) const override; MutableColumnPtr create_column() const override; diff --git a/be/src/vec/data_types/data_type_string.cpp b/be/src/vec/data_types/data_type_string.cpp index b319ae3e93..68a5117b47 100644 --- a/be/src/vec/data_types/data_type_string.cpp +++ b/be/src/vec/data_types/data_type_string.cpp @@ -20,12 +20,11 @@ #include "vec/data_types/data_type_string.h" -#include "gen_cpp/data.pb.h" -#include "vec/columns/column_const.h" +#include + #include "vec/columns/column_string.h" #include "vec/common/assert_cast.h" #include "vec/core/field.h" -#include "vec/io/io_helper.h" #ifdef __SSE2__ #include @@ -42,7 +41,6 @@ static inline void read(IColumn& column, Reader&& reader) { size_t old_offsets_size = offsets.size(); try { reader(data); - data.push_back(0); offsets.push_back(data.size()); } catch (...) { offsets.resize_assume_reserved(old_offsets_size); @@ -85,17 +83,47 @@ bool DataTypeString::equals(const IDataType& rhs) const { // binary: | total length | // : row num | offset1 |offset2 | ... // : | (*ptr.get()); + + if (data_version == -1) { + return sizeof(IColumn::Offset) * (column.size() + 1) + sizeof(uint64_t) + + data_column.get_chars().size() + column.size(); + } + return sizeof(IColumn::Offset) * (column.size() + 1) + sizeof(uint64_t) + data_column.get_chars().size(); } -char* DataTypeString::serialize(const IColumn& column, char* buf) const { +char* DataTypeString::serialize(const IColumn& column, char* buf, int data_version) const { auto ptr = column.convert_to_full_column_if_const(); const auto& data_column = assert_cast(*ptr.get()); + if (data_version == -1) { + // row num + *reinterpret_cast(buf) = column.size(); + buf += sizeof(IColumn::Offset); + // offsets + for (int i = 0; i < column.size(); i++) { + *reinterpret_cast(buf) = data_column.get_offsets()[i] + i + 1; + buf += sizeof(IColumn::Offset); + } + // total length + *reinterpret_cast(buf) = data_column.get_chars().size() + column.size(); + buf += sizeof(uint64_t); + // values + for (int i = 0; i < column.size(); i++) { + auto data = data_column.get_data_at(i); + memcpy(buf, data.data, data.size); + buf += data.size; + *buf = '\0'; + buf++; + } + return buf; + } + // row num *reinterpret_cast(buf) = column.size(); buf += sizeof(IColumn::Offset); @@ -113,11 +141,34 @@ char* DataTypeString::serialize(const IColumn& column, char* buf) const { return buf; } -const char* DataTypeString::deserialize(const char* buf, IColumn* column) const { +const char* DataTypeString::deserialize(const char* buf, IColumn* column, int data_version) const { ColumnString* column_string = assert_cast(column); ColumnString::Chars& data = column_string->get_chars(); ColumnString::Offsets& offsets = column_string->get_offsets(); + if (data_version == -1) { + // row num + IColumn::Offset row_num = *reinterpret_cast(buf); + buf += sizeof(IColumn::Offset); + // offsets + offsets.resize(row_num); + for (int i = 0; i < row_num; i++) { + offsets[i] = *reinterpret_cast(buf) - i - 1; + buf += sizeof(IColumn::Offset); + } + // total length + uint64_t value_len = *reinterpret_cast(buf); + buf += sizeof(uint64_t); + // values + data.resize(value_len - row_num); + for (int i = 0; i < row_num; i++) { + memcpy(data.data() + offsets[i - 1], buf, offsets[i] - offsets[i - 1]); + buf += offsets[i] - offsets[i - 1] + 1; + } + + return buf; + } + // row num IColumn::Offset row_num = *reinterpret_cast(buf); buf += sizeof(IColumn::Offset); diff --git a/be/src/vec/data_types/data_type_string.h b/be/src/vec/data_types/data_type_string.h index 1ecf86f2fe..0c0e94512e 100644 --- a/be/src/vec/data_types/data_type_string.h +++ b/be/src/vec/data_types/data_type_string.h @@ -36,9 +36,10 @@ public: TypeIndex get_type_id() const override { return TypeIndex::String; } - int64_t get_uncompressed_serialized_bytes(const IColumn& column) const override; - char* serialize(const IColumn& column, char* buf) const override; - const char* deserialize(const char* buf, IColumn* column) const override; + int64_t get_uncompressed_serialized_bytes(const IColumn& column, + int data_version) const override; + char* serialize(const IColumn& column, char* buf, int data_version) const override; + const char* deserialize(const char* buf, IColumn* column, int data_version) const override; MutableColumnPtr create_column() const override; diff --git a/be/src/vec/exec/vaggregation_node.h b/be/src/vec/exec/vaggregation_node.h index c413246d66..ddacf96777 100644 --- a/be/src/vec/exec/vaggregation_node.h +++ b/be/src/vec/exec/vaggregation_node.h @@ -25,6 +25,7 @@ #include "vec/aggregate_functions/aggregate_function.h" #include "vec/common/columns_hashing.h" #include "vec/common/hash_table/fixed_hash_map.h" +#include "vec/common/hash_table/string_hash_map.h" #include "vec/exprs/vectorized_agg_fn.h" #include "vec/exprs/vslot_ref.h" diff --git a/be/src/vec/functions/array/function_array_distinct.h b/be/src/vec/functions/array/function_array_distinct.h index 77e997aba6..abbdcb9403 100644 --- a/be/src/vec/functions/array/function_array_distinct.h +++ b/be/src/vec/functions/array/function_array_distinct.h @@ -20,15 +20,10 @@ #pragma once #include "vec/columns/column_array.h" -#include "vec/columns/column_const.h" #include "vec/common/hash_table/hash_set.h" #include "vec/common/hash_table/hash_table.h" -#include "vec/common/sip_hash.h" #include "vec/data_types/data_type_array.h" -#include "vec/data_types/data_type_number.h" #include "vec/functions/function.h" -#include "vec/functions/function_helpers.h" -#include "vec/io/io_helper.h" namespace doris::vectorized { @@ -199,13 +194,12 @@ private: set.insert(src_str_ref); // copy the src data to column_string_chars const size_t old_size = column_string_chars.size(); - const size_t new_size = old_size + src_str_ref.size + 1; + const size_t new_size = old_size + src_str_ref.size; column_string_chars.resize(new_size); if (src_str_ref.size > 0) { memcpy(column_string_chars.data() + old_size, src_str_ref.data, src_str_ref.size); } - column_string_chars[old_size + src_str_ref.size] = 0; column_string_offsets.push_back(new_size); if (dest_null_map) { diff --git a/be/src/vec/functions/array/function_array_sort.h b/be/src/vec/functions/array/function_array_sort.h index 87fa684b13..90bd678c5e 100644 --- a/be/src/vec/functions/array/function_array_sort.h +++ b/be/src/vec/functions/array/function_array_sort.h @@ -20,11 +20,8 @@ #pragma once #include "vec/columns/column_array.h" -#include "vec/columns/column_const.h" #include "vec/data_types/data_type_array.h" -#include "vec/data_types/data_type_number.h" #include "vec/functions/function.h" -#include "vec/functions/function_helpers.h" namespace doris::vectorized { @@ -219,13 +216,12 @@ private: StringRef src_str_ref = src_data_concrete->get_data_at(permutation[j]); // copy the src data to column_string_chars const size_t old_size = column_string_chars.size(); - const size_t new_size = old_size + src_str_ref.size + 1; + const size_t new_size = old_size + src_str_ref.size; column_string_chars.resize(new_size); if (src_str_ref.size > 0) { memcpy(column_string_chars.data() + old_size, src_str_ref.data, src_str_ref.size); } - column_string_chars[old_size + src_str_ref.size] = 0; column_string_offsets.push_back(new_size); if (dest_null_map) { diff --git a/be/src/vec/functions/date_time_transforms.h b/be/src/vec/functions/date_time_transforms.h index 5d96f73eea..6f733cc218 100644 --- a/be/src/vec/functions/date_time_transforms.h +++ b/be/src/vec/functions/date_time_transforms.h @@ -21,14 +21,15 @@ #pragma once #include "common/status.h" -#include "runtime/datetime_value.h" #include "util/binary_cast.hpp" +#include "vec/columns/column_nullable.h" #include "vec/columns/column_string.h" #include "vec/columns/column_vector.h" -#include "vec/common/exception.h" +#include "vec/core/block.h" +#include "vec/core/column_numbers.h" #include "vec/core/types.h" #include "vec/data_types/data_type_date_time.h" -#include "vec/functions/function_helpers.h" +#include "vec/data_types/data_type_string.h" #include "vec/runtime/vdatetime_value.h" namespace doris::vectorized { @@ -154,14 +155,10 @@ struct DayNameImpl { size_t& offset, bool& is_null) { const auto* day_name = dt.day_name(); is_null = !dt.is_valid_date(); - if (day_name == nullptr || is_null) { - offset += 1; - res_data[offset - 1] = 0; - } else { + if (day_name != nullptr && !is_null) { auto len = strlen(day_name); memcpy(&res_data[offset], day_name, len); - offset += len + 1; - res_data[offset - 1] = 0; + offset += len; } return offset; } @@ -187,14 +184,10 @@ struct MonthNameImpl { size_t& offset, bool& is_null) { const auto* month_name = dt.month_name(); is_null = !dt.is_valid_date(); - if (month_name == nullptr || is_null) { - offset += 1; - res_data[offset - 1] = 0; - } else { + if (month_name != nullptr && !is_null) { auto len = strlen(month_name); memcpy(&res_data[offset], month_name, len); - offset += (len + 1); - res_data[offset - 1] = 0; + offset += len; } return offset; } @@ -220,18 +213,14 @@ struct DateFormatImpl { size_t& offset) { const auto& dt = (DateType&)t; if (format.size > 128) { - offset += 1; - res_data.emplace_back(0); return std::pair {offset, true}; } char buf[128]; if (!dt.to_format_string(format.data, format.size, buf)) { - offset += 1; - res_data.emplace_back(0); return std::pair {offset, true}; } - auto len = strlen(buf) + 1; + auto len = strlen(buf); res_data.insert(buf, buf + len); offset += len; return std::pair {offset, false}; @@ -274,19 +263,15 @@ struct FromUnixTimeImpl { DateType dt; if (format.size > 128 || val < 0 || val > INT_MAX || !dt.from_unixtime(val, time_zone)) { - offset += 1; - res_data.emplace_back(0); return std::pair {offset, true}; } char buf[128]; if (!dt.to_format_string(format.data, format.size, buf)) { - offset += 1; - res_data.emplace_back(0); return std::pair {offset, true}; } - auto len = strlen(buf) + 1; + auto len = strlen(buf); res_data.insert(buf, buf + len); offset += len; return std::pair {offset, false}; diff --git a/be/src/vec/functions/function_bit.cpp b/be/src/vec/functions/function_bit.cpp index 7af28601a0..002341bcb2 100644 --- a/be/src/vec/functions/function_bit.cpp +++ b/be/src/vec/functions/function_bit.cpp @@ -94,7 +94,7 @@ struct BitLengthImpl { auto size = offsets.size(); res.resize(size); for (int i = 0; i < size; ++i) { - int str_size = offsets[i] - offsets[i - 1] - 1; + int str_size = offsets[i] - offsets[i - 1]; res[i] = (str_size * 8); } return Status::OK(); diff --git a/be/src/vec/functions/function_bitmap.cpp b/be/src/vec/functions/function_bitmap.cpp index 0d2f85ee55..b10974d876 100644 --- a/be/src/vec/functions/function_bitmap.cpp +++ b/be/src/vec/functions/function_bitmap.cpp @@ -63,7 +63,7 @@ struct ToBitmap { continue; } else { const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); - size_t str_size = offsets[i] - offsets[i - 1] - 1; + size_t str_size = offsets[i] - offsets[i - 1]; StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS; uint64_t int_value = StringParser::string_to_unsigned_int( raw_str, str_size, &parse_result); @@ -85,7 +85,7 @@ struct BitmapFromString { std::vector bits; for (size_t i = 0; i < size; ++i) { const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); - int64_t str_size = offsets[i] - offsets[i - 1] - 1; + int64_t str_size = offsets[i] - offsets[i - 1]; if ((str_size > INT32_MAX) || !(SplitStringAndParse({raw_str, (int)str_size}, ",", &safe_strtou64, &bits))) { @@ -153,7 +153,7 @@ struct BitmapHash { for (size_t i = 0; i < size; ++i) { const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); - size_t str_size = offsets[i] - offsets[i - 1] - 1; + size_t str_size = offsets[i] - offsets[i - 1]; uint32_t hash_value = HashUtil::murmur_hash3_32(raw_str, str_size, HashUtil::MURMUR3_32_SEED); res_data[i].add(hash_value); @@ -172,7 +172,7 @@ struct BitmapHash { continue; } else { const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); - size_t str_size = offsets[i] - offsets[i - 1] - 1; + size_t str_size = offsets[i] - offsets[i - 1]; uint32_t hash_value = HashUtil::murmur_hash3_32(raw_str, str_size, HashUtil::MURMUR3_32_SEED); res_data[i].add(hash_value); diff --git a/be/src/vec/functions/function_cast.h b/be/src/vec/functions/function_cast.h index 9a34a7b1b5..1dacf86689 100644 --- a/be/src/vec/functions/function_cast.h +++ b/be/src/vec/functions/function_cast.h @@ -22,17 +22,15 @@ #include -#include "common/logging.h" +#include "vec/columns/column_array.h" #include "vec/columns/column_const.h" #include "vec/columns/column_nullable.h" #include "vec/columns/column_string.h" #include "vec/columns/columns_common.h" #include "vec/common/assert_cast.h" -#include "vec/common/field_visitors.h" #include "vec/common/string_buffer.hpp" #include "vec/data_types/data_type_decimal.h" #include "vec/data_types/data_type_factory.hpp" -#include "vec/data_types/data_type_nothing.h" #include "vec/data_types/data_type_nullable.h" #include "vec/data_types/data_type_number.h" #include "vec/data_types/data_type_string.h" @@ -895,7 +893,7 @@ struct ConvertThroughParsing { ? (*offsets)[i] : (current_offset + fixed_string_size); size_t string_size = std::is_same_v - ? next_offset - current_offset - 1 + ? next_offset - current_offset : fixed_string_size; ReadBuffer read_buffer(&(*chars)[current_offset], string_size); diff --git a/be/src/vec/functions/function_datetime_string_to_string.h b/be/src/vec/functions/function_datetime_string_to_string.h index ca33ebd854..7e9e84268b 100644 --- a/be/src/vec/functions/function_datetime_string_to_string.h +++ b/be/src/vec/functions/function_datetime_string_to_string.h @@ -17,6 +17,7 @@ #pragma once +#include "vec/columns/column_const.h" #include "vec/columns/column_nullable.h" #include "vec/columns/columns_number.h" #include "vec/data_types/data_type_date.h" diff --git a/be/src/vec/functions/function_encryption.cpp b/be/src/vec/functions/function_encryption.cpp index cfa5b0d9fc..fa650e234c 100644 --- a/be/src/vec/functions/function_encryption.cpp +++ b/be/src/vec/functions/function_encryption.cpp @@ -16,13 +16,8 @@ // under the License. #include "exprs/encryption_functions.h" -#include "runtime/string_search.hpp" #include "util/encryption_util.h" -#include "util/string_util.h" -#include "vec/common/pod_array_fwd.h" #include "vec/functions/function_string.h" -#include "vec/functions/function_string_to_string.h" -#include "vec/functions/function_totype.h" #include "vec/functions/simple_function_factory.h" namespace doris::vectorized { @@ -93,16 +88,16 @@ public: template static void exectue_result(std::vector& offsets_list, std::vector& chars_list, size_t i, - EncryptionMode& encryption_mode, const char* iv_raw, + EncryptionMode& encryption_mode, const char* iv_raw, int iv_length, ColumnString::Chars& result_data, ColumnString::Offsets& result_offset, NullMap& null_map) { - int src_size = (*offsets_list[0])[i] - (*offsets_list[0])[i - 1] - 1; + int src_size = (*offsets_list[0])[i] - (*offsets_list[0])[i - 1]; const auto src_raw = reinterpret_cast(&(*chars_list[0])[(*offsets_list[0])[i - 1]]); - int key_size = (*offsets_list[1])[i] - (*offsets_list[1])[i - 1] - 1; + int key_size = (*offsets_list[1])[i] - (*offsets_list[1])[i - 1]; const auto key_raw = reinterpret_cast(&(*chars_list[1])[(*offsets_list[1])[i - 1]]); - if (*src_raw == '\0' && src_size == 0) { + if (src_size == 0) { StringOP::push_null_string(i, result_data, result_offset, null_map); return; } @@ -115,7 +110,7 @@ static void exectue_result(std::vector& offsets_li int ret_code = 0; ret_code = Impl::exectue_impl(encryption_mode, (unsigned char*)src_raw, src_size, - (unsigned char*)key_raw, key_size, iv_raw, true, + (unsigned char*)key_raw, key_size, iv_raw, iv_length, true, (unsigned char*)p.get()); if (ret_code < 0) { @@ -143,7 +138,7 @@ struct EncryptionAndDecryptTwoImpl { } EncryptionMode encryption_mode = mode; exectue_result(offsets_list, chars_list, i, encryption_mode, nullptr, - result_data, result_offset, null_map); + 0, result_data, result_offset, null_map); } return Status::OK(); } @@ -167,12 +162,13 @@ struct EncryptionAndDecryptFourImpl { } EncryptionMode encryption_mode = mode; - int mode_size = (*offsets_list[3])[i] - (*offsets_list[3])[i - 1] - 1; + int mode_size = (*offsets_list[3])[i] - (*offsets_list[3])[i - 1]; + int iv_size = (*offsets_list[2])[i] - (*offsets_list[2])[i - 1]; const auto mode_raw = reinterpret_cast(&(*chars_list[3])[(*offsets_list[3])[i - 1]]); const auto iv_raw = reinterpret_cast(&(*chars_list[2])[(*offsets_list[2])[i - 1]]); - if (*mode_raw != '\0' || mode_size != 0) { + if (mode_size != 0) { std::string mode_str(mode_raw, mode_size); if constexpr (is_sm_mode) { if (sm4_mode_map.count(mode_str) == 0) { @@ -190,7 +186,7 @@ struct EncryptionAndDecryptFourImpl { } exectue_result(offsets_list, chars_list, i, encryption_mode, iv_raw, - result_data, result_offset, null_map); + iv_size, result_data, result_offset, null_map); } return Status::OK(); } @@ -199,18 +195,18 @@ struct EncryptionAndDecryptFourImpl { struct EncryptImpl { static int exectue_impl(EncryptionMode mode, const unsigned char* source, uint32_t source_length, const unsigned char* key, uint32_t key_length, - const char* iv, bool padding, unsigned char* encrypt) { - return EncryptionUtil::encrypt(mode, source, source_length, key, key_length, iv, true, - encrypt); + const char* iv, int iv_length, bool padding, unsigned char* encrypt) { + return EncryptionUtil::encrypt(mode, source, source_length, key, key_length, iv, iv_length, + true, encrypt); } }; struct DecryptImpl { static int exectue_impl(EncryptionMode mode, const unsigned char* source, uint32_t source_length, const unsigned char* key, uint32_t key_length, - const char* iv, bool padding, unsigned char* encrypt) { - return EncryptionUtil::decrypt(mode, source, source_length, key, key_length, iv, true, - encrypt); + const char* iv, int iv_length, bool padding, unsigned char* encrypt) { + return EncryptionUtil::decrypt(mode, source, source_length, key, key_length, iv, iv_length, + true, encrypt); } }; @@ -232,22 +228,29 @@ struct AESDecryptName { void register_function_encryption(SimpleFunctionFactory& factory) { factory.register_function, SM4EncryptName>>(); + EncryptionAndDecryptTwoImpl, + SM4EncryptName>>(); factory.register_function, SM4DecryptName>>(); + EncryptionAndDecryptTwoImpl, + SM4DecryptName>>(); factory.register_function, AESEncryptName>>(); + EncryptionAndDecryptTwoImpl, + AESEncryptName>>(); factory.register_function, AESDecryptName>>(); + EncryptionAndDecryptTwoImpl, + AESDecryptName>>(); factory.register_function, SM4EncryptName>>(); + EncryptionAndDecryptFourImpl, + SM4EncryptName>>(); factory.register_function, SM4DecryptName>>(); + EncryptionAndDecryptFourImpl, + SM4DecryptName>>(); factory.register_function, AESEncryptName>>(); + EncryptionAndDecryptFourImpl, + AESEncryptName>>(); factory.register_function, + EncryptionAndDecryptFourImpl, AESDecryptName>>(); } diff --git a/be/src/vec/functions/function_hash.cpp b/be/src/vec/functions/function_hash.cpp index 9f3101b3a3..645187d860 100644 --- a/be/src/vec/functions/function_hash.cpp +++ b/be/src/vec/functions/function_hash.cpp @@ -101,7 +101,7 @@ struct MurmurHash2Impl64 { for (size_t i = 0; i < size; ++i) { const ReturnType val = HashUtil::murmur_hash2_64( reinterpret_cast(&data[current_offset]), - offsets[i] - current_offset - 1, 0); + offsets[i] - current_offset, 0); if (first) col_to.insert_data(reinterpret_cast(&val), 0); @@ -187,14 +187,13 @@ struct MurmurHash3Impl32 { if (first) { UInt32 val = HashUtil::murmur_hash3_32( reinterpret_cast(&data[current_offset]), - offsets[i] - current_offset - 1, HashUtil::MURMUR3_32_SEED); + offsets[i] - current_offset, HashUtil::MURMUR3_32_SEED); col_to.insert_data(const_cast(reinterpret_cast(&val)), 0); } else { assert_cast&>(col_to).get_data()[i] = HashUtil::murmur_hash3_32( reinterpret_cast(&data[current_offset]), - offsets[i] - current_offset - 1, - ext::bit_cast(col_to[i])); + offsets[i] - current_offset, ext::bit_cast(col_to[i])); } current_offset = offsets[i]; } diff --git a/be/src/vec/functions/function_hex.cpp b/be/src/vec/functions/function_hex.cpp index add150791d..4402706e86 100644 --- a/be/src/vec/functions/function_hex.cpp +++ b/be/src/vec/functions/function_hex.cpp @@ -19,9 +19,7 @@ #include "vec/columns/column_complex.h" #include "vec/data_types/data_type.h" #include "vec/data_types/data_type_hll.h" -#include "vec/functions/function_const.h" #include "vec/functions/function_string.h" -#include "vec/functions/function_totype.h" #include "vec/functions/simple_function_factory.h" namespace doris::vectorized { @@ -63,17 +61,11 @@ public: static void hex_encode(const unsigned char* source, size_t srclen, unsigned char*& dst_data_ptr, size_t& offset) { - if (srclen == 0) { - DCHECK(*source == '\0'); - *dst_data_ptr = '\0'; - dst_data_ptr++; - offset++; - } else { + if (srclen != 0) { doris::simd::VStringFunctions::hex_encode(source, srclen, reinterpret_cast(dst_data_ptr)); - dst_data_ptr[srclen * 2] = '\0'; - dst_data_ptr += (srclen * 2 + 1); - offset += (srclen * 2 + 1); + dst_data_ptr += (srclen * 2); + offset += (srclen * 2); } } @@ -92,7 +84,7 @@ struct HexStringImpl { auto dst_data_ptr = dst_data.data(); for (int i = 0; i < input_rows_count; ++i) { auto source = reinterpret_cast(&data[offsets[i - 1]]); - size_t srclen = offsets[i] - offsets[i - 1] - 1; + size_t srclen = offsets[i] - offsets[i - 1]; hex_encode(source, srclen, dst_data_ptr, offset); dst_offsets[i] = offset; } diff --git a/be/src/vec/functions/function_json.cpp b/be/src/vec/functions/function_json.cpp index 6abdc6a4b2..55184997a3 100644 --- a/be/src/vec/functions/function_json.cpp +++ b/be/src/vec/functions/function_json.cpp @@ -204,13 +204,13 @@ rapidjson::Value* get_json_object(const std::string_view& json_string, if (UNLIKELY((*parsed_paths).size() == 1)) { if (fntype == JSON_FUN_STRING) { - document->SetString(json_string.data(), document->GetAllocator()); + document->SetString(json_string.data(), json_string.size(), document->GetAllocator()); } else { return document; } } - document->Parse(json_string.data()); + document->Parse(json_string.data(), json_string.size()); if (UNLIKELY(document->HasParseError())) { // VLOG_CRITICAL << "Error at offset " << document->GetErrorOffset() << ": " // << GetParseError_En(document->GetParseError()); @@ -235,10 +235,10 @@ struct GetJsonNumberType { res.resize(size); for (size_t i = 0; i < size; ++i) { const char* l_raw_str = reinterpret_cast(&ldata[loffsets[i - 1]]); - int l_str_size = loffsets[i] - loffsets[i - 1] - 1; + int l_str_size = loffsets[i] - loffsets[i - 1]; const char* r_raw_str = reinterpret_cast(&rdata[roffsets[i - 1]]); - int r_str_size = roffsets[i] - roffsets[i - 1] - 1; + int r_str_size = roffsets[i] - roffsets[i - 1]; if (null_map[i]) { res[i] = 0; @@ -323,10 +323,10 @@ struct GetJsonString { res_offsets.resize(input_rows_count); for (size_t i = 0; i < input_rows_count; ++i) { - int l_size = loffsets[i] - loffsets[i - 1] - 1; + int l_size = loffsets[i] - loffsets[i - 1]; const auto l_raw = reinterpret_cast(&ldata[loffsets[i - 1]]); - int r_size = roffsets[i] - roffsets[i - 1] - 1; + int r_size = roffsets[i] - roffsets[i - 1]; const auto r_raw = reinterpret_cast(&rdata[roffsets[i - 1]]); if (null_map[i]) { diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp index d5f451ef2d..4d52b91d0b 100644 --- a/be/src/vec/functions/function_string.cpp +++ b/be/src/vec/functions/function_string.cpp @@ -24,7 +24,6 @@ #include #include "runtime/string_search.hpp" -#include "util/encryption_util.h" #include "util/url_coding.h" #include "vec/common/pod_array_fwd.h" #include "vec/functions/function_reverse.h" @@ -49,8 +48,7 @@ struct StringASCII { res.resize(size); for (int i = 0; i < size; ++i) { const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); - // if strlen(raw_str) == 0, raw_str[0] is '\0' - res[i] = (strlen(raw_str) == 0) ? 0 : static_cast(raw_str[0]); + res[i] = (offsets[i] == offsets[i - 1]) ? 0 : static_cast(raw_str[0]); } return Status::OK(); } @@ -71,7 +69,7 @@ struct StringLengthImpl { auto size = offsets.size(); res.resize(size); for (int i = 0; i < size; ++i) { - int str_size = offsets[i] - offsets[i - 1] - 1; + int str_size = offsets[i] - offsets[i - 1]; res[i] = str_size; } return Status::OK(); @@ -94,8 +92,8 @@ struct StringUtf8LengthImpl { res.resize(size); for (int i = 0; i < size; ++i) { const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); - int str_size = offsets[i] - offsets[i - 1] - 1; - res[i] = get_char_len(StringValue(const_cast(raw_str), str_size), str_size); + int str_size = offsets[i] - offsets[i - 1]; + res[i] = get_char_len(StringValue(raw_str, str_size), str_size); } return Status::OK(); } @@ -188,8 +186,8 @@ struct InStrOP { return; } - StringValue str_sv(const_cast(strl.data()), strl.length()); - StringValue substr_sv(const_cast(strr.data()), strr.length()); + StringValue str_sv(strl.data(), strl.length()); + StringValue substr_sv(strr.data(), strr.length()); StringSearch search(&substr_sv); // Hive returns positions starting from 1. int loc = search.search(&str_sv); @@ -224,10 +222,10 @@ struct StringFunctionImpl { res.resize(size); for (int i = 0; i < size; ++i) { const char* l_raw_str = reinterpret_cast(&ldata[loffsets[i - 1]]); - int l_str_size = loffsets[i] - loffsets[i - 1] - 1; + int l_str_size = loffsets[i] - loffsets[i - 1]; const char* r_raw_str = reinterpret_cast(&rdata[roffsets[i - 1]]); - int r_str_size = roffsets[i] - roffsets[i - 1] - 1; + int r_str_size = roffsets[i] - roffsets[i - 1]; std::string_view lview(l_raw_str, l_str_size); std::string_view rview(r_raw_str, r_str_size); @@ -288,7 +286,8 @@ struct TrimImpl { for (size_t i = 0; i < offset_size; ++i) { const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); - StringVal str(raw_str); + ColumnString::Offset size = offsets[i] - offsets[i - 1]; + StringVal str(raw_str, size); if constexpr (is_ltrim) { str = simd::VStringFunctions::ltrim(str); } @@ -367,9 +366,9 @@ struct UnHexImpl { } auto source = reinterpret_cast(&data[offsets[i - 1]]); - size_t srclen = offsets[i] - offsets[i - 1] - 1; + size_t srclen = offsets[i] - offsets[i - 1]; - if (*source == '\0' && srclen == 0) { + if (srclen == 0) { StringOP::push_empty_string(i, dst_data, dst_offsets); continue; } @@ -438,9 +437,9 @@ struct ToBase64Impl { } auto source = reinterpret_cast(&data[offsets[i - 1]]); - size_t srclen = offsets[i] - offsets[i - 1] - 1; + size_t srclen = offsets[i] - offsets[i - 1]; - if (*source == '\0' && srclen == 0) { + if (srclen == 0) { StringOP::push_null_string(i, dst_data, dst_offsets, null_map); continue; } @@ -478,9 +477,9 @@ struct FromBase64Impl { } auto source = reinterpret_cast(&data[offsets[i - 1]]); - size_t srclen = offsets[i] - offsets[i - 1] - 1; + size_t srclen = offsets[i] - offsets[i - 1]; - if (*source == '\0' && srclen == 0) { + if (srclen == 0) { StringOP::push_null_string(i, dst_data, dst_offsets, null_map); continue; } @@ -518,10 +517,10 @@ struct StringAppendTrailingCharIfAbsent { for (size_t i = 0; i < input_rows_count; ++i) { buffer.clear(); - int l_size = loffsets[i] - loffsets[i - 1] - 1; + int l_size = loffsets[i] - loffsets[i - 1]; const auto l_raw = reinterpret_cast(&ldata[loffsets[i - 1]]); - int r_size = roffsets[i] - roffsets[i - 1] - 1; + int r_size = roffsets[i] - roffsets[i - 1]; const auto r_raw = reinterpret_cast(&rdata[roffsets[i - 1]]); if (r_size != 1) { diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 69c61f4d6f..0bd1dac58f 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -38,7 +38,6 @@ #include "vec/columns/columns_number.h" #include "vec/common/assert_cast.h" #include "vec/common/string_ref.h" -#include "vec/data_types/data_type_array.h" #include "vec/data_types/data_type_decimal.h" #include "vec/data_types/data_type_nullable.h" #include "vec/data_types/data_type_number.h" @@ -99,7 +98,6 @@ inline size_t get_char_len(const StringValue& str, size_t end_pos) { struct StringOP { static void push_empty_string(int index, ColumnString::Chars& chars, ColumnString::Offsets& offsets) { - chars.push_back('\0'); offsets[index] = chars.size(); } @@ -112,7 +110,6 @@ struct StringOP { static void push_value_string(const std::string_view& string_value, int index, ColumnString::Chars& chars, ColumnString::Offsets& offsets) { chars.insert(string_value.data(), string_value.data() + string_value.size()); - chars.push_back('\0'); offsets[index] = chars.size(); } }; @@ -167,7 +164,7 @@ private: for (int i = 0; i < size; ++i) { auto* raw_str = reinterpret_cast(&chars[offsets[i - 1]]); - int str_size = offsets[i] - offsets[i - 1] - 1; + int str_size = offsets[i] - offsets[i - 1]; // return empty string if start > src.length if (start[i] > str_size) { StringOP::push_empty_string(i, res_chars, res_offsets); @@ -360,7 +357,7 @@ public: auto& pos_data = assert_cast(pos_col.get())->get_data(); for (int i = 0; i < input_rows_count; ++i) { - strlen_data[i] = str_offset[i] - str_offset[i - 1] - 1; + strlen_data[i] = str_offset[i] - str_offset[i - 1]; } for (int i = 0; i < input_rows_count; ++i) { @@ -407,7 +404,7 @@ public: auto& res_map_data = res_map->get_data(); for (int i = 0; i < input_rows_count; ++i) { - int size = offsets[i] - offsets[i - 1] - 1; + int size = offsets[i] - offsets[i - 1]; res_map_data[i] |= (size == 0); } @@ -464,7 +461,7 @@ public: // but it's not necessary to ignore it for (size_t i = 0; i < offsets_list.size(); ++i) { for (size_t j = 0; j < input_rows_count; ++j) { - res_reserve_size += (*offsets_list[i])[j] - (*offsets_list[i])[j - 1] - 1; + res_reserve_size += (*offsets_list[i])[j] - (*offsets_list[i])[j - 1]; } } // for each terminal zero @@ -478,14 +475,11 @@ public: auto& current_offsets = *offsets_list[j]; auto& current_chars = *chars_list[j]; - int size = current_offsets[i] - current_offsets[i - 1] - 1; + int size = current_offsets[i] - current_offsets[i - 1]; memcpy(&res_data[res_offset[i - 1]] + current_length, ¤t_chars[current_offsets[i - 1]], size); current_length += size; } - // add terminal zero - *(&res_data[res_offset[i - 1]] + current_length) = '\0'; - current_length++; res_offset[i] = res_offset[i - 1] + current_length; } @@ -526,7 +520,6 @@ public: auto& res_offset = res->get_offsets(); res_offset.resize(input_rows_count); for (size_t i = 0; i < input_rows_count; ++i) { - res_data.push_back('\0'); res_offset[i] = res_data.size(); } block.get_by_position(result).column = @@ -554,10 +547,11 @@ public: DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { const IDataType* first_type = arguments[0].get(); - if (first_type->is_nullable()) + if (first_type->is_nullable()) { return make_nullable(std::make_shared()); - else + } else { return std::make_shared(); + } } bool use_default_implementation_for_nulls() const override { return false; } bool use_default_implementation_for_constants() const override { return true; } @@ -679,7 +673,7 @@ private: continue; } - int sep_size = sep_offsets[i] - sep_offsets[i - 1] - 1; + int sep_size = sep_offsets[i] - sep_offsets[i - 1]; const char* sep_data = reinterpret_cast(&sep_chars[sep_offsets[i - 1]]); std::string_view sep(sep_data, sep_size); @@ -691,8 +685,8 @@ private: const auto current_src_string_offset = current_src_array_offset ? src_string_offsets[current_src_array_offset - 1] : 0; - size_t bytes_to_copy = src_string_offsets[current_src_array_offset] - - current_src_string_offset - 1; + size_t bytes_to_copy = + src_string_offsets[current_src_array_offset] - current_src_string_offset; const char* ptr = reinterpret_cast(&string_src_chars[current_src_string_offset]); @@ -725,7 +719,7 @@ private: continue; } - int sep_size = sep_offsets[i] - sep_offsets[i - 1] - 1; + int sep_size = sep_offsets[i] - sep_offsets[i - 1]; const char* sep_data = reinterpret_cast(&sep_chars[sep_offsets[i - 1]]); std::string_view sep(sep_data, sep_size); @@ -735,7 +729,7 @@ private: auto& current_offsets = *offsets_list[j]; auto& current_chars = *chars_list[j]; auto& current_nullmap = *null_list[j]; - int size = current_offsets[i] - current_offsets[i - 1] - 1; + int size = current_offsets[i] - current_offsets[i - 1]; const char* ptr = reinterpret_cast(¤t_chars[current_offsets[i - 1]]); if (!current_nullmap[i]) { @@ -793,7 +787,7 @@ public: for (ssize_t i = 0; i < input_row_size; ++i) { buffer.clear(); const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); - int size = offsets[i] - offsets[i - 1] - 1; + int size = offsets[i] - offsets[i - 1]; int repeat = repeats[i]; // assert size * repeat won't exceed DCHECK_LE(static_cast(size) * repeat, std::numeric_limits::max()); @@ -873,11 +867,11 @@ public: null_map_data[i] = true; StringOP::push_empty_string(i, res_chars, res_offsets); } else { - int str_len = strcol_offsets[i] - strcol_offsets[i - 1] - 1; + int str_len = strcol_offsets[i] - strcol_offsets[i - 1]; const char* str_data = reinterpret_cast(&strcol_chars[strcol_offsets[i - 1]]); - int pad_len = padcol_offsets[i] - padcol_offsets[i - 1] - 1; + int pad_len = padcol_offsets[i] - padcol_offsets[i - 1]; const char* pad_data = reinterpret_cast(&padcol_chars[padcol_offsets[i - 1]]); @@ -1019,7 +1013,7 @@ public: int32_t num = 0; while (num < part_number) { pre_offset = offset; - size_t n = str.size - offset - 1; + size_t n = str.size - offset; const char* pos = reinterpret_cast( memchr(str.data + offset + 1, delimiter_str[0], n)); if (pos != nullptr) { @@ -1137,7 +1131,7 @@ public: auto& current_offsets = *offsets_list[j]; auto& current_chars = *chars_list[j]; - int size = current_offsets[i] - current_offsets[i - 1] - 1; + int size = current_offsets[i] - current_offsets[i - 1]; if (size < 1) { continue; } @@ -1491,7 +1485,7 @@ struct ReverseImpl { res_data.reserve(data.size()); for (ssize_t i = 0; i < rows_count; ++i) { auto src_str = reinterpret_cast(&data[offsets[i - 1]]); - int64_t src_len = offsets[i] - offsets[i - 1] - 1; + int64_t src_len = offsets[i] - offsets[i - 1]; char dst[src_len]; simd::VStringFunctions::reverse(StringVal((uint8_t*)src_str, src_len), StringVal((uint8_t*)dst, src_len)); diff --git a/be/src/vec/functions/function_timestamp.cpp b/be/src/vec/functions/function_timestamp.cpp index de06f993d9..3c2f0d5b65 100644 --- a/be/src/vec/functions/function_timestamp.cpp +++ b/be/src/vec/functions/function_timestamp.cpp @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. -#include "runtime/datetime_value.h" #include "runtime/runtime_state.h" #include "udf/udf_internal.h" #include "vec/columns/column_nullable.h" @@ -23,9 +22,11 @@ #include "vec/columns/column_vector.h" #include "vec/data_types/data_type_date.h" #include "vec/data_types/data_type_date_time.h" -#include "vec/functions/function_totype.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" #include "vec/functions/simple_function_factory.h" #include "vec/runtime/vdatetime_value.h" +#include "vec/utils/util.hpp" namespace doris::vectorized { @@ -97,10 +98,10 @@ struct StrToDate { res.resize(size); for (size_t i = 0; i < size; ++i) { const char* l_raw_str = reinterpret_cast(&ldata[loffsets[i - 1]]); - int l_str_size = loffsets[i] - loffsets[i - 1] - 1; + int l_str_size = loffsets[i] - loffsets[i - 1]; const char* r_raw_str = reinterpret_cast(&rdata[roffsets[i - 1]]); - int r_str_size = roffsets[i] - roffsets[i - 1] - 1; + int r_str_size = roffsets[i] - roffsets[i - 1]; auto& ts_val = *reinterpret_cast(&res[i]); if (!ts_val.from_date_format_str(r_raw_str, r_str_size, l_raw_str, l_str_size)) { diff --git a/be/src/vec/functions/hll_hash.cpp b/be/src/vec/functions/hll_hash.cpp index bc7e1f5979..66e2cf089b 100644 --- a/be/src/vec/functions/hll_hash.cpp +++ b/be/src/vec/functions/hll_hash.cpp @@ -15,9 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "exprs/hll_function.h" #include "olap/hll.h" -#include "udf/udf.h" #include "vec/data_types/data_type_hll.h" #include "vec/functions/function_always_not_nullable.h" #include "vec/functions/simple_function_factory.h" @@ -37,7 +35,7 @@ struct HLLHash { for (size_t i = 0; i < size; ++i) { const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); - size_t str_size = offsets[i] - offsets[i - 1] - 1; + size_t str_size = offsets[i] - offsets[i - 1]; uint64_t hash_value = HashUtil::murmur_hash64A(raw_str, str_size, HashUtil::MURMUR_SEED); res_data[i].update(hash_value); @@ -56,7 +54,7 @@ struct HLLHash { continue; } else { const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); - size_t str_size = offsets[i] - offsets[i - 1] - 1; + size_t str_size = offsets[i] - offsets[i - 1]; uint64_t hash_value = HashUtil::murmur_hash64A(raw_str, str_size, HashUtil::MURMUR_SEED); res_data[i].update(hash_value); diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp index 86db32c0dd..426fea9562 100644 --- a/be/src/vec/functions/like.cpp +++ b/be/src/vec/functions/like.cpp @@ -19,12 +19,7 @@ #include "runtime/string_value.h" #include "runtime/string_value.hpp" -#include "vec/columns/column_const.h" -#include "vec/columns/column_set.h" #include "vec/columns/columns_number.h" -#include "vec/data_types/data_type_nullable.h" -#include "vec/data_types/data_type_number.h" -#include "vec/functions/function.h" #include "vec/functions/simple_function_factory.h" namespace doris::vectorized { @@ -226,10 +221,10 @@ Status FunctionLikeBase::vector_vector(const ColumnString::Chars& values, /// Determine which index it refers to. /// begin + value_offsets[i] is the start offset of string at i+1 - while (begin + value_offsets[i] <= pos) ++i; + while (begin + value_offsets[i] < pos) ++i; /// We check that the entry does not pass through the boundaries of strings. - if (pos + needle_size < begin + value_offsets[i]) { + if (pos + needle_size <= begin + value_offsets[i]) { result[i] = 1; } @@ -245,10 +240,10 @@ Status FunctionLikeBase::vector_vector(const ColumnString::Chars& values, for (int i = 0; i < size; ++i) { char* val_raw_str = (char*)(&values[value_offsets[i - 1]]); - UInt32 val_str_size = value_offsets[i] - value_offsets[i - 1] - 1; + UInt32 val_str_size = value_offsets[i] - value_offsets[i - 1]; char* pattern_raw_str = (char*)(&patterns[pattern_offsets[i - 1]]); - UInt32 patter_str_size = pattern_offsets[i] - pattern_offsets[i - 1] - 1; + UInt32 patter_str_size = pattern_offsets[i] - pattern_offsets[i - 1]; RETURN_IF_ERROR((function)(search_state, StringValue(val_raw_str, val_str_size), StringValue(pattern_raw_str, patter_str_size), &result[i])); } diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp index c8d07e4a8e..61413663e6 100644 --- a/be/src/vec/olap/olap_data_convertor.cpp +++ b/be/src/vec/olap/olap_data_convertor.cpp @@ -17,7 +17,6 @@ #include "vec/olap/olap_data_convertor.h" -#include "common/consts.h" #include "olap/tablet_schema.h" #include "vec/columns/column_array.h" #include "vec/columns/column_complex.h" @@ -447,7 +446,7 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::convert_to_olap() while (offset_cur != offset_end) { if (!*nullmap_cur) { slice->data = const_cast(char_data + string_offset); - slice->size = *offset_cur - string_offset - 1; + slice->size = *offset_cur - string_offset; if (UNLIKELY(slice->size > config::string_type_length_soft_limit_bytes && _check_length)) { return Status::NotSupported( @@ -468,7 +467,7 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorVarChar::convert_to_olap() } else { while (offset_cur != offset_end) { slice->data = const_cast(char_data + string_offset); - slice->size = *offset_cur - string_offset - 1; + slice->size = *offset_cur - string_offset; if (UNLIKELY(slice->size > config::string_type_length_soft_limit_bytes && _check_length)) { return Status::NotSupported( diff --git a/be/src/vec/olap/olap_data_convertor.h b/be/src/vec/olap/olap_data_convertor.h index 719c7d78ca..bb0bfcf6e2 100644 --- a/be/src/vec/olap/olap_data_convertor.h +++ b/be/src/vec/olap/olap_data_convertor.h @@ -18,7 +18,6 @@ #pragma once #include "olap/types.h" -#include "runtime/mem_pool.h" #include "vec/columns/column_nullable.h" #include "vec/core/column_with_type_and_name.h" #include "vec/core/types.h" @@ -117,7 +116,7 @@ private: private: static bool should_padding(const ColumnString* column, size_t padding_length) { // Check sum of data length, including terminating zero. - return column->size() * (padding_length + 1) != column->chars.size(); + return column->size() * padding_length != column->chars.size(); } static ColumnPtr clone_and_padding(const ColumnString* input, size_t padding_length) { @@ -126,11 +125,11 @@ private: assert_cast(column->assume_mutable().get()); column->offsets.resize(input->size()); - column->chars.resize(input->size() * (padding_length + 1)); - memset(padded_column->chars.data(), 0, input->size() * (padding_length + 1)); + column->chars.resize(input->size() * padding_length); + memset(padded_column->chars.data(), 0, input->size() * padding_length); for (size_t i = 0; i < input->size(); i++) { - column->offsets[i] = (i + 1) * (padding_length + 1); + column->offsets[i] = (i + 1) * padding_length; auto str = input->get_data_at(i); @@ -139,8 +138,7 @@ private: << ", real=" << str.size; if (str.size) { - memcpy(padded_column->chars.data() + i * (padding_length + 1), str.data, - str.size); + memcpy(padded_column->chars.data() + i * padding_length, str.data, str.size); } } diff --git a/be/src/vec/runtime/vdata_stream_recvr.cpp b/be/src/vec/runtime/vdata_stream_recvr.cpp index 02d450ce1b..0e40404733 100644 --- a/be/src/vec/runtime/vdata_stream_recvr.cpp +++ b/be/src/vec/runtime/vdata_stream_recvr.cpp @@ -152,7 +152,6 @@ void VDataStreamRecvr::SenderQueue::add_block(Block* block, bool use_move) { return; } Block* nblock = new Block(block->get_columns_with_type_and_name()); - nblock->info = block->info; // local exchange should copy the block contented if use move == false if (use_move) { diff --git a/be/src/vec/sink/vmysql_result_writer.cpp b/be/src/vec/sink/vmysql_result_writer.cpp index c90b0c9f8d..b558e60e78 100644 --- a/be/src/vec/sink/vmysql_result_writer.cpp +++ b/be/src/vec/sink/vmysql_result_writer.cpp @@ -20,6 +20,7 @@ #include "runtime/buffer_control_block.h" #include "runtime/large_int_value.h" #include "runtime/runtime_state.h" +#include "vec/columns/column_array.h" #include "vec/columns/column_nullable.h" #include "vec/columns/column_vector.h" #include "vec/common/assert_cast.h" diff --git a/be/src/vec/utils/arrow_column_to_doris_column.cpp b/be/src/vec/utils/arrow_column_to_doris_column.cpp index 4d9716b55c..eacd1136b5 100644 --- a/be/src/vec/utils/arrow_column_to_doris_column.cpp +++ b/be/src/vec/utils/arrow_column_to_doris_column.cpp @@ -23,10 +23,8 @@ #include "arrow/array/array_binary.h" #include "arrow/array/array_nested.h" -#include "arrow/scalar.h" #include "arrow/type.h" #include "arrow/type_fwd.h" -#include "arrow/type_traits.h" #include "gutil/casts.h" #include "vec/columns/column_array.h" #include "vec/columns/column_nullable.h" @@ -114,7 +112,6 @@ static Status convert_column_with_string_data(const arrow::Array* array, size_t const auto* raw_data = buffer->data() + concrete_array->value_offset(offset_i); column_chars_t.insert(raw_data, raw_data + concrete_array->value_length(offset_i)); } - column_chars_t.emplace_back('\0'); column_offsets.emplace_back(column_chars_t.size()); } @@ -136,7 +133,6 @@ static Status convert_column_with_fixed_size_data(const arrow::Array* array, siz const auto* raw_data = array_data + (offset_i * width); column_chars_t.insert(raw_data, raw_data + width); } - column_chars_t.emplace_back('\0'); column_offsets.emplace_back(column_chars_t.size()); } return Status::OK(); diff --git a/be/test/util/encryption_util_test.cpp b/be/test/util/encryption_util_test.cpp index 089bad2185..60e1902e7c 100644 --- a/be/test/util/encryption_util_test.cpp +++ b/be/test/util/encryption_util_test.cpp @@ -37,14 +37,14 @@ private: void do_aes_test(const std::string& source, const std::string& key) { int cipher_len = source.length() + 16; std::unique_ptr dest(new unsigned char[cipher_len]); - int ret_code = EncryptionUtil::encrypt(AES_128_ECB, (unsigned char*)source.c_str(), - source.length(), (unsigned char*)key.c_str(), - key.length(), nullptr, true, dest.get()); + int ret_code = EncryptionUtil::encrypt( + EncryptionMode::AES_128_ECB, (unsigned char*)source.c_str(), source.length(), + (unsigned char*)key.c_str(), key.length(), nullptr, 0, true, dest.get()); EXPECT_TRUE(ret_code > 0); int encrypted_length = ret_code; std::unique_ptr decrypted(new char[cipher_len]); - ret_code = EncryptionUtil::decrypt(AES_128_ECB, dest.get(), encrypted_length, - (unsigned char*)key.c_str(), key.length(), nullptr, true, + ret_code = EncryptionUtil::decrypt(EncryptionMode::AES_128_ECB, dest.get(), encrypted_length, + (unsigned char*)key.c_str(), key.length(), nullptr, 0, true, (unsigned char*)decrypted.get()); EXPECT_TRUE(ret_code > 0); std::string decrypted_content(decrypted.get(), ret_code); @@ -54,14 +54,14 @@ void do_aes_test(const std::string& source, const std::string& key) { void do_sm4_test(const std::string& source, const std::string& key) { int cipher_len = source.length() + 16; std::unique_ptr dest(new unsigned char[cipher_len]); - int ret_code = EncryptionUtil::encrypt(SM4_128_ECB, (unsigned char*)source.c_str(), - source.length(), (unsigned char*)key.c_str(), - key.length(), nullptr, true, dest.get()); + int ret_code = EncryptionUtil::encrypt( + EncryptionMode::SM4_128_ECB, (unsigned char*)source.c_str(), source.length(), + (unsigned char*)key.c_str(), key.length(), nullptr, 0, true, dest.get()); EXPECT_TRUE(ret_code > 0); int encrypted_length = ret_code; std::unique_ptr decrypted(new char[cipher_len]); - ret_code = EncryptionUtil::decrypt(SM4_128_ECB, dest.get(), encrypted_length, - (unsigned char*)key.c_str(), key.length(), nullptr, true, + ret_code = EncryptionUtil::decrypt(EncryptionMode::SM4_128_ECB, dest.get(), encrypted_length, + (unsigned char*)key.c_str(), key.length(), nullptr, 0, true, (unsigned char*)decrypted.get()); EXPECT_TRUE(ret_code > 0); std::string decrypted_content(decrypted.get(), ret_code); @@ -91,9 +91,10 @@ TEST_F(EncryptionUtilTest, aes_test_by_case) { std::unique_ptr encrypt_1(new char[case_1.length()]); int length_1 = base64_decode(case_1.c_str(), case_1.length(), encrypt_1.get()); std::unique_ptr decrypted_1(new char[case_1.length()]); - int ret_code = EncryptionUtil::decrypt(AES_128_ECB, (unsigned char*)encrypt_1.get(), length_1, - (unsigned char*)_aes_key.c_str(), _aes_key.length(), - nullptr, true, (unsigned char*)decrypted_1.get()); + int ret_code = + EncryptionUtil::decrypt(EncryptionMode::AES_128_ECB, (unsigned char*)encrypt_1.get(), + length_1, (unsigned char*)_aes_key.c_str(), _aes_key.length(), + nullptr, 0, true, (unsigned char*)decrypted_1.get()); EXPECT_TRUE(ret_code > 0); std::string decrypted_content_1(decrypted_1.get(), ret_code); EXPECT_EQ(source_1, decrypted_content_1); @@ -101,9 +102,10 @@ TEST_F(EncryptionUtilTest, aes_test_by_case) { std::unique_ptr encrypt_2(new char[case_2.length()]); int length_2 = base64_decode(case_2.c_str(), case_2.length(), encrypt_2.get()); std::unique_ptr decrypted_2(new char[case_2.length()]); - ret_code = EncryptionUtil::decrypt(AES_128_ECB, (unsigned char*)encrypt_2.get(), length_2, - (unsigned char*)_aes_key.c_str(), _aes_key.length(), nullptr, - true, (unsigned char*)decrypted_2.get()); + ret_code = + EncryptionUtil::decrypt(EncryptionMode::AES_128_ECB, (unsigned char*)encrypt_2.get(), + length_2, (unsigned char*)_aes_key.c_str(), _aes_key.length(), + nullptr, 0, true, (unsigned char*)decrypted_2.get()); EXPECT_TRUE(ret_code > 0); std::string decrypted_content_2(decrypted_2.get(), ret_code); EXPECT_EQ(source_2, decrypted_content_2); @@ -118,9 +120,10 @@ TEST_F(EncryptionUtilTest, sm4_test_by_case) { std::unique_ptr encrypt_1(new char[case_1.length()]); int length_1 = base64_decode(case_1.c_str(), case_1.length(), encrypt_1.get()); std::unique_ptr decrypted_1(new char[case_1.length()]); - int ret_code = EncryptionUtil::decrypt(SM4_128_ECB, (unsigned char*)encrypt_1.get(), length_1, - (unsigned char*)_aes_key.c_str(), _aes_key.length(), - nullptr, true, (unsigned char*)decrypted_1.get()); + int ret_code = + EncryptionUtil::decrypt(EncryptionMode::SM4_128_ECB, (unsigned char*)encrypt_1.get(), + length_1, (unsigned char*)_aes_key.c_str(), _aes_key.length(), + nullptr, 0, true, (unsigned char*)decrypted_1.get()); EXPECT_TRUE(ret_code > 0); std::string decrypted_content_1(decrypted_1.get(), ret_code); EXPECT_EQ(source_1, decrypted_content_1); @@ -128,9 +131,10 @@ TEST_F(EncryptionUtilTest, sm4_test_by_case) { std::unique_ptr encrypt_2(new char[case_2.length()]); int length_2 = base64_decode(case_2.c_str(), case_2.length(), encrypt_2.get()); std::unique_ptr decrypted_2(new char[case_2.length()]); - ret_code = EncryptionUtil::decrypt(SM4_128_ECB, (unsigned char*)encrypt_2.get(), length_2, - (unsigned char*)_aes_key.c_str(), _aes_key.length(), nullptr, - true, (unsigned char*)decrypted_2.get()); + ret_code = + EncryptionUtil::decrypt(EncryptionMode::SM4_128_ECB, (unsigned char*)encrypt_2.get(), + length_2, (unsigned char*)_aes_key.c_str(), _aes_key.length(), + nullptr, 0, true, (unsigned char*)decrypted_2.get()); EXPECT_TRUE(ret_code > 0); std::string decrypted_content_2(decrypted_2.get(), ret_code); EXPECT_EQ(source_2, decrypted_content_2); @@ -146,17 +150,19 @@ TEST_F(EncryptionUtilTest, aes_with_iv_test_by_case) { std::unique_ptr encrypt_1(new char[case_1.length()]); int length_1 = base64_decode(case_1.c_str(), case_1.length(), encrypt_1.get()); std::unique_ptr decrypted_1(new char[case_1.length()]); - int ret_code = EncryptionUtil::decrypt(AES_128_CBC, (unsigned char*)encrypt_1.get(), length_1, - (unsigned char*)_aes_key.c_str(), _aes_key.length(), - iv.c_str(), true, (unsigned char*)decrypted_1.get()); + int ret_code = EncryptionUtil::decrypt( + EncryptionMode::AES_128_CBC, (unsigned char*)encrypt_1.get(), length_1, + (unsigned char*)_aes_key.c_str(), _aes_key.length(), iv.c_str(), iv.length(), true, + (unsigned char*)decrypted_1.get()); EXPECT_TRUE(ret_code > 0); std::string decrypted_content_1(decrypted_1.get(), ret_code); EXPECT_EQ(source_1, decrypted_content_1); std::unique_ptr decrypted_11(new char[case_1.length()]); - ret_code = EncryptionUtil::decrypt(AES_128_CBC, (unsigned char*)encrypt_1.get(), length_1, - (unsigned char*)_aes_key.c_str(), _aes_key.length(), - iv.c_str(), true, (unsigned char*)decrypted_11.get()); + ret_code = EncryptionUtil::decrypt(EncryptionMode::AES_128_CBC, (unsigned char*)encrypt_1.get(), + length_1, (unsigned char*)_aes_key.c_str(), + _aes_key.length(), iv.c_str(), iv.length(), true, + (unsigned char*)decrypted_11.get()); EXPECT_TRUE(ret_code > 0); std::string decrypted_content_11(decrypted_11.get(), ret_code); EXPECT_EQ(source_1, decrypted_content_11); @@ -164,17 +170,19 @@ TEST_F(EncryptionUtilTest, aes_with_iv_test_by_case) { std::unique_ptr encrypt_2(new char[case_2.length()]); int length_2 = base64_decode(case_2.c_str(), case_2.length(), encrypt_2.get()); std::unique_ptr decrypted_2(new char[case_2.length()]); - ret_code = EncryptionUtil::decrypt(AES_128_CBC, (unsigned char*)encrypt_2.get(), length_2, - (unsigned char*)_aes_key.c_str(), _aes_key.length(), - iv.c_str(), true, (unsigned char*)decrypted_2.get()); + ret_code = EncryptionUtil::decrypt(EncryptionMode::AES_128_CBC, (unsigned char*)encrypt_2.get(), + length_2, (unsigned char*)_aes_key.c_str(), + _aes_key.length(), iv.c_str(), iv.length(), true, + (unsigned char*)decrypted_2.get()); EXPECT_TRUE(ret_code > 0); std::string decrypted_content_2(decrypted_2.get(), ret_code); EXPECT_EQ(source_2, decrypted_content_2); std::unique_ptr decrypted_21(new char[case_2.length()]); - ret_code = EncryptionUtil::decrypt(AES_128_CBC, (unsigned char*)encrypt_2.get(), length_2, - (unsigned char*)_aes_key.c_str(), _aes_key.length(), - iv.c_str(), true, (unsigned char*)decrypted_21.get()); + ret_code = EncryptionUtil::decrypt(EncryptionMode::AES_128_CBC, (unsigned char*)encrypt_2.get(), + length_2, (unsigned char*)_aes_key.c_str(), + _aes_key.length(), iv.c_str(), iv.length(), true, + (unsigned char*)decrypted_21.get()); EXPECT_TRUE(ret_code > 0); std::string decrypted_content_21(decrypted_21.get(), ret_code); EXPECT_EQ(source_2, decrypted_content_21); @@ -192,9 +200,10 @@ TEST_F(EncryptionUtilTest, sm4_with_iv_test_by_case) { std::unique_ptr decrypted_1(new char[case_1.length()]); std::unique_ptr decrypted_11(new char[case_1.length()]); - int ret_code = EncryptionUtil::decrypt(SM4_128_CBC, (unsigned char*)encrypt_1.get(), length_1, - (unsigned char*)_aes_key.c_str(), _aes_key.length(), - iv.c_str(), true, (unsigned char*)decrypted_1.get()); + int ret_code = EncryptionUtil::decrypt( + EncryptionMode::SM4_128_CBC, (unsigned char*)encrypt_1.get(), length_1, + (unsigned char*)_aes_key.c_str(), _aes_key.length(), iv.c_str(), iv.length(), true, + (unsigned char*)decrypted_1.get()); EXPECT_TRUE(ret_code > 0); std::string decrypted_content_1(decrypted_1.get(), ret_code); EXPECT_EQ(source_1, decrypted_content_1); @@ -204,23 +213,26 @@ TEST_F(EncryptionUtilTest, sm4_with_iv_test_by_case) { std::unique_ptr decrypted_2(new char[case_2.length()]); std::unique_ptr decrypted_21(new char[case_2.length()]); - ret_code = EncryptionUtil::decrypt(SM4_128_CBC, (unsigned char*)encrypt_2.get(), length_2, - (unsigned char*)_aes_key.c_str(), _aes_key.length(), - iv.c_str(), true, (unsigned char*)decrypted_2.get()); + ret_code = EncryptionUtil::decrypt(EncryptionMode::SM4_128_CBC, (unsigned char*)encrypt_2.get(), + length_2, (unsigned char*)_aes_key.c_str(), + _aes_key.length(), iv.c_str(), iv.length(), true, + (unsigned char*)decrypted_2.get()); EXPECT_TRUE(ret_code > 0); std::string decrypted_content_2(decrypted_2.get(), ret_code); EXPECT_EQ(source_2, decrypted_content_2); - ret_code = EncryptionUtil::decrypt(SM4_128_CBC, (unsigned char*)encrypt_1.get(), length_1, - (unsigned char*)_aes_key.c_str(), _aes_key.length(), - iv.c_str(), true, (unsigned char*)decrypted_11.get()); + ret_code = EncryptionUtil::decrypt(EncryptionMode::SM4_128_CBC, (unsigned char*)encrypt_1.get(), + length_1, (unsigned char*)_aes_key.c_str(), + _aes_key.length(), iv.c_str(), iv.length(), true, + (unsigned char*)decrypted_11.get()); EXPECT_TRUE(ret_code > 0); std::string decrypted_content_11(decrypted_11.get(), ret_code); EXPECT_EQ(source_1, decrypted_content_11); - ret_code = EncryptionUtil::decrypt(SM4_128_CBC, (unsigned char*)encrypt_2.get(), length_2, - (unsigned char*)_aes_key.c_str(), _aes_key.length(), - iv.c_str(), true, (unsigned char*)decrypted_21.get()); + ret_code = EncryptionUtil::decrypt(EncryptionMode::SM4_128_CBC, (unsigned char*)encrypt_2.get(), + length_2, (unsigned char*)_aes_key.c_str(), + _aes_key.length(), iv.c_str(), iv.length(), true, + (unsigned char*)decrypted_21.get()); EXPECT_TRUE(ret_code > 0); std::string decrypted_content_21(decrypted_21.get(), ret_code); EXPECT_EQ(source_2, decrypted_content_21); diff --git a/be/test/vec/core/block_test.cpp b/be/test/vec/core/block_test.cpp index 70be195c69..5f222f5327 100644 --- a/be/test/vec/core/block_test.cpp +++ b/be/test/vec/core/block_test.cpp @@ -28,6 +28,7 @@ #include "runtime/row_batch.h" #include "runtime/string_value.h" #include "runtime/tuple_row.h" +#include "vec/columns/column_array.h" #include "vec/columns/column_decimal.h" #include "vec/columns/column_nullable.h" #include "vec/columns/column_string.h" @@ -145,7 +146,7 @@ TEST(BlockTest, RowBatchCovertToBlock) { } EXPECT_EQ(column2->get_int(i), k2++); EXPECT_EQ(column3->get_float64(i), k3); - EXPECT_STREQ(column4->get_data_at(i).data, std::to_string(k1).c_str()); + EXPECT_EQ(column4->get_data_at(i).to_string(), std::to_string(k1)); auto decimal_field = column5->operator[](i).get>(); DecimalV2Value decimalv2_num(std::to_string(k3)); diff --git a/be/test/vec/core/column_array_test.cpp b/be/test/vec/core/column_array_test.cpp index cc7be2f11d..b573f90eec 100644 --- a/be/test/vec/core/column_array_test.cpp +++ b/be/test/vec/core/column_array_test.cpp @@ -52,7 +52,7 @@ void check_array_data(const IColumn& arr, const std::vector& data) ASSERT_EQ(data_col->size(), data.size()); for (size_t i = 0; i < data_col->size(); ++i) { auto element = data_col->get_data_at(i); - ASSERT_EQ(std::string(element.data), data[i]); + ASSERT_EQ(std::string(element.data, element.size), data[i]); } } diff --git a/be/test/vec/core/column_complex_test.cpp b/be/test/vec/core/column_complex_test.cpp index ce9e4d60f3..6a6e823481 100644 --- a/be/test/vec/core/column_complex_test.cpp +++ b/be/test/vec/core/column_complex_test.cpp @@ -22,6 +22,7 @@ #include #include +#include "vec/core/block.h" #include "vec/data_types/data_type_bitmap.h" namespace doris::vectorized { TEST(ColumnComplexTest, BasicTest) { @@ -63,13 +64,14 @@ public: void check_serialize_and_deserialize(MutableColumnPtr& col) { auto column = assert_cast(col.get()); - auto size = _bitmap_type.get_uncompressed_serialized_bytes(*column); + auto size = + _bitmap_type.get_uncompressed_serialized_bytes(*column, Block::max_data_version); std::unique_ptr buf = std::make_unique(size); - auto result = _bitmap_type.serialize(*column, buf.get()); + auto result = _bitmap_type.serialize(*column, buf.get(), Block::max_data_version); ASSERT_EQ(result, buf.get() + size); auto column2 = _bitmap_type.create_column(); - _bitmap_type.deserialize(buf.get(), column2.get()); + _bitmap_type.deserialize(buf.get(), column2.get(), Block::max_data_version); check_bitmap_column(*column, *column2.get()); } diff --git a/be/test/vec/function/function_string_test.cpp b/be/test/vec/function/function_string_test.cpp index fe4b90fab8..4f28a0b940 100644 --- a/be/test/vec/function/function_string_test.cpp +++ b/be/test/vec/function/function_string_test.cpp @@ -700,9 +700,9 @@ TEST(function_string_test, function_aes_encrypt_test) { int cipher_len = strlen(src[i]) + 16; char p[cipher_len]; - int outlen = EncryptionUtil::encrypt(AES_128_ECB, (unsigned char*)src[i], - strlen(src[i]), (unsigned char*)key, strlen(key), - NULL, true, (unsigned char*)p); + int outlen = EncryptionUtil::encrypt( + EncryptionMode::AES_128_ECB, (unsigned char*)src[i], strlen(src[i]), + (unsigned char*)key, strlen(key), nullptr, 0, true, (unsigned char*)p); r[i] = std::string(p, outlen); } @@ -733,9 +733,10 @@ TEST(function_string_test, function_aes_encrypt_test) { init_vec.reset(new char[iv_len]); std::memset(init_vec.get(), 0, strlen(iv) + 1); memcpy(init_vec.get(), iv, strlen(iv)); - int outlen = EncryptionUtil::encrypt(AES_256_ECB, (unsigned char*)src[i], - strlen(src[i]), (unsigned char*)key, strlen(key), - init_vec.get(), true, (unsigned char*)p); + int outlen = + EncryptionUtil::encrypt(EncryptionMode::AES_256_ECB, (unsigned char*)src[i], + strlen(src[i]), (unsigned char*)key, strlen(key), + init_vec.get(), strlen(iv), true, (unsigned char*)p); r[i] = std::string(p, outlen); } @@ -766,9 +767,9 @@ TEST(function_string_test, function_aes_decrypt_test) { int cipher_len = strlen(src[i]) + 16; char p[cipher_len]; - int outlen = EncryptionUtil::encrypt(AES_128_ECB, (unsigned char*)src[i], - strlen(src[i]), (unsigned char*)key, strlen(key), - NULL, true, (unsigned char*)p); + int outlen = EncryptionUtil::encrypt( + EncryptionMode::AES_128_ECB, (unsigned char*)src[i], strlen(src[i]), + (unsigned char*)key, strlen(key), nullptr, 0, true, (unsigned char*)p); r[i] = std::string(p, outlen); } @@ -798,9 +799,10 @@ TEST(function_string_test, function_aes_decrypt_test) { init_vec.reset(new char[iv_len]); std::memset(init_vec.get(), 0, strlen(iv) + 1); memcpy(init_vec.get(), iv, strlen(iv)); - int outlen = EncryptionUtil::encrypt(AES_128_OFB, (unsigned char*)src[i], - strlen(src[i]), (unsigned char*)key, strlen(key), - init_vec.get(), true, (unsigned char*)p); + int outlen = + EncryptionUtil::encrypt(EncryptionMode::AES_128_OFB, (unsigned char*)src[i], + strlen(src[i]), (unsigned char*)key, strlen(key), + init_vec.get(), strlen(iv), true, (unsigned char*)p); r[i] = std::string(p, outlen); } DataSet data_set = { @@ -828,9 +830,9 @@ TEST(function_string_test, function_sm4_encrypt_test) { int cipher_len = strlen(src[i]) + 16; char p[cipher_len]; - int outlen = EncryptionUtil::encrypt(SM4_128_ECB, (unsigned char*)src[i], - strlen(src[i]), (unsigned char*)key, strlen(key), - NULL, true, (unsigned char*)p); + int outlen = EncryptionUtil::encrypt( + EncryptionMode::SM4_128_ECB, (unsigned char*)src[i], strlen(src[i]), + (unsigned char*)key, strlen(key), nullptr, 0, true, (unsigned char*)p); r[i] = std::string(p, outlen); } @@ -863,9 +865,10 @@ TEST(function_string_test, function_sm4_encrypt_test) { init_vec.reset(new char[iv_len]); std::memset(init_vec.get(), 0, strlen(iv) + 1); memcpy(init_vec.get(), iv, strlen(iv)); - int outlen = EncryptionUtil::encrypt(SM4_128_CTR, (unsigned char*)src[i], - strlen(src[i]), (unsigned char*)key, strlen(key), - init_vec.get(), true, (unsigned char*)p); + int outlen = + EncryptionUtil::encrypt(EncryptionMode::SM4_128_CTR, (unsigned char*)src[i], + strlen(src[i]), (unsigned char*)key, strlen(key), + init_vec.get(), strlen(iv), true, (unsigned char*)p); r[i] = std::string(p, outlen); } @@ -896,9 +899,9 @@ TEST(function_string_test, function_sm4_decrypt_test) { int cipher_len = strlen(src[i]) + 16; char p[cipher_len]; - int outlen = EncryptionUtil::encrypt(SM4_128_ECB, (unsigned char*)src[i], - strlen(src[i]), (unsigned char*)key, strlen(key), - NULL, true, (unsigned char*)p); + int outlen = EncryptionUtil::encrypt( + EncryptionMode::SM4_128_ECB, (unsigned char*)src[i], strlen(src[i]), + (unsigned char*)key, strlen(key), nullptr, 0, true, (unsigned char*)p); r[i] = std::string(p, outlen); } @@ -930,9 +933,10 @@ TEST(function_string_test, function_sm4_decrypt_test) { init_vec.reset(new char[iv_len]); std::memset(init_vec.get(), 0, strlen(iv) + 1); memcpy(init_vec.get(), iv, strlen(iv)); - int outlen = EncryptionUtil::encrypt(SM4_128_OFB, (unsigned char*)src[i], - strlen(src[i]), (unsigned char*)key, strlen(key), - init_vec.get(), true, (unsigned char*)p); + int outlen = + EncryptionUtil::encrypt(EncryptionMode::SM4_128_OFB, (unsigned char*)src[i], + strlen(src[i]), (unsigned char*)key, strlen(key), + init_vec.get(), strlen(iv), true, (unsigned char*)p); r[i] = std::string(p, outlen); } diff --git a/be/test/vec/utils/arrow_column_to_doris_column_test.cpp b/be/test/vec/utils/arrow_column_to_doris_column_test.cpp index c7aaae2310..608775ef13 100644 --- a/be/test/vec/utils/arrow_column_to_doris_column_test.cpp +++ b/be/test/vec/utils/arrow_column_to_doris_column_test.cpp @@ -39,6 +39,7 @@ #include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "gutil/casts.h" +#include "vec/columns/column_array.h" #include "vec/columns/column_nullable.h" #include "vec/data_types/data_type_decimal.h" #include "vec/data_types/data_type_factory.hpp" diff --git a/gensrc/proto/data.proto b/gensrc/proto/data.proto index f066dc4a5b..a041ca5ed2 100644 --- a/gensrc/proto/data.proto +++ b/gensrc/proto/data.proto @@ -66,4 +66,5 @@ message PBlock { optional bool compressed = 3 [default = false]; optional int64 uncompressed_size = 4; optional segment_v2.CompressionTypePB compression_type = 5 [default = SNAPPY]; + optional int32 data_version = 6 [default = -1]; }