[Performance](Variant) Improve load performance for variant type (#33890)
1. remove phmap for padding rows 2. add SimpleFieldVisitorToScalarType for short-circuit type deduction 3. correct type coercion for conflicting integer types 4. improve nullable column performance 5. remove the shared_ptr dependency for DataType and use TypeIndex instead 6. optimize by caching the order of fields (which is almost always the same) and doing a quick check to match the next expected field, instead of searching the hash table. Benchmark: on ClickBench data, load time drops from 12m36.799s to 7m10.934s, about a 43% latency reduction. In variant_p2/performance.groovy: 3min44s20 -> 1min15s80, about a 66% latency reduction.
This commit is contained in:
@ -45,6 +45,7 @@
|
||||
#include "util/defer_op.h"
|
||||
#include "util/simd/bits.h"
|
||||
#include "vec/aggregate_functions/aggregate_function.h"
|
||||
#include "vec/aggregate_functions/helpers.h"
|
||||
#include "vec/columns/column.h"
|
||||
#include "vec/columns/column_array.h"
|
||||
#include "vec/columns/column_nullable.h"
|
||||
@ -56,6 +57,7 @@
|
||||
#include "vec/common/field_visitors.h"
|
||||
#include "vec/common/schema_util.h"
|
||||
#include "vec/common/string_buffer.hpp"
|
||||
#include "vec/common/string_ref.h"
|
||||
#include "vec/core/column_with_type_and_name.h"
|
||||
#include "vec/core/field.h"
|
||||
#include "vec/core/types.h"
|
||||
@ -68,6 +70,7 @@
|
||||
#include "vec/data_types/data_type_nothing.h"
|
||||
#include "vec/data_types/data_type_nullable.h"
|
||||
#include "vec/data_types/get_least_supertype.h"
|
||||
#include "vec/json/path_in_data.h"
|
||||
|
||||
#ifdef __AVX2__
|
||||
#include "util/jsonb_parser_simd.h"
|
||||
@ -78,23 +81,22 @@
|
||||
namespace doris::vectorized {
|
||||
namespace {
|
||||
|
||||
DataTypePtr create_array_of_type(DataTypePtr type, size_t num_dimensions, bool is_nullable) {
|
||||
const DataTypeNullable* nullable = typeid_cast<const DataTypeNullable*>(type.get());
|
||||
if ((nullable &&
|
||||
typeid_cast<const ColumnObject::MostCommonType*>(nullable->get_nested_type().get())) ||
|
||||
typeid_cast<const ColumnObject::MostCommonType*>(type.get())) {
|
||||
DataTypePtr create_array_of_type(TypeIndex type, size_t num_dimensions, bool is_nullable) {
|
||||
if (type == ColumnObject::MOST_COMMON_TYPE_ID) {
|
||||
// JSONB type MUST NOT be wrapped in an ARRAY column; it has to stay at the top level.
|
||||
// So we ignored num_dimensions.
|
||||
return type;
|
||||
return is_nullable ? make_nullable(std::make_shared<ColumnObject::MostCommonType>())
|
||||
: std::make_shared<ColumnObject::MostCommonType>();
|
||||
}
|
||||
DataTypePtr result = DataTypeFactory::instance().create_data_type(type, is_nullable);
|
||||
for (size_t i = 0; i < num_dimensions; ++i) {
|
||||
type = std::make_shared<DataTypeArray>(std::move(type));
|
||||
result = std::make_shared<DataTypeArray>(result);
|
||||
if (is_nullable) {
|
||||
// wrap array with nullable
|
||||
type = make_nullable(type);
|
||||
result = make_nullable(result);
|
||||
}
|
||||
}
|
||||
return type;
|
||||
return result;
|
||||
}
|
||||
|
||||
DataTypePtr get_base_type_of_array(const DataTypePtr& type) {
|
||||
@ -149,6 +151,63 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
// Visitor that allows to get type of scalar field
|
||||
// but exclude fields contain complex field.This is a faster version
|
||||
// for FieldVisitorToScalarType which does not support complex field.
|
||||
class SimpleFieldVisitorToScalarType : public StaticVisitor<size_t> {
|
||||
public:
|
||||
size_t operator()(const Array& x) {
|
||||
throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Array type is not supported");
|
||||
}
|
||||
size_t operator()(const UInt64& x) {
|
||||
if (x <= std::numeric_limits<Int8>::max()) {
|
||||
type = TypeIndex::Int8;
|
||||
} else if (x <= std::numeric_limits<Int16>::max()) {
|
||||
type = TypeIndex::Int16;
|
||||
} else if (x <= std::numeric_limits<Int32>::max()) {
|
||||
type = TypeIndex::Int32;
|
||||
} else {
|
||||
type = TypeIndex::Int64;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
size_t operator()(const Int64& x) {
|
||||
if (x <= std::numeric_limits<Int8>::max() && x >= std::numeric_limits<Int8>::min()) {
|
||||
type = TypeIndex::Int8;
|
||||
} else if (x <= std::numeric_limits<Int16>::max() &&
|
||||
x >= std::numeric_limits<Int16>::min()) {
|
||||
type = TypeIndex::Int16;
|
||||
} else if (x <= std::numeric_limits<Int32>::max() &&
|
||||
x >= std::numeric_limits<Int32>::min()) {
|
||||
type = TypeIndex::Int32;
|
||||
} else {
|
||||
type = TypeIndex::Int64;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
size_t operator()(const JsonbField& x) {
|
||||
type = TypeIndex::JSONB;
|
||||
return 1;
|
||||
}
|
||||
size_t operator()(const Null&) {
|
||||
have_nulls = true;
|
||||
return 1;
|
||||
}
|
||||
template <typename T>
|
||||
size_t operator()(const T&) {
|
||||
type = TypeId<NearestFieldType<T>>::value;
|
||||
return 1;
|
||||
}
|
||||
void get_scalar_type(TypeIndex* data_type) const { *data_type = type; }
|
||||
bool contain_nulls() const { return have_nulls; }
|
||||
|
||||
bool need_convert_field() const { return false; }
|
||||
|
||||
private:
|
||||
TypeIndex type = TypeIndex::Nothing;
|
||||
bool have_nulls;
|
||||
};
|
||||
|
||||
/// Visitor that allows to get type of scalar field
|
||||
/// or least common type of scalars in array.
|
||||
/// More optimized version of FieldToDataType.
|
||||
@ -208,8 +267,10 @@ public:
|
||||
type_indexes.insert(TypeId<NearestFieldType<T>>::value);
|
||||
return 0;
|
||||
}
|
||||
void get_scalar_type(DataTypePtr* type) const {
|
||||
get_least_supertype<LeastSupertypeOnError::Jsonb>(type_indexes, type);
|
||||
void get_scalar_type(TypeIndex* type) const {
|
||||
DataTypePtr data_type;
|
||||
get_least_supertype<LeastSupertypeOnError::Jsonb>(type_indexes, &data_type);
|
||||
*type = data_type->get_type_id();
|
||||
}
|
||||
bool contain_nulls() const { return have_nulls; }
|
||||
bool need_convert_field() const { return field_types.size() > 1; }
|
||||
@ -221,20 +282,30 @@ private:
|
||||
};
|
||||
|
||||
} // namespace
|
||||
void get_field_info(const Field& field, FieldInfo* info) {
|
||||
FieldVisitorToScalarType to_scalar_type_visitor;
|
||||
|
||||
template <typename Visitor>
|
||||
void get_field_info_impl(const Field& field, FieldInfo* info) {
|
||||
Visitor to_scalar_type_visitor;
|
||||
apply_visitor(to_scalar_type_visitor, field);
|
||||
DataTypePtr type = nullptr;
|
||||
to_scalar_type_visitor.get_scalar_type(&type);
|
||||
TypeIndex type_id;
|
||||
to_scalar_type_visitor.get_scalar_type(&type_id);
|
||||
// array items' dimensions may mismatch, e.g. [1, 2, [1, 2, 3]]
|
||||
*info = {
|
||||
type,
|
||||
type_id,
|
||||
to_scalar_type_visitor.contain_nulls(),
|
||||
to_scalar_type_visitor.need_convert_field(),
|
||||
apply_visitor(FieldVisitorToNumberOfDimensions(), field),
|
||||
};
|
||||
}
|
||||
|
||||
void get_field_info(const Field& field, FieldInfo* info) {
|
||||
if (field.is_complex_field()) {
|
||||
get_field_info_impl<FieldVisitorToScalarType>(field, info);
|
||||
} else {
|
||||
get_field_info_impl<SimpleFieldVisitorToScalarType>(field, info);
|
||||
}
|
||||
}
|
||||
|
||||
ColumnObject::Subcolumn::Subcolumn(MutableColumnPtr&& data_, DataTypePtr type, bool is_nullable_,
|
||||
bool is_root_)
|
||||
: least_common_type(type), is_nullable(is_nullable_), is_root(is_root_) {
|
||||
@ -285,8 +356,8 @@ void ColumnObject::Subcolumn::add_new_column_part(DataTypePtr type) {
|
||||
}
|
||||
|
||||
void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) {
|
||||
auto base_type = std::move(info.scalar_type);
|
||||
if (is_nothing(base_type)) {
|
||||
auto base_type = WhichDataType(info.scalar_type_id);
|
||||
if (base_type.is_nothing()) {
|
||||
insertDefault();
|
||||
return;
|
||||
}
|
||||
@ -295,7 +366,7 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) {
|
||||
if (is_nothing(least_common_type.get_base())) {
|
||||
column_dim = value_dim;
|
||||
}
|
||||
if (is_nothing(base_type)) {
|
||||
if (base_type.is_nothing()) {
|
||||
value_dim = column_dim;
|
||||
}
|
||||
bool type_changed = false;
|
||||
@ -305,29 +376,30 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) {
|
||||
"Dimension of types mismatched between inserted value and column, "
|
||||
"expected:{}, but meet:{} for type:{}",
|
||||
column_dim, value_dim, least_common_type.get()->get_name());
|
||||
base_type = std::make_shared<MostCommonType>();
|
||||
base_type = MOST_COMMON_TYPE_ID;
|
||||
value_dim = 0;
|
||||
type_changed = true;
|
||||
}
|
||||
if (is_nullable && !is_nothing(base_type)) {
|
||||
base_type = make_nullable(base_type);
|
||||
}
|
||||
|
||||
const auto& least_common_base_type = least_common_type.get_base();
|
||||
if (data.empty()) {
|
||||
add_new_column_part(create_array_of_type(std::move(base_type), value_dim, is_nullable));
|
||||
} else if (!least_common_base_type->equals(*base_type) && !is_nothing(base_type)) {
|
||||
if (!schema_util::is_conversion_required_between_integers(*base_type,
|
||||
*least_common_base_type)) {
|
||||
add_new_column_part(create_array_of_type(base_type.idx, value_dim, is_nullable));
|
||||
} else if (least_common_type.get_type_id() != base_type.idx && !base_type.is_nothing()) {
|
||||
if (schema_util::is_conversion_required_between_integers(base_type.idx,
|
||||
least_common_type.get_type_id())) {
|
||||
LOG_EVERY_N(INFO, 100) << "Conversion between " << getTypeName(base_type.idx) << " and "
|
||||
<< getTypeName(least_common_type.get_type_id());
|
||||
DataTypePtr base_data_type;
|
||||
TypeIndex base_data_type_id;
|
||||
get_least_supertype<LeastSupertypeOnError::Jsonb>(
|
||||
DataTypes {std::move(base_type), least_common_base_type}, &base_type);
|
||||
TypeIndexSet {base_type.idx, least_common_type.get_base_type_id()},
|
||||
&base_data_type);
|
||||
type_changed = true;
|
||||
base_data_type_id = base_data_type->get_type_id();
|
||||
if (is_nullable) {
|
||||
base_type = make_nullable(base_type);
|
||||
base_data_type = make_nullable(base_data_type);
|
||||
}
|
||||
if (!least_common_base_type->equals(*base_type)) {
|
||||
if (!least_common_type.get_base()->equals(*base_data_type)) {
|
||||
add_new_column_part(
|
||||
create_array_of_type(std::move(base_type), value_dim, is_nullable));
|
||||
create_array_of_type(base_data_type_id, value_dim, is_nullable));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -578,6 +650,14 @@ ColumnObject::Subcolumn::LeastCommonType::LeastCommonType(DataTypePtr type_)
|
||||
if (!WhichDataType(type).is_nothing()) {
|
||||
least_common_type_serder = type->get_serde();
|
||||
}
|
||||
type_id = type->is_nullable() ? assert_cast<const DataTypeNullable*>(type.get())
|
||||
->get_nested_type()
|
||||
->get_type_id()
|
||||
: type->get_type_id();
|
||||
base_type_id = base_type->is_nullable() ? assert_cast<const DataTypeNullable*>(base_type.get())
|
||||
->get_nested_type()
|
||||
->get_type_id()
|
||||
: base_type->get_type_id();
|
||||
}
|
||||
|
||||
ColumnObject::ColumnObject(bool is_nullable_, bool create_root_)
|
||||
@ -677,14 +757,12 @@ void ColumnObject::try_insert(const Field& field) {
|
||||
return;
|
||||
}
|
||||
const auto& object = field.get<const VariantMap&>();
|
||||
phmap::flat_hash_set<std::string> inserted;
|
||||
size_t old_size = size();
|
||||
for (const auto& [key_str, value] : object) {
|
||||
PathInData key;
|
||||
if (!key_str.empty()) {
|
||||
key = PathInData(key_str);
|
||||
}
|
||||
inserted.insert(key_str);
|
||||
if (!has_subcolumn(key)) {
|
||||
bool succ = add_sub_column(key, old_size);
|
||||
if (!succ) {
|
||||
@ -700,7 +778,7 @@ void ColumnObject::try_insert(const Field& field) {
|
||||
subcolumn->insert(value);
|
||||
}
|
||||
for (auto& entry : subcolumns) {
|
||||
if (!inserted.contains(entry->path.get_path())) {
|
||||
if (old_size == entry->data.size()) {
|
||||
entry->data.insertDefault();
|
||||
}
|
||||
}
|
||||
@ -749,16 +827,6 @@ Status ColumnObject::try_insert_indices_from(const IColumn& src, const int* indi
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
FieldInfo ColumnObject::Subcolumn::get_subcolumn_field_info() const {
|
||||
const auto& base_type = least_common_type.get_base();
|
||||
return FieldInfo {
|
||||
.scalar_type = base_type,
|
||||
.have_nulls = base_type->is_nullable(),
|
||||
.need_convert = false,
|
||||
.num_dimensions = least_common_type.get_dimensions(),
|
||||
};
|
||||
}
|
||||
|
||||
void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t length) {
|
||||
#ifndef NDEBUG
|
||||
check_consistency();
|
||||
@ -809,6 +877,33 @@ const ColumnObject::Subcolumn* ColumnObject::get_subcolumn(const PathInData& key
|
||||
return &node->data;
|
||||
}
|
||||
|
||||
// Looks up the subcolumn for `key`, using `key_index` (the position of the key
// inside the current object) as a hint into the per-position cache.
// The order of fields is almost always the same from row to row, so the slot
// remembered for this position is checked first; only on a miss is the regular
// lookup performed, and a successful result is then remembered for that position.
// Returns nullptr if no such subcolumn exists.
const ColumnObject::Subcolumn* ColumnObject::get_subcolumn_with_cache(const PathInData& key,
                                                                      size_t key_index) const {
    const bool slot_exists = key_index < _prev_positions.size();
    if (slot_exists) {
        const auto& [cached_path, cached_column] = _prev_positions[key_index];
        if (cached_column != nullptr && key == cached_path) {
            return cached_column;
        }
    }

    // Cache miss: fall back to the regular lookup.
    const Subcolumn* found = get_subcolumn(key);

    // Grow the cache so that the slot for this position exists, even on a failed lookup.
    if (!slot_exists) {
        _prev_positions.resize(key_index + 1);
    }
    // Only successful lookups overwrite the slot.
    if (found != nullptr) {
        _prev_positions[key_index] = {key, found};
    }
    return found;
}
|
||||
|
||||
// Mutable lookup with a position hint; delegates to the cached const lookup
// and strips constness from the result. Returns nullptr if not found.
ColumnObject::Subcolumn* ColumnObject::get_subcolumn(const PathInData& key, size_t key_index) {
    const Subcolumn* located = get_subcolumn_with_cache(key, key_index);
    return const_cast<Subcolumn*>(located);
}
|
||||
|
||||
// Read-only lookup with a position hint; forwards to the cached lookup.
// Returns nullptr if not found.
const ColumnObject::Subcolumn* ColumnObject::get_subcolumn(const PathInData& key,
                                                           size_t key_index) const {
    const Subcolumn* located = get_subcolumn_with_cache(key, key_index);
    return located;
}
|
||||
|
||||
ColumnObject::Subcolumn* ColumnObject::get_subcolumn(const PathInData& key) {
|
||||
const auto* node = subcolumns.find_leaf(key);
|
||||
if (node == nullptr) {
|
||||
@ -1238,6 +1333,7 @@ void ColumnObject::finalize(bool ignore_sparse) {
|
||||
}
|
||||
std::swap(subcolumns, new_subcolumns);
|
||||
doc_structure = nullptr;
|
||||
_prev_positions.clear();
|
||||
}
|
||||
|
||||
void ColumnObject::finalize() {
|
||||
@ -1356,6 +1452,7 @@ void ColumnObject::clear() {
|
||||
Subcolumns empty;
|
||||
std::swap(empty, subcolumns);
|
||||
num_rows = 0;
|
||||
_prev_positions.clear();
|
||||
}
|
||||
|
||||
void ColumnObject::revise_to(int target_num_rows) {
|
||||
|
||||
@ -35,6 +35,7 @@
|
||||
|
||||
#include "common/status.h"
|
||||
#include "olap/tablet_schema.h"
|
||||
#include "util/jsonb_document.h"
|
||||
#include "vec/columns/column.h"
|
||||
#include "vec/columns/subcolumn_tree.h"
|
||||
#include "vec/common/cow.h"
|
||||
@ -62,8 +63,8 @@ namespace doris::vectorized {
|
||||
/// It allows to recreate field with different number
|
||||
/// of dimensions or nullability.
|
||||
struct FieldInfo {
|
||||
/// The common type of all scalars in field.
|
||||
DataTypePtr scalar_type;
|
||||
/// The common type id of all scalars in field.
|
||||
TypeIndex scalar_type_id;
|
||||
/// Do we have NULL scalar in field.
|
||||
bool have_nulls;
|
||||
/// If true then we have scalars with different types in array and
|
||||
@ -72,6 +73,7 @@ struct FieldInfo {
|
||||
/// Number of dimension in array. 0 if field is scalar.
|
||||
size_t num_dimensions;
|
||||
};
|
||||
|
||||
void get_field_info(const Field& field, FieldInfo* info);
|
||||
/** A column that represents object with dynamic set of subcolumns.
|
||||
* Subcolumns are identified by paths in document and are stored in
|
||||
@ -91,6 +93,7 @@ public:
|
||||
|
||||
// Using jsonb type as most common type, since it's adopted all types of json
|
||||
using MostCommonType = DataTypeJsonb;
|
||||
constexpr static TypeIndex MOST_COMMON_TYPE_ID = TypeIndex::JSONB;
|
||||
class Subcolumn {
|
||||
public:
|
||||
Subcolumn() = default;
|
||||
@ -147,8 +150,6 @@ public:
|
||||
/// Returns last inserted field.
|
||||
Field get_last_field() const;
|
||||
|
||||
FieldInfo get_subcolumn_field_info() const;
|
||||
|
||||
/// Returns single column if subcolumn in finalizes.
|
||||
/// Otherwise -- undefined behaviour.
|
||||
IColumn& get_finalized_column();
|
||||
@ -176,6 +177,10 @@ public:
|
||||
|
||||
const DataTypePtr& get_base() const { return base_type; }
|
||||
|
||||
/// Type id cached by the constructor for `type`; for a nullable `type` it is
/// the id of the nested type.
const TypeIndex& get_type_id() const { return type_id; }
|
||||
|
||||
/// Type id cached by the constructor for `base_type`; for a nullable
/// `base_type` it is the id of the nested type.
const TypeIndex& get_base_type_id() const { return base_type_id; }
|
||||
|
||||
size_t get_dimensions() const { return num_dimensions; }
|
||||
|
||||
void remove_nullable() { type = doris::vectorized::remove_nullable(type); }
|
||||
@ -185,6 +190,8 @@ public:
|
||||
private:
|
||||
DataTypePtr type;
|
||||
DataTypePtr base_type;
|
||||
TypeIndex type_id;
|
||||
TypeIndex base_type_id;
|
||||
size_t num_dimensions = 0;
|
||||
DataTypeSerDeSPtr least_common_type_serder;
|
||||
};
|
||||
@ -227,6 +234,10 @@ private:
|
||||
// used for quickly row store encoding
|
||||
ColumnPtr rowstore_column;
|
||||
|
||||
using SubColumnWithName = std::pair<PathInData, const Subcolumn*>;
|
||||
// Cached search results for previous row (keyed as index in JSON object) - used as a hint.
|
||||
mutable std::vector<SubColumnWithName> _prev_positions;
|
||||
|
||||
public:
|
||||
static constexpr auto COLUMN_NAME_DUMMY = "_dummy";
|
||||
|
||||
@ -289,6 +300,9 @@ public:
|
||||
// return null if not found
|
||||
const Subcolumn* get_subcolumn(const PathInData& key) const;
|
||||
|
||||
// return null if not found
|
||||
const Subcolumn* get_subcolumn(const PathInData& key, size_t index_hint) const;
|
||||
|
||||
/** More efficient methods of manipulation */
|
||||
[[noreturn]] IColumn& get_data() {
|
||||
LOG(FATAL) << "Not implemented method get_data()";
|
||||
@ -302,6 +316,12 @@ public:
|
||||
// return null if not found
|
||||
Subcolumn* get_subcolumn(const PathInData& key);
|
||||
|
||||
// return null if not found
|
||||
Subcolumn* get_subcolumn(const PathInData& key, size_t index_hint);
|
||||
|
||||
// return null if not found
|
||||
const Subcolumn* get_subcolumn_with_cache(const PathInData& key, size_t index_hint) const;
|
||||
|
||||
void incr_num_rows() { ++num_rows; }
|
||||
|
||||
void incr_num_rows(size_t n) { num_rows += n; }
|
||||
|
||||
@ -109,36 +109,41 @@ Array create_empty_array_field(size_t num_dimensions) {
|
||||
return array;
|
||||
}
|
||||
|
||||
bool is_conversion_required_between_integers(const IDataType& lhs, const IDataType& rhs) {
|
||||
// Returns the width in bytes of the integer type identified by `type`.
// Any other TypeIndex is logged at FATAL severity; 0 is the value returned
// if control gets past that log statement.
size_t get_size_of_interger(TypeIndex type) {
    size_t width = 0;
    switch (type) {
    case TypeIndex::Int8:
        width = sizeof(int8_t);
        break;
    case TypeIndex::Int16:
        width = sizeof(int16_t);
        break;
    case TypeIndex::Int32:
        width = sizeof(int32_t);
        break;
    case TypeIndex::Int64:
        width = sizeof(int64_t);
        break;
    case TypeIndex::Int128:
        width = sizeof(int128_t);
        break;
    case TypeIndex::UInt8:
        width = sizeof(uint8_t);
        break;
    case TypeIndex::UInt16:
        width = sizeof(uint16_t);
        break;
    case TypeIndex::UInt32:
        width = sizeof(uint32_t);
        break;
    case TypeIndex::UInt64:
        width = sizeof(uint64_t);
        break;
    case TypeIndex::UInt128:
        width = sizeof(uint128_t);
        break;
    default:
        LOG(FATAL) << "Unknown integer type: " << getTypeName(type);
        break;
    }
    return width;
}
|
||||
|
||||
bool is_conversion_required_between_integers(const TypeIndex& lhs, const TypeIndex& rhs) {
|
||||
WhichDataType which_lhs(lhs);
|
||||
WhichDataType which_rhs(rhs);
|
||||
bool is_native_int = which_lhs.is_native_int() && which_rhs.is_native_int();
|
||||
bool is_native_uint = which_lhs.is_native_uint() && which_rhs.is_native_uint();
|
||||
return (is_native_int || is_native_uint) &&
|
||||
lhs.get_size_of_value_in_memory() <= rhs.get_size_of_value_in_memory();
|
||||
}
|
||||
|
||||
bool is_conversion_required_between_integers(FieldType lhs, FieldType rhs) {
|
||||
// We only support signed integers for semi-structure data at present
|
||||
// TODO add unsigned integers
|
||||
if (lhs == FieldType::OLAP_FIELD_TYPE_BIGINT) {
|
||||
return !(rhs == FieldType::OLAP_FIELD_TYPE_TINYINT ||
|
||||
rhs == FieldType::OLAP_FIELD_TYPE_SMALLINT ||
|
||||
rhs == FieldType::OLAP_FIELD_TYPE_INT || rhs == FieldType::OLAP_FIELD_TYPE_BIGINT);
|
||||
}
|
||||
if (lhs == FieldType::OLAP_FIELD_TYPE_INT) {
|
||||
return !(rhs == FieldType::OLAP_FIELD_TYPE_TINYINT ||
|
||||
rhs == FieldType::OLAP_FIELD_TYPE_SMALLINT ||
|
||||
rhs == FieldType::OLAP_FIELD_TYPE_INT);
|
||||
}
|
||||
if (lhs == FieldType::OLAP_FIELD_TYPE_SMALLINT) {
|
||||
return !(rhs == FieldType::OLAP_FIELD_TYPE_TINYINT ||
|
||||
rhs == FieldType::OLAP_FIELD_TYPE_SMALLINT);
|
||||
}
|
||||
if (lhs == FieldType::OLAP_FIELD_TYPE_TINYINT) {
|
||||
return !(rhs == FieldType::OLAP_FIELD_TYPE_TINYINT);
|
||||
}
|
||||
return true;
|
||||
return (!is_native_int && !is_native_uint) ||
|
||||
get_size_of_interger(lhs) > get_size_of_interger(rhs);
|
||||
}
|
||||
|
||||
Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, ColumnPtr* result) {
|
||||
|
||||
@ -34,6 +34,7 @@
|
||||
#include "vec/columns/column_object.h"
|
||||
#include "vec/core/columns_with_type_and_name.h"
|
||||
#include "vec/core/field.h"
|
||||
#include "vec/core/types.h"
|
||||
#include "vec/data_types/data_type.h"
|
||||
#include "vec/json/path_in_data.h"
|
||||
|
||||
@ -66,8 +67,7 @@ Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, Co
|
||||
/// If both of types are signed/unsigned integers and size of left field type
|
||||
/// is less than right type, we don't need to convert field,
|
||||
/// because all integer fields are stored in Int64/UInt64.
|
||||
bool is_conversion_required_between_integers(const IDataType& lhs, const IDataType& rhs);
|
||||
bool is_conversion_required_between_integers(FieldType lhs, FieldType rhs);
|
||||
bool is_conversion_required_between_integers(const TypeIndex& lhs, const TypeIndex& rhs);
|
||||
|
||||
struct ExtraInfo {
|
||||
// -1 indicates it's not a Frontend generated column
|
||||
|
||||
@ -493,6 +493,11 @@ public:
|
||||
return *this;
|
||||
}
|
||||
|
||||
bool is_complex_field() const {
|
||||
return which == Types::Array || which == Types::Map || which == Types::Tuple ||
|
||||
which == Types::VariantMap;
|
||||
}
|
||||
|
||||
Field& operator=(Field&& rhs) {
|
||||
if (this != &rhs) {
|
||||
if (which != rhs.which) {
|
||||
|
||||
@ -148,36 +148,28 @@ void parse_json_to_variant(IColumn& column, const char* src, size_t length,
|
||||
}
|
||||
auto& [paths, values] = *result;
|
||||
assert(paths.size() == values.size());
|
||||
phmap::flat_hash_set<std::string> paths_set;
|
||||
size_t num_rows = column_object.size();
|
||||
size_t old_num_rows = column_object.size();
|
||||
for (size_t i = 0; i < paths.size(); ++i) {
|
||||
FieldInfo field_info;
|
||||
get_field_info(values[i], &field_info);
|
||||
if (is_nothing(field_info.scalar_type)) {
|
||||
if (WhichDataType(field_info.scalar_type_id).is_nothing()) {
|
||||
continue;
|
||||
}
|
||||
if (!paths_set.insert(paths[i].get_path()).second) {
|
||||
// return Status::DataQualityError(
|
||||
// fmt::format("Object has ambiguous path {}, {}", paths[i].get_path()));
|
||||
throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Object has ambiguous path {}",
|
||||
paths[i].get_path());
|
||||
if (column_object.get_subcolumn(paths[i], i) == nullptr) {
|
||||
column_object.add_sub_column(paths[i], old_num_rows);
|
||||
}
|
||||
|
||||
if (!column_object.has_subcolumn(paths[i])) {
|
||||
column_object.add_sub_column(paths[i], num_rows);
|
||||
}
|
||||
auto* subcolumn = column_object.get_subcolumn(paths[i]);
|
||||
auto* subcolumn = column_object.get_subcolumn(paths[i], i);
|
||||
if (!subcolumn) {
|
||||
throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to find sub column {}",
|
||||
paths[i].get_path());
|
||||
}
|
||||
assert(subcolumn->size() == num_rows);
|
||||
DCHECK_EQ(subcolumn->size(), old_num_rows);
|
||||
subcolumn->insert(std::move(values[i]), std::move(field_info));
|
||||
}
|
||||
// /// Insert default values to missed subcolumns.
|
||||
const auto& subcolumns = column_object.get_subcolumns();
|
||||
for (const auto& entry : subcolumns) {
|
||||
if (!paths_set.contains(entry->path.get_path())) {
|
||||
if (entry->data.size() == old_num_rows) {
|
||||
entry->data.insertDefault();
|
||||
}
|
||||
}
|
||||
|
||||
36
regression-test/suites/variant_p2/performance.groovy
Normal file
36
regression-test/suites/variant_p2/performance.groovy
Normal file
@ -0,0 +1,36 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Load-performance regression suite (group "p2") for the variant column type.
// Step 1 creates table var_perf with a bigint key `k` and a variant column `v`.
// Step 2 inserts two batches of 10,000,000 generated rows each; every row carries
// a constant JSON document with keys field1..field100 and integer values.
// The suite only runs the statements; it asserts nothing about the result.
suite("regression_test_variant_performance", "p2"){
|
||||
sql """CREATE TABLE IF NOT EXISTS var_perf (
|
||||
k bigint,
|
||||
v variant
|
||||
|
||||
)
|
||||
DUPLICATE KEY(`k`)
|
||||
DISTRIBUTED BY RANDOM BUCKETS 4
|
||||
properties("replication_num" = "1", "disable_auto_compaction" = "false");
|
||||
"""
|
||||
// The two halves of the UNION ALL use the same 100 keys. Values in the first
// document stay at or below 1023, while most values in the second are six or
// seven digits long.
// NOTE(review): presumably this exercises integer type widening inside the
// variant subcolumns -- confirm against the intent of the benchmark.
sql """
|
||||
insert into var_perf
|
||||
SELECT *, '{"field1":348,"field2":596,"field3":781,"field4":41,"field5":922,"field6":84,"field7":222,"field8":312,"field9":490,"field10":715,"field11":837,"field12":753,"field13":171,"field14":727,"field15":739,"field16":545,"field17":964,"field18":540,"field19":685,"field20":828,"field21":157,"field22":404,"field23":287,"field24":481,"field25":476,"field26":559,"field27":144,"field28":545,"field29":70,"field30":668,"field31":820,"field32":193,"field33":465,"field34":347,"field35":898,"field36":705,"field37":754,"field38":866,"field39":752,"field40":303,"field41":214,"field42":41,"field43":609,"field44":487,"field45":832,"field46":832,"field47":134,"field48":964,"field49":919,"field50":670,"field51":767,"field52":334,"field53":506,"field54":838,"field55":510,"field56":770,"field57":168,"field58":701,"field59":961,"field60":927,"field61":375,"field62":939,"field63":464,"field64":420,"field65":212,"field66":882,"field67":344,"field68":724,"field69":997,"field70":198,"field71":739,"field72":628,"field73":563,"field74":979,"field75":563,"field76":891,"field77":496,"field78":442,"field79":847,"field80":771,"field81":229,"field82":1023,"field83":184,"field84":563,"field85":980,"field86":191,"field87":426,"field88":527,"field89":945,"field90":552,"field91":454,"field92":728,"field93":631,"field94":191,"field95":148,"field96":679,"field97":955,"field98":934,"field99":258,"field100":442}'
|
||||
from numbers("number" = "10000000")
|
||||
union all
|
||||
SELECT *, '{"field1":201,"field2":465,"field3":977,"field4":101112,"field5":131415,"field6":216,"field7":192021,"field8":822324,"field9":525627,"field10":928930,"field11":413233,"field12":243536,"field13":373839,"field14":404142,"field15":434445,"field16":1464748,"field17":495051,"field18":525354,"field19":565657,"field20":1585960,"field21":616263,"field22":646566,"field23":676869,"field24":707172,"field25":737475,"field26":767778,"field27":798081,"field28":828384,"field29":858687,"field30":888990,"field31":919293,"field32":949596,"field33":979899,"field34":100101,"field35":103104,"field36":106107,"field37":109110,"field38":112113,"field39":115116,"field40":118119,"field41":121122,"field42":124125,"field43":127128,"field44":130131,"field45":133134,"field46":136137,"field47":139140,"field48":142143,"field49":145146,"field50":148149,"field51":151152,"field52":154155,"field53":157158,"field54":160161,"field55":163164,"field56":166167,"field57":169170,"field58":172173,"field59":175176,"field60":178179,"field61":181182,"field62":184185,"field63":187188,"field64":190191,"field65":193194,"field66":196197,"field67":199200,"field68":202203,"field69":205206,"field70":208209,"field71":211212,"field72":214215,"field73":217218,"field74":220221,"field75":223224,"field76":226227,"field77":229230,"field78":232233,"field79":235236,"field80":238239,"field81":241242,"field82":244245,"field83":247248,"field84":250251,"field85":253254,"field86":256257,"field87":259260,"field88":262263,"field89":265266,"field90":268269,"field91":271272,"field92":274275,"field93":277278,"field94":280281,"field95":283284,"field96":286287,"field97":289290,"field98":292293,"field99":295296,"field100":298299}'
|
||||
from numbers("number" = "10000000")
|
||||
"""
|
||||
}
|
||||
Reference in New Issue
Block a user