[performance](variant) support topn 2phase read for variant column (#28318)
[performance](variant) support topn 2phase read for variant column
This commit is contained in:
@ -229,7 +229,7 @@ public:
|
||||
private:
|
||||
Status extract_to(vectorized::MutableColumnPtr& dst, size_t nrows);
|
||||
|
||||
const TabletColumn& _col;
|
||||
TabletColumn _col;
|
||||
// may shared among different column iterators
|
||||
std::unique_ptr<StreamReader> _root_reader;
|
||||
};
|
||||
|
||||
@ -61,10 +61,12 @@
|
||||
#include "util/slice.h" // Slice
|
||||
#include "vec/columns/column.h"
|
||||
#include "vec/common/string_ref.h"
|
||||
#include "vec/core/field.h"
|
||||
#include "vec/data_types/data_type.h"
|
||||
#include "vec/data_types/data_type_factory.hpp"
|
||||
#include "vec/data_types/data_type_nullable.h"
|
||||
#include "vec/data_types/data_type_object.h"
|
||||
#include "vec/json/path_in_data.h"
|
||||
#include "vec/olap/vgeneric_iterators.h"
|
||||
|
||||
namespace doris {
|
||||
@ -332,17 +334,18 @@ Status Segment::_load_index_impl() {
|
||||
|
||||
// Return the storage datatype of related column to field.
|
||||
// Returns nullptr when there is no such storage information for this column
|
||||
vectorized::DataTypePtr Segment::get_data_type_of(const Field& field, bool ignore_children) const {
|
||||
vectorized::DataTypePtr Segment::get_data_type_of(vectorized::PathInData path, bool is_nullable,
|
||||
bool ignore_children) const {
|
||||
// Path has higher priority
|
||||
if (!field.path().empty()) {
|
||||
auto node = _sub_column_tree.find_leaf(field.path());
|
||||
if (!path.empty()) {
|
||||
auto node = _sub_column_tree.find_leaf(path);
|
||||
if (node) {
|
||||
if (ignore_children || node->children.empty()) {
|
||||
return node->data.file_column_type;
|
||||
}
|
||||
}
|
||||
// it contains children or column missing in storage, so treat it as variant
|
||||
return field.is_nullable()
|
||||
return is_nullable
|
||||
? vectorized::make_nullable(std::make_shared<vectorized::DataTypeObject>())
|
||||
: std::make_shared<vectorized::DataTypeObject>();
|
||||
}
|
||||
@ -686,7 +689,8 @@ Status Segment::read_key_by_rowid(uint32_t row_id, std::string* key) {
|
||||
|
||||
bool Segment::same_with_storage_type(int32_t cid, const Schema& schema,
|
||||
bool ignore_children) const {
|
||||
auto file_column_type = get_data_type_of(*schema.column(cid), ignore_children);
|
||||
auto file_column_type = get_data_type_of(schema.column(cid)->path(),
|
||||
schema.column(cid)->is_nullable(), ignore_children);
|
||||
auto expected_type = Schema::get_data_type_ptr(*schema.column(cid));
|
||||
#ifndef NDEBUG
|
||||
if (file_column_type && !file_column_type->equals(*expected_type)) {
|
||||
@ -700,5 +704,58 @@ bool Segment::same_with_storage_type(int32_t cid, const Schema& schema,
|
||||
return same;
|
||||
}
|
||||
|
||||
// Reads the single row `row_id` of the column described by `slot` from this
// segment and appends the value to `result`.
//
// `iterator_hint` lets callers reuse a column iterator across calls: when it is
// null a new iterator is created and initialised here, otherwise the supplied
// iterator is used as-is.
//
// Slots with non-empty column_paths() are read as materialized variant
// sub-columns: the value is first read into a temporary column of the storage
// type and then inserted into `result` as a Field. All other slots are read
// directly into `result`.
//
// Returns a non-OK Status if the column cannot be resolved in `schema`, if no
// storage type can be determined for a variant path, or if creating,
// initialising or reading the iterator fails.
Status Segment::seek_and_read_by_rowid(const TabletSchema& schema, SlotDescriptor* slot,
                                       uint32_t row_id, vectorized::MutableColumnPtr& result,
                                       OlapReaderStatistics& stats,
                                       std::unique_ptr<ColumnIterator>& iterator_hint) {
    StorageReadOptions storage_read_opt;
    storage_read_opt.io_ctx.reader_type = ReaderType::READER_QUERY;
    segment_v2::ColumnIteratorOptions opt {
            .use_page_cache = !config::disable_storage_page_cache,
            .file_reader = file_reader().get(),
            .stats = &stats,
            .io_ctx = io::IOContext {.reader_type = ReaderType::READER_QUERY},
    };
    std::vector<segment_v2::rowid_t> single_row_loc {row_id};
    if (!slot->column_paths().empty()) {
        vectorized::PathInData path(schema.column_by_uid(slot->col_unique_id()).name_lower_case(),
                                    slot->column_paths());
        auto storage_type = get_data_type_of(path, slot->is_nullable(), false);
        // Validate the type before its first use; the check used to come after
        // the dereference below and therefore could never fire.
        DCHECK(storage_type != nullptr);
        if (storage_type == nullptr) {
            return Status::InternalError("storage type not found for path " + path.get_path());
        }
        vectorized::MutableColumnPtr file_storage_column = storage_type->create_column();
        TabletColumn column = TabletColumn::create_materialized_variant_column(
                schema.column_by_uid(slot->col_unique_id()).name_lower_case(), slot->column_paths(),
                slot->col_unique_id());
        if (iterator_hint == nullptr) {
            RETURN_IF_ERROR(new_column_iterator(column, &iterator_hint, &storage_read_opt));
            RETURN_IF_ERROR(iterator_hint->init(opt));
        }
        RETURN_IF_ERROR(
                iterator_hint->read_by_rowids(single_row_loc.data(), 1, file_storage_column));
        // Get its inner field, for JSONB case
        vectorized::Field field = remove_nullable(storage_type)->get_default();
        file_storage_column->get(0, field);
        result->insert(field);
    } else {
        // Resolve by unique id when the slot carries one, otherwise by name.
        int index = (slot->col_unique_id() >= 0) ? schema.field_index(slot->col_unique_id())
                                                 : schema.field_index(slot->col_name());
        if (index < 0) {
            std::stringstream ss;
            ss << "field name is invalid. field=" << slot->col_name()
               << ", field_name_to_index=" << schema.get_all_field_names();
            return Status::InternalError(ss.str());
        }
        if (iterator_hint == nullptr) {
            RETURN_IF_ERROR(
                    new_column_iterator(schema.column(index), &iterator_hint, &storage_read_opt));
            RETURN_IF_ERROR(iterator_hint->init(opt));
        }
        RETURN_IF_ERROR(iterator_hint->read_by_rowids(single_row_loc.data(), 1, result));
    }
    return Status::OK();
}
|
||||
|
||||
} // namespace segment_v2
|
||||
} // namespace doris
|
||||
|
||||
@ -39,11 +39,14 @@
|
||||
#include "olap/rowset/segment_v2/page_handle.h"
|
||||
#include "olap/schema.h"
|
||||
#include "olap/tablet_schema.h"
|
||||
#include "runtime/descriptors.h"
|
||||
#include "util/once.h"
|
||||
#include "util/slice.h"
|
||||
#include "vec/columns/column.h"
|
||||
#include "vec/columns/subcolumn_tree.h"
|
||||
#include "vec/data_types/data_type.h"
|
||||
#include "vec/data_types/data_type_nullable.h"
|
||||
#include "vec/json/path_in_data.h"
|
||||
|
||||
namespace doris {
|
||||
namespace vectorized {
|
||||
@ -123,6 +126,10 @@ public:
|
||||
|
||||
Status read_key_by_rowid(uint32_t row_id, std::string* key);
|
||||
|
||||
Status seek_and_read_by_rowid(const TabletSchema& schema, SlotDescriptor* slot, uint32_t row_id,
|
||||
vectorized::MutableColumnPtr& result, OlapReaderStatistics& stats,
|
||||
std::unique_ptr<ColumnIterator>& iterator_hint);
|
||||
|
||||
Status load_index();
|
||||
|
||||
Status load_pk_index_and_bf();
|
||||
@ -146,7 +153,8 @@ public:
|
||||
// ignore_children set to false will treat field as variant
|
||||
// when it contains children with field paths.
|
||||
// nullptr will be returned if storage type does not contain such column
|
||||
std::shared_ptr<const vectorized::IDataType> get_data_type_of(const Field& filed,
|
||||
std::shared_ptr<const vectorized::IDataType> get_data_type_of(vectorized::PathInData path,
|
||||
bool is_nullable,
|
||||
bool ignore_children) const;
|
||||
|
||||
// Check is schema read type equals storage column type
|
||||
@ -157,8 +165,8 @@ public:
|
||||
bool can_apply_predicate_safely(int cid, Predicate* pred, const Schema& schema,
|
||||
ReaderType read_type) const {
|
||||
const Field* col = schema.column(cid);
|
||||
vectorized::DataTypePtr storage_column_type =
|
||||
get_data_type_of(*col, read_type != ReaderType::READER_QUERY);
|
||||
vectorized::DataTypePtr storage_column_type = get_data_type_of(
|
||||
col->path(), col->is_nullable(), read_type != ReaderType::READER_QUERY);
|
||||
if (storage_column_type == nullptr) {
|
||||
// Default column iterator
|
||||
return true;
|
||||
|
||||
@ -342,7 +342,8 @@ Status SegmentIterator::_init_impl(const StorageReadOptions& opts) {
|
||||
const Field* col = _schema->column(i);
|
||||
if (col) {
|
||||
auto storage_type = _segment->get_data_type_of(
|
||||
*col, _opts.io_ctx.reader_type != ReaderType::READER_QUERY);
|
||||
col->path(), col->is_nullable(),
|
||||
_opts.io_ctx.reader_type != ReaderType::READER_QUERY);
|
||||
if (storage_type == nullptr) {
|
||||
storage_type = vectorized::DataTypeFactory::instance().create_data_type(*col);
|
||||
}
|
||||
|
||||
@ -251,8 +251,8 @@ private:
|
||||
if (block_cid >= block->columns()) {
|
||||
continue;
|
||||
}
|
||||
vectorized::DataTypePtr storage_type =
|
||||
_segment->get_data_type_of(*_schema->column(cid), false);
|
||||
vectorized::DataTypePtr storage_type = _segment->get_data_type_of(
|
||||
_schema->column(cid)->path(), _schema->column(cid)->is_nullable(), false);
|
||||
if (storage_type && !storage_type->equals(*block->get_by_position(block_cid).type)) {
|
||||
// Do additional cast
|
||||
vectorized::MutableColumnPtr tmp = storage_type->create_column();
|
||||
|
||||
@ -554,6 +554,20 @@ void TabletColumn::init_from_pb(const ColumnPB& column) {
|
||||
}
|
||||
}
|
||||
|
||||
// Builds a nullable VARIANT TabletColumn for the sub-column addressed by
// `root` followed by `paths`. The result has unique id -1, records
// `parent_unique_id` as its parent, and takes both its path info and its name
// from the combined path.
TabletColumn TabletColumn::create_materialized_variant_column(const std::string& root,
                                                              const std::vector<std::string>& paths,
                                                              int32_t parent_unique_id) {
    // Full path of the sub-column: root followed by each nested key.
    vectorized::PathInData full_path(root, paths);

    TabletColumn variant_subcolumn;
    variant_subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
    variant_subcolumn.set_is_nullable(true);
    variant_subcolumn.set_unique_id(-1);
    variant_subcolumn.set_parent_unique_id(parent_unique_id);
    variant_subcolumn.set_path_info(full_path);
    variant_subcolumn.set_name(full_path.get_path());
    return variant_subcolumn;
}
|
||||
|
||||
void TabletColumn::to_schema_pb(ColumnPB* column) const {
|
||||
column->set_unique_id(_unique_id);
|
||||
column->set_name(_col_name);
|
||||
|
||||
@ -36,6 +36,7 @@
|
||||
#include "gutil/stringprintf.h"
|
||||
#include "olap/olap_common.h"
|
||||
#include "runtime/define_primitive_type.h"
|
||||
#include "runtime/descriptors.h"
|
||||
#include "util/string_util.h"
|
||||
#include "vec/aggregate_functions/aggregate_function.h"
|
||||
#include "vec/common/string_utils/string_utils.h"
|
||||
@ -91,6 +92,11 @@ public:
|
||||
_type == FieldType::OLAP_FIELD_TYPE_QUANTILE_STATE ||
|
||||
_type == FieldType::OLAP_FIELD_TYPE_AGG_STATE;
|
||||
}
|
||||
// Such columns do not exist in frontend schema info, so we need to
|
||||
// add them into tablet_schema for later column indexing.
|
||||
static TabletColumn create_materialized_variant_column(const std::string& root,
|
||||
const std::vector<std::string>& paths,
|
||||
int32_t parent_unique_id);
|
||||
bool has_default_value() const { return _has_default_value; }
|
||||
std::string default_value() const { return _default_value; }
|
||||
size_t length() const { return _length; }
|
||||
|
||||
@ -85,6 +85,7 @@ SlotDescriptor::SlotDescriptor(const PSlotDescriptor& pdesc)
|
||||
_is_materialized(pdesc.is_materialized()),
|
||||
_is_key(pdesc.is_key()),
|
||||
_need_materialize(true),
|
||||
_column_paths(pdesc.column_paths().begin(), pdesc.column_paths().end()),
|
||||
_is_auto_increment(pdesc.is_auto_increment()) {}
|
||||
|
||||
void SlotDescriptor::to_protobuf(PSlotDescriptor* pslot) const {
|
||||
@ -103,6 +104,9 @@ void SlotDescriptor::to_protobuf(PSlotDescriptor* pslot) const {
|
||||
pslot->set_is_key(_is_key);
|
||||
pslot->set_is_auto_increment(_is_auto_increment);
|
||||
pslot->set_col_type(_col_type);
|
||||
for (const std::string& path : _column_paths) {
|
||||
pslot->add_column_paths(path);
|
||||
}
|
||||
}
|
||||
|
||||
vectorized::MutableColumnPtr SlotDescriptor::get_empty_mutable_column() const {
|
||||
|
||||
@ -25,6 +25,7 @@
|
||||
#include <butil/errno.h>
|
||||
#include <butil/iobuf.h>
|
||||
#include <fcntl.h>
|
||||
#include <fmt/core.h>
|
||||
#include <gen_cpp/MasterService_types.h>
|
||||
#include <gen_cpp/PaloInternalService_types.h>
|
||||
#include <gen_cpp/PlanNodes_types.h>
|
||||
@ -46,6 +47,7 @@
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
@ -73,6 +75,7 @@
|
||||
#include "olap/rowset/segment_v2/common.h"
|
||||
#include "olap/rowset/segment_v2/inverted_index_desc.h"
|
||||
#include "olap/rowset/segment_v2/segment.h"
|
||||
#include "olap/rowset/segment_v2/segment_iterator.h"
|
||||
#include "olap/segment_loader.h"
|
||||
#include "olap/storage_engine.h"
|
||||
#include "olap/tablet.h"
|
||||
@ -1711,6 +1714,38 @@ auto scope_timer_run(Func fn, int64_t* cost) -> decltype(fn()) {
|
||||
return res;
|
||||
}
|
||||
|
||||
struct IteratorKey {
|
||||
int64_t tablet_id;
|
||||
RowsetId rowset_id;
|
||||
uint64_t segment_id;
|
||||
int slot_id;
|
||||
|
||||
// unordered map std::equal_to
|
||||
bool operator==(const IteratorKey& rhs) const {
|
||||
return tablet_id == rhs.tablet_id && rowset_id == rhs.rowset_id &&
|
||||
segment_id == rhs.segment_id && slot_id == rhs.slot_id;
|
||||
}
|
||||
};
|
||||
|
||||
struct HashOfIteratorKey {
|
||||
size_t operator()(const IteratorKey& key) const {
|
||||
size_t seed = 0;
|
||||
seed = HashUtil::hash64(&key.tablet_id, sizeof(key.tablet_id), seed);
|
||||
seed = HashUtil::hash64(&key.rowset_id.hi, sizeof(key.rowset_id.hi), seed);
|
||||
seed = HashUtil::hash64(&key.rowset_id.mi, sizeof(key.rowset_id.mi), seed);
|
||||
seed = HashUtil::hash64(&key.rowset_id.lo, sizeof(key.rowset_id.lo), seed);
|
||||
seed = HashUtil::hash64(&key.segment_id, sizeof(key.segment_id), seed);
|
||||
seed = HashUtil::hash64(&key.slot_id, sizeof(key.slot_id), seed);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
|
||||
struct IteratorItem {
|
||||
std::unique_ptr<ColumnIterator> iterator;
|
||||
// for holding the reference of segment to avoid use after release
|
||||
SegmentSharedPtr segment;
|
||||
};
|
||||
|
||||
Status PInternalServiceImpl::_multi_get(const PMultiGetRequest& request,
|
||||
PMultiGetResponse* response) {
|
||||
OlapReaderStatistics stats;
|
||||
@ -1735,6 +1770,7 @@ Status PInternalServiceImpl::_multi_get(const PMultiGetRequest& request,
|
||||
full_read_schema.append_column(TabletColumn(column_pb));
|
||||
}
|
||||
|
||||
std::unordered_map<IteratorKey, IteratorItem, HashOfIteratorKey> iterator_map;
|
||||
// read row by row
|
||||
for (size_t i = 0; i < request.row_locs_size(); ++i) {
|
||||
const auto& row_loc = request.row_locs(i);
|
||||
@ -1773,8 +1809,8 @@ Status PInternalServiceImpl::_multi_get(const PMultiGetRequest& request,
|
||||
},
|
||||
&acquire_segments_ms));
|
||||
// find segment
|
||||
auto it = std::find_if(segment_cache.get_segments().begin(),
|
||||
segment_cache.get_segments().end(),
|
||||
auto it = std::find_if(segment_cache.get_segments().cbegin(),
|
||||
segment_cache.get_segments().cend(),
|
||||
[&row_loc](const segment_v2::SegmentSharedPtr& seg) {
|
||||
return seg->id() == row_loc.segment_id();
|
||||
});
|
||||
@ -1802,37 +1838,28 @@ Status PInternalServiceImpl::_multi_get(const PMultiGetRequest& request,
|
||||
if (result_block.is_empty_column()) {
|
||||
result_block = vectorized::Block(desc.slots(), request.row_locs().size());
|
||||
}
|
||||
VLOG_DEBUG << "Read row location "
|
||||
<< fmt::format("{}, {}, {}, {}", row_location.tablet_id,
|
||||
row_location.row_location.rowset_id.to_string(),
|
||||
row_location.row_location.segment_id,
|
||||
row_location.row_location.row_id);
|
||||
for (int x = 0; x < desc.slots().size(); ++x) {
|
||||
int index = -1;
|
||||
if (desc.slots()[x]->col_unique_id() >= 0) {
|
||||
// light sc enabled
|
||||
index = full_read_schema.field_index(desc.slots()[x]->col_unique_id());
|
||||
} else {
|
||||
index = full_read_schema.field_index(desc.slots()[x]->col_name());
|
||||
}
|
||||
if (index < 0) {
|
||||
std::stringstream ss;
|
||||
ss << "field name is invalid. field=" << desc.slots()[x]->col_name()
|
||||
<< ", field_name_to_index=" << full_read_schema.get_all_field_names();
|
||||
return Status::InternalError(ss.str());
|
||||
}
|
||||
std::unique_ptr<segment_v2::ColumnIterator> column_iterator;
|
||||
auto row_id = static_cast<segment_v2::rowid_t>(row_loc.ordinal_id());
|
||||
vectorized::MutableColumnPtr column =
|
||||
result_block.get_by_position(x).column->assume_mutable();
|
||||
StorageReadOptions storage_read_opt;
|
||||
storage_read_opt.io_ctx.reader_type = ReaderType::READER_QUERY;
|
||||
RETURN_IF_ERROR(segment->new_column_iterator(full_read_schema.column(index),
|
||||
&column_iterator, &storage_read_opt));
|
||||
segment_v2::ColumnIteratorOptions opt {
|
||||
.use_page_cache = !config::disable_storage_page_cache,
|
||||
.file_reader = segment->file_reader().get(),
|
||||
.stats = &stats,
|
||||
.io_ctx = io::IOContext {.reader_type = ReaderType::READER_QUERY},
|
||||
};
|
||||
static_cast<void>(column_iterator->init(opt));
|
||||
std::vector<segment_v2::rowid_t> single_row_loc {
|
||||
static_cast<segment_v2::rowid_t>(row_loc.ordinal_id())};
|
||||
RETURN_IF_ERROR(column_iterator->read_by_rowids(single_row_loc.data(), 1, column));
|
||||
IteratorKey iterator_key {.tablet_id = tablet->tablet_id(),
|
||||
.rowset_id = rowset_id,
|
||||
.segment_id = row_loc.segment_id(),
|
||||
.slot_id = desc.slots()[x]->id()};
|
||||
IteratorItem& iterator_item = iterator_map[iterator_key];
|
||||
if (iterator_item.segment == nullptr) {
|
||||
// hold the reference
|
||||
iterator_map[iterator_key].segment = segment;
|
||||
}
|
||||
segment = iterator_item.segment;
|
||||
RETURN_IF_ERROR(segment->seek_and_read_by_rowid(full_read_schema, desc.slots()[x],
|
||||
row_id, column, stats,
|
||||
iterator_item.iterator));
|
||||
}
|
||||
}
|
||||
// serialize block if not empty
|
||||
@ -1852,11 +1879,13 @@ Status PInternalServiceImpl::_multi_get(const PMultiGetRequest& request,
|
||||
"hit_cached_pages:{}, total_pages_read:{}, compressed_bytes_read:{}, "
|
||||
"io_latency:{}ns, "
|
||||
"uncompressed_bytes_read:{},"
|
||||
"bytes_read:{},"
|
||||
"acquire_tablet_ms:{}, acquire_rowsets_ms:{}, acquire_segments_ms:{}, "
|
||||
"lookup_row_data_ms:{}",
|
||||
stats.cached_pages_num, stats.total_pages_num, stats.compressed_bytes_read,
|
||||
stats.io_ns, stats.uncompressed_bytes_read, acquire_tablet_ms,
|
||||
acquire_rowsets_ms, acquire_segments_ms, lookup_row_data_ms);
|
||||
stats.io_ns, stats.uncompressed_bytes_read, stats.bytes_read,
|
||||
acquire_tablet_ms, acquire_rowsets_ms, acquire_segments_ms,
|
||||
lookup_row_data_ms);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
@ -1865,6 +1894,7 @@ void PInternalServiceImpl::multiget_data(google::protobuf::RpcController* contro
|
||||
PMultiGetResponse* response,
|
||||
google::protobuf::Closure* done) {
|
||||
bool ret = _light_work_pool.try_offer([request, response, done, this]() {
|
||||
signal::set_signal_task_id(request->query_id());
|
||||
// multi get data by rowid
|
||||
MonotonicStopWatch watch;
|
||||
watch.start();
|
||||
|
||||
@ -645,11 +645,12 @@ void ColumnObject::for_each_subcolumn(ColumnCallback callback) {
|
||||
}
|
||||
|
||||
void ColumnObject::insert_from(const IColumn& src, size_t n) {
|
||||
const auto& src_v = assert_cast<const ColumnObject&>(src);
|
||||
const auto* src_v = check_and_get_column<ColumnObject>(src);
|
||||
// optimize when src and this column are scalar variant, since try_insert is inefficiency
|
||||
if (src_v.is_scalar_variant() && is_scalar_variant() &&
|
||||
src_v.get_root_type()->equals(*get_root_type()) && src_v.is_finalized() && is_finalized()) {
|
||||
assert_cast<ColumnNullable&>(*get_root()).insert_from(*src_v.get_root(), n);
|
||||
if (src_v != nullptr && src_v->is_scalar_variant() && is_scalar_variant() &&
|
||||
src_v->get_root_type()->equals(*get_root_type()) && src_v->is_finalized() &&
|
||||
is_finalized()) {
|
||||
assert_cast<ColumnNullable&>(*get_root()).insert_from(*src_v->get_root(), n);
|
||||
++num_rows;
|
||||
return;
|
||||
}
|
||||
|
||||
@ -58,6 +58,7 @@
|
||||
#include "vec/exec/scan/new_olap_scan_node.h"
|
||||
#include "vec/exec/scan/vscan_node.h"
|
||||
#include "vec/exprs/vexpr_context.h"
|
||||
#include "vec/json/path_in_data.h"
|
||||
#include "vec/olap/block_reader.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
@ -411,16 +412,6 @@ Status NewOlapScanner::_init_tablet_reader_params(
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
vectorized::PathInData NewOlapScanner::_build_path(SlotDescriptor* slot,
|
||||
const std::string& root_name) {
|
||||
PathInDataBuilder path_builder;
|
||||
path_builder.append(root_name, false);
|
||||
for (const std::string& path : slot->column_paths()) {
|
||||
path_builder.append(path, false);
|
||||
}
|
||||
return path_builder.build();
|
||||
}
|
||||
|
||||
Status NewOlapScanner::_init_variant_columns() {
|
||||
auto& tablet_schema = _tablet_reader_params.tablet_schema;
|
||||
// Parent column has path info to distinction from each other
|
||||
@ -434,16 +425,10 @@ Status NewOlapScanner::_init_variant_columns() {
|
||||
if (slot->type().is_variant_type()) {
|
||||
// Such columns are not exist in frontend schema info, so we need to
|
||||
// add them into tablet_schema for later column indexing.
|
||||
TabletColumn subcol;
|
||||
subcol.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
|
||||
subcol.set_is_nullable(true);
|
||||
subcol.set_unique_id(-1);
|
||||
subcol.set_parent_unique_id(slot->col_unique_id());
|
||||
PathInData path = _build_path(
|
||||
slot, tablet_schema->column_by_uid(slot->col_unique_id()).name_lower_case());
|
||||
subcol.set_path_info(path);
|
||||
subcol.set_name(path.get_path());
|
||||
if (tablet_schema->field_index(path) < 0) {
|
||||
TabletColumn subcol = TabletColumn::create_materialized_variant_column(
|
||||
tablet_schema->column_by_uid(slot->col_unique_id()).name_lower_case(),
|
||||
slot->column_paths(), slot->col_unique_id());
|
||||
if (tablet_schema->field_index(subcol.path_info()) < 0) {
|
||||
tablet_schema->append_column(subcol, TabletSchema::ColumnType::VARIANT);
|
||||
}
|
||||
}
|
||||
@ -465,8 +450,9 @@ Status NewOlapScanner::_init_return_columns() {
|
||||
int32_t index = 0;
|
||||
auto& tablet_schema = _tablet_reader_params.tablet_schema;
|
||||
if (slot->type().is_variant_type()) {
|
||||
index = tablet_schema->field_index(_build_path(
|
||||
slot, tablet_schema->column_by_uid(slot->col_unique_id()).name_lower_case()));
|
||||
index = tablet_schema->field_index(PathInData(
|
||||
tablet_schema->column_by_uid(slot->col_unique_id()).name_lower_case(),
|
||||
slot->column_paths()));
|
||||
} else {
|
||||
index = slot->col_unique_id() >= 0 ? tablet_schema->field_index(slot->col_unique_id())
|
||||
: tablet_schema->field_index(slot->col_name());
|
||||
|
||||
@ -93,7 +93,6 @@ private:
|
||||
const std::vector<FunctionFilter>& function_filters);
|
||||
|
||||
[[nodiscard]] Status _init_return_columns();
|
||||
vectorized::PathInData _build_path(SlotDescriptor* slot, const std::string& root_name);
|
||||
[[nodiscard]] Status _init_variant_columns();
|
||||
|
||||
std::vector<OlapScanRange*> _key_ranges;
|
||||
|
||||
@ -46,11 +46,22 @@ PathInData::PathInData(const PathInData& other) : path(other.path) {
|
||||
build_parts(other.get_parts());
|
||||
}
|
||||
|
||||
// Builds a path whose first part is `root`, followed by every entry of
// `paths` in order. Each part is appended with the builder's flag set to false.
PathInData::PathInData(const std::string& root, const std::vector<std::string>& paths) {
    PathInDataBuilder builder;
    builder.append(root, false);
    for (const std::string& part : paths) {
        builder.append(part, false);
    }
    build_path(builder.get_parts());
    build_parts(builder.get_parts());
}
|
||||
|
||||
// Builds a path from the entries of `paths`, in order. Each part is appended
// with the builder's flag set to false.
PathInData::PathInData(const std::vector<std::string>& paths) {
    PathInDataBuilder builder;
    for (const std::string& part : paths) {
        builder.append(part, false);
    }
    build_path(builder.get_parts());
    build_parts(builder.get_parts());
}
|
||||
|
||||
|
||||
@ -61,6 +61,7 @@ public:
|
||||
explicit PathInData(std::string_view path_);
|
||||
explicit PathInData(const Parts& parts_);
|
||||
explicit PathInData(const std::vector<std::string>& paths);
|
||||
explicit PathInData(const std::string& root, const std::vector<std::string>& paths);
|
||||
PathInData(const PathInData& other);
|
||||
PathInData& operator=(const PathInData& other);
|
||||
static UInt128 get_parts_hash(const Parts& parts_);
|
||||
|
||||
@ -829,8 +829,7 @@ public class SelectStmt extends QueryStmt {
|
||||
LOG.debug("only support duplicate key or MOW model");
|
||||
return false;
|
||||
}
|
||||
if (!olapTable.getEnableLightSchemaChange() || !Strings.isNullOrEmpty(olapTable.getStoragePolicy())
|
||||
|| olapTable.hasVariantColumns()) {
|
||||
if (!olapTable.getEnableLightSchemaChange()) {
|
||||
return false;
|
||||
}
|
||||
if (getOrderByElements() != null) {
|
||||
|
||||
@ -38,6 +38,7 @@ message PSlotDescriptor {
|
||||
optional bool is_key = 12;
|
||||
optional bool is_auto_increment = 13;
|
||||
optional int32 col_type = 14 [default = 0];
|
||||
repeated string column_paths = 15;
|
||||
};
|
||||
|
||||
message PTupleDescriptor {
|
||||
|
||||
File diff suppressed because one or more lines are too long
@ -3,31 +3,37 @@
|
||||
0
|
||||
|
||||
-- !gh_data_2 --
|
||||
5000
|
||||
0
|
||||
|
||||
-- !gh_data_3 --
|
||||
0
|
||||
|
||||
-- !gh_data_4 --
|
||||
5000
|
||||
|
||||
-- !gh_data_5 --
|
||||
leonardomso/33-js-concepts 3
|
||||
ytdl-org/youtube-dl 3
|
||||
Bogdanp/neko 2
|
||||
bminossi/AllVideoPocsFromHackerOne 2
|
||||
disclose/diodata 2
|
||||
|
||||
-- !gh_data_4 --
|
||||
-- !gh_data_6 --
|
||||
14690758274
|
||||
|
||||
-- !gh_data_5 --
|
||||
-- !gh_data_7 --
|
||||
73453762334584
|
||||
|
||||
-- !gh_data_6 --
|
||||
-- !gh_data_8 --
|
||||
457806339
|
||||
|
||||
-- !gh_data_7 --
|
||||
-- !gh_data_9 --
|
||||
0
|
||||
|
||||
-- !gh_data_8 --
|
||||
-- !gh_data_10 --
|
||||
19829
|
||||
|
||||
-- !gh_data_9 --
|
||||
-- !gh_data_11 --
|
||||
49390617
|
||||
64890096
|
||||
10696700
|
||||
@ -39,19 +45,19 @@ disclose/diodata 2
|
||||
42386044
|
||||
73801003
|
||||
|
||||
-- !gh_data_10 --
|
||||
27 {"payload":{"action":"started"},"created_at":"2021-01-02T16:37:27Z","id":"14690746717","public":1,"actor":{"gravatar_id":"","display_login":"sergdudnik","url":"https://api.github.com/users/sergdudnik","id":16341546,"login":"sergdudnik","avatar_url":"https://avatars.githubusercontent.com/u/16341546?"},"repo":{"url":"https://api.github.com/repos/leonardomso/33-js-concepts","id":147350463,"name":"leonardomso/33-js-concepts"},"type":"WatchEvent"}
|
||||
36 {"payload":{"action":"started"},"created_at":"2021-01-02T16:37:27Z","id":"14690746732","public":1,"actor":{"gravatar_id":"","display_login":"juliusHuelsmann","url":"https://api.github.com/users/juliusHuelsmann","id":9212314,"login":"juliusHuelsmann","avatar_url":"https://avatars.githubusercontent.com/u/9212314?"},"repo":{"url":"https://api.github.com/repos/odeke-em/drive","id":26109545,"name":"odeke-em/drive"},"type":"WatchEvent"}
|
||||
46 {"payload":{"action":"started"},"created_at":"2021-01-02T16:37:27Z","id":"14690746749","org":{"gravatar_id":"","url":"https://api.github.com/orgs/GO-LiFE","id":38434522,"login":"GO-LiFE","avatar_url":"https://avatars.githubusercontent.com/u/38434522?"},"public":1,"actor":{"gravatar_id":"","display_login":"okbean","url":"https://api.github.com/users/okbean","id":75969386,"login":"okbean","avatar_url":"https://avatars.githubusercontent.com/u/75969386?"},"repo":{"url":"https://api.github.com/repos/GO-LiFE/GoFIT_SDK_Android","id":141905736,"name":"GO-LiFE/GoFIT_SDK_Android"},"type":"WatchEvent"}
|
||||
56 {"payload":{"action":"started"},"created_at":"2021-01-02T16:37:28Z","id":"14690746773","public":1,"actor":{"gravatar_id":"","display_login":"PWDream","url":"https://api.github.com/users/PWDream","id":4903755,"login":"PWDream","avatar_url":"https://avatars.githubusercontent.com/u/4903755?"},"repo":{"url":"https://api.github.com/repos/MrXujiang/h5-Dooring","id":289417971,"name":"MrXujiang/h5-Dooring"},"type":"WatchEvent"}
|
||||
86 {"payload":{"action":"started"},"created_at":"2021-01-02T16:37:29Z","id":"14690746843","public":1,"actor":{"gravatar_id":"","display_login":"Gui-Yom","url":"https://api.github.com/users/Gui-Yom","id":25181283,"login":"Gui-Yom","avatar_url":"https://avatars.githubusercontent.com/u/25181283?"},"repo":{"url":"https://api.github.com/repos/redsaph/cleartext","id":106453399,"name":"redsaph/cleartext"},"type":"WatchEvent"}
|
||||
98 {"payload":{"action":"started"},"created_at":"2021-01-02T16:37:29Z","id":"14690746866","org":{"gravatar_id":"","url":"https://api.github.com/orgs/sherlock-project","id":48293496,"login":"sherlock-project","avatar_url":"https://avatars.githubusercontent.com/u/48293496?"},"public":1,"actor":{"gravatar_id":"","display_login":"humaidk2","url":"https://api.github.com/users/humaidk2","id":12982026,"login":"humaidk2","avatar_url":"https://avatars.githubusercontent.com/u/12982026?"},"repo":{"url":"https://api.github.com/repos/sherlock-project/sherlock","id":162998479,"name":"sherlock-project/sherlock"},"type":"WatchEvent"}
|
||||
101 {"payload":{"action":"started"},"created_at":"2021-01-02T16:37:29Z","id":"14690746870","public":1,"actor":{"gravatar_id":"","display_login":"hasantezcan","url":"https://api.github.com/users/hasantezcan","id":32804505,"login":"hasantezcan","avatar_url":"https://avatars.githubusercontent.com/u/32804505?"},"repo":{"url":"https://api.github.com/repos/okandavut/react-spotify-nowplaying","id":326215605,"name":"okandavut/react-spotify-nowplaying"},"type":"WatchEvent"}
|
||||
112 {"payload":{"action":"started"},"created_at":"2021-01-02T16:37:30Z","id":"14690746899","public":1,"actor":{"gravatar_id":"","display_login":"nicholas-robertson","url":"https://api.github.com/users/nicholas-robertson","id":17681331,"login":"nicholas-robertson","avatar_url":"https://avatars.githubusercontent.com/u/17681331?"},"repo":{"url":"https://api.github.com/repos/sentriz/gonic","id":178435468,"name":"sentriz/gonic"},"type":"WatchEvent"}
|
||||
122 {"payload":{"action":"started"},"created_at":"2021-01-02T16:37:30Z","id":"14690746914","org":{"gravatar_id":"","url":"https://api.github.com/orgs/netlify-labs","id":47546088,"login":"netlify-labs","avatar_url":"https://avatars.githubusercontent.com/u/47546088?"},"public":1,"actor":{"gravatar_id":"","display_login":"javaniecampbell","url":"https://api.github.com/users/javaniecampbell","id":1676496,"login":"javaniecampbell","avatar_url":"https://avatars.githubusercontent.com/u/1676496?"},"repo":{"url":"https://api.github.com/repos/netlify-labs/react-netlify-identity-widget","id":182606378,"name":"netlify-labs/react-netlify-identity-widget"},"type":"WatchEvent"}
|
||||
169 {"payload":{"action":"started"},"created_at":"2021-01-02T16:37:32Z","id":"14690747028","org":{"gravatar_id":"","url":"https://api.github.com/orgs/microsoft","id":6154722,"login":"microsoft","avatar_url":"https://avatars.githubusercontent.com/u/6154722?"},"public":1,"actor":{"gravatar_id":"","display_login":"Yxnt","url":"https://api.github.com/users/Yxnt","id":10323352,"login":"Yxnt","avatar_url":"https://avatars.githubusercontent.com/u/10323352?"},"repo":{"url":"https://api.github.com/repos/microsoft/BotBuilder-Samples","id":68730444,"name":"microsoft/BotBuilder-Samples"},"type":"WatchEvent"}
|
||||
-- !gh_data_12 --
|
||||
27 {"url":"https://api.github.com/repos/leonardomso/33-js-concepts","id":147350463,"name":"leonardomso/33-js-concepts"}
|
||||
36 {"url":"https://api.github.com/repos/odeke-em/drive","id":26109545,"name":"odeke-em/drive"}
|
||||
46 {"url":"https://api.github.com/repos/GO-LiFE/GoFIT_SDK_Android","id":141905736,"name":"GO-LiFE/GoFIT_SDK_Android"}
|
||||
56 {"url":"https://api.github.com/repos/MrXujiang/h5-Dooring","id":289417971,"name":"MrXujiang/h5-Dooring"}
|
||||
86 {"url":"https://api.github.com/repos/redsaph/cleartext","id":106453399,"name":"redsaph/cleartext"}
|
||||
98 {"url":"https://api.github.com/repos/sherlock-project/sherlock","id":162998479,"name":"sherlock-project/sherlock"}
|
||||
101 {"url":"https://api.github.com/repos/okandavut/react-spotify-nowplaying","id":326215605,"name":"okandavut/react-spotify-nowplaying"}
|
||||
112 {"url":"https://api.github.com/repos/sentriz/gonic","id":178435468,"name":"sentriz/gonic"}
|
||||
122 {"url":"https://api.github.com/repos/netlify-labs/react-netlify-identity-widget","id":182606378,"name":"netlify-labs/react-netlify-identity-widget"}
|
||||
169 {"url":"https://api.github.com/repos/microsoft/BotBuilder-Samples","id":68730444,"name":"microsoft/BotBuilder-Samples"}
|
||||
|
||||
-- !gh_data_11 --
|
||||
-- !gh_data_13 --
|
||||
2051941 1
|
||||
10696700 1
|
||||
32271952 2
|
||||
@ -63,3 +69,27 @@ disclose/diodata 2
|
||||
64890096 1
|
||||
73801003 1
|
||||
|
||||
-- !gh_data_14 --
|
||||
27 14690746717 WatchEvent leonardomso/33-js-concepts
|
||||
36 14690746732 WatchEvent odeke-em/drive
|
||||
46 14690746749 WatchEvent GO-LiFE/GoFIT_SDK_Android
|
||||
56 14690746773 WatchEvent MrXujiang/h5-Dooring
|
||||
86 14690746843 WatchEvent redsaph/cleartext
|
||||
98 14690746866 WatchEvent sherlock-project/sherlock
|
||||
101 14690746870 WatchEvent okandavut/react-spotify-nowplaying
|
||||
112 14690746899 WatchEvent sentriz/gonic
|
||||
122 14690746914 WatchEvent netlify-labs/react-netlify-identity-widget
|
||||
169 14690747028 WatchEvent microsoft/BotBuilder-Samples
|
||||
|
||||
-- !gh_data_15 --
|
||||
user
|
||||
user
|
||||
user
|
||||
user
|
||||
user
|
||||
user
|
||||
user
|
||||
user
|
||||
user
|
||||
user
|
||||
|
||||
|
||||
@ -287,7 +287,7 @@ suite("regression_test_variant", "variant_type"){
|
||||
// 12. streamload remote file
|
||||
table_name = "logdata"
|
||||
create_table.call(table_name, "4")
|
||||
sql "set enable_two_phase_read_opt = false;"
|
||||
// sql "set enable_two_phase_read_opt = false;"
|
||||
// no sparse columns
|
||||
set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1")
|
||||
load_json_data.call(table_name, """${getS3Url() + '/load/logdata.json'}""")
|
||||
@ -340,7 +340,7 @@ suite("regression_test_variant", "variant_type"){
|
||||
qt_sql_36_1 "select cast(v:a as int), cast(v:b as int), cast(v:c as int) from ${table_name} order by k limit 10"
|
||||
sql "DELETE FROM ${table_name} WHERE k=1"
|
||||
sql "select * from ${table_name}"
|
||||
qt_sql_36_2 "select * from ${table_name} where k > 3 order by k desc limit 10"
|
||||
qt_sql_36_2 """select k, json_extract(cast(v as text), "\$.repo") from ${table_name} where k > 3 order by k desc limit 10"""
|
||||
sql "insert into ${table_name} select * from ${table_name}"
|
||||
sql """UPDATE ${table_name} set v = '{"updated_value" : 10}' where k = 2"""
|
||||
qt_sql_36_3 """select * from ${table_name} where k = 2"""
|
||||
@ -386,13 +386,13 @@ suite("regression_test_variant", "variant_type"){
|
||||
sql """insert into ${table_name} values (2, "abe", '{"c" : 1}')"""
|
||||
sql """insert into ${table_name} values (3, "abd", '{"d" : 1}')"""
|
||||
sql "delete from ${table_name} where k in (select k from variant_mow where k in (1, 2))"
|
||||
qt_sql_38 "select * from ${table_name} order by k"
|
||||
qt_sql_38 "select * from ${table_name} order by k limit 10"
|
||||
|
||||
// read text from sparse col
|
||||
set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")
|
||||
sql """insert into sparse_columns select 0, '{"a": 1123, "b" : [123, {"xx" : 1}], "c" : {"c" : 456, "d" : null, "e" : 7.111}, "zzz" : null, "oooo" : {"akakaka" : null, "xxxx" : {"xxx" : 123}}}' as json_str
|
||||
union all select 0, '{"a" : 1234, "xxxx" : "kaana", "ddd" : {"aaa" : 123, "mxmxm" : [456, "789"]}}' as json_str from numbers("number" = "4096") limit 4096 ;"""
|
||||
qt_sql_31 """select cast(v:xxxx as string) from sparse_columns where cast(v:xxxx as string) != 'null' limit 1;"""
|
||||
qt_sql_31 """select cast(v:xxxx as string) from sparse_columns where cast(v:xxxx as string) != 'null' order by k limit 1;"""
|
||||
sql "truncate table sparse_columns"
|
||||
set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")
|
||||
} finally {
|
||||
|
||||
@ -1,4 +1,6 @@
|
||||
set exec_mem_limit=8G;
|
||||
set enable_two_phase_read_opt = true;
|
||||
set topn_opt_limit_threshold = 1024;
|
||||
SELECT count() from ghdata;
|
||||
SELECT cast(v:repo.name as string), count() AS stars FROM ghdata WHERE cast(v:type as string) = 'WatchEvent' GROUP BY cast(v:repo.name as string) ORDER BY stars DESC, cast(v:repo.name as string) LIMIT 5;
|
||||
SELECT max(cast(cast(v:`id` as string) as bigint)) FROM ghdata;
|
||||
@ -6,8 +8,9 @@ SELECT sum(cast(cast(v:`id` as string) as bigint)) FROM ghdata;
|
||||
SELECT sum(cast(v:payload.member.id as bigint)) FROM ghdata;
|
||||
SELECT sum(cast(v:payload.pull_request.milestone.creator.site_admin as bigint)) FROM ghdata;
|
||||
SELECT sum(length(v:payload.pull_request.base.repo.html_url)) FROM ghdata;
|
||||
-- SELECT v:payload.commits.author.name FROM ghdata ORDER BY k LIMIT 10;
|
||||
SELECT v:payload.member.id FROM ghdata where cast(v:payload.member.id as string) is not null ORDER BY k LIMIT 10;
|
||||
-- select k, v:payload.commits.author.name AS name, e FROM ghdata as t lateral view explode(cast(v:payload.commits.author.name as array<string>)) tm1 as e order by k limit 5;
|
||||
select k, v from ghdata WHERE cast(v:type as string) = 'WatchEvent' order by k limit 10;
|
||||
SELECT cast(v:payload.member.id as bigint), count() FROM ghdata where cast(v:payload.member.id as bigint) is not null group by cast(v:payload.member.id as bigint) order by 1, 2 desc LIMIT 10;
|
||||
select k, json_extract(v, '$.repo') from ghdata WHERE cast(v:type as string) = 'WatchEvent' order by k limit 10;
|
||||
SELECT cast(v:payload.member.id as bigint), count() FROM ghdata where cast(v:payload.member.id as bigint) is not null group by cast(v:payload.member.id as bigint) order by 1, 2 desc LIMIT 10;
|
||||
select k, cast(v:`id` as string), cast(v:type as string), cast(v:repo.name as string) from ghdata WHERE cast(v:type as string) = 'WatchEvent' order by k limit 10;
|
||||
SELECT cast(v:payload.pusher_type as text) FROM ghdata where cast(v:payload.pusher_type as text) is not null ORDER BY k LIMIT 10;
|
||||
Reference in New Issue
Block a user