oceanbase/unittest/sql/engine/ob_fake_table_scan_vec_op.cpp
obdev b6773084c6 [FEAT MERGE] impl vectorization 2.0
Co-authored-by: Naynahs <cfzy002@126.com>
Co-authored-by: hwx65 <1780011298@qq.com>
Co-authored-by: oceanoverflow <oceanoverflow@gmail.com>
2023-12-22 03:43:22 +00:00

299 lines
12 KiB
C++

/**
* Copyright (c) 2021 OceanBase
* OceanBase CE is licensed under Mulan PubL v2.
* You can use this software according to the terms and conditions of the Mulan PubL v2.
* You may obtain a copy of Mulan PubL v2 at:
* http://license.coscl.org.cn/MulanPubL-2.0
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PubL v2 for more details.
*/
#define USING_LOG_PREFIX COMMON
#include "deps/oblib/src/common/object/ob_obj_type.h"
#include "test_op_engine.h"
#include "ob_fake_table_scan_vec_op.h"
#include "data_generator.h"
namespace oceanbase
{
namespace sql
{
int ObFakeTableScanVecOp::inner_open()
{
ObDataGenerator::get_instance().register_op(this);
std::string round;
ObTestOpConfig::get_instance().get_config("round", round);
if (!round.empty()) { max_round_ = current_round_ + std::stoi(round); }
return OB_SUCCESS;
}
int ObFakeTableScanVecOp::inner_get_next_batch(const int64_t max_row_cnt)
{
int ret = OB_SUCCESS;
clear_evaluated_flag();
uint64_t op_id = get_spec().get_id();
int64_t generate_random_value = 0;
bool is_duplicate = false;
const ObPushdownExprSpec &pd_expr_spec =
reinterpret_cast<const ObTableScanSpec *>(&spec_)->tsc_ctdef_.scan_ctdef_.pd_expr_spec_;
for (int j = 0; j < pd_expr_spec.access_exprs_.count(); j++) {
ObExpr *expr = pd_expr_spec.access_exprs_.at(j);
OB_FAIL(fill_random_data_into_expr_datum_frame(j, pd_expr_spec.access_exprs_.count(), expr,
ObTestOpConfig::get_instance().batch_size_, is_duplicate));
}
// random set skip
set_random_skip(current_round_, ObTestOpConfig::get_instance().batch_size_);
if (is_duplicate) { ObDataGenerator::get_instance().reset_temp_store(op_id, current_round_); }
current_round_++;
brs_.size_ = ObTestOpConfig::get_instance().batch_size_;
if (current_round_ == max_round_) { brs_.end_ = true; }
// print generate data
LOG_INFO("[DG] data generated by DataGenerator in ", K(current_round_ - 1));
if (op_id_2_output_streams_.count(op_id) == 0) {
std::string output_file_name = "generate_data_" + std::to_string(op_id) + ".data";
op_id_2_output_streams_[op_id].open(output_file_name.data(), std::ios::out | std::ios::trunc);
}
test::TestOpEngine::print_to_file(&brs_, this, pd_expr_spec.access_exprs_, false, &op_id_2_output_streams_[op_id]);
return ret;
}
int ObFakeTableScanVecOp::fill_random_data_into_expr_datum_frame(int expr_i, int expr_count, const ObExpr *expr,
const int output_row_count, bool &is_duplicate)
{
int ret = OB_SUCCESS;
LOG_DEBUG("generate_random_value for expr: ", K(*expr));
ObIVector *i_vec = NULL;
ObDatum *datums = NULL;
if (expr->enable_rich_format()) {
// vectorization 2.0 new operator
i_vec = expr->get_vector(eval_ctx_);
if (expr->is_fixed_length_data_) {
// for VEC_FIXED and VEC_UNIFORM
// we must first call init_vector because we need to use i_vec->set_xxx() which need use vector's meta data
OB_ASSERT(ObTestOpConfig::get_instance().digit_data_format_ == VEC_UNIFORM
|| ObTestOpConfig::get_instance().digit_data_format_ == VEC_FIXED);
expr->init_vector(eval_ctx_, ObTestOpConfig::get_instance().digit_data_format_, output_row_count, true);
} else {
// while for VEC_DISCRETE and VEC_CONTINUOUS it doesn't matter
OB_ASSERT(ObTestOpConfig::get_instance().string_data_format_ == VEC_UNIFORM
|| ObTestOpConfig::get_instance().string_data_format_ == VEC_DISCRETE
|| ObTestOpConfig::get_instance().string_data_format_ == VEC_CONTINUOUS);
expr->init_vector(eval_ctx_, ObTestOpConfig::get_instance().string_data_format_, output_row_count, true);
}
} else {
// vectorization 1.0 old operator
datums = expr->locate_datums_for_update(eval_ctx_, output_row_count);
}
// generate random data
get_random_data(expr_i, expr_count, expr, current_round_, output_row_count, expr->max_length_, is_duplicate);
int vec_continuous_offset = 0; // only used in VEC_CONTINUOUS
std::string vec_continuous_data; // use to store data of VEC_CONTINUOUS temporarily
ObDataGenerator::TempDataStore &data_store =
ObDataGenerator::get_instance().op_2_round_2_temp_store_[get_spec().get_id()][current_round_][expr_i];
for (int row = 0; row < output_row_count; row++) {
bool is_null = data_store.null_[row];
switch (expr->datum_meta_.get_type()) {
// why expr->datum_meta_.get_type() == ObInt32Type while expr->res_buf_len_ == 8 ?????
case ObInt32Type: {
int data = data_store.temp_int32_vector_[row];
if (i_vec != NULL) {
i_vec->set_int(row, static_cast<int64_t>(data));
if (is_null) { i_vec->set_null(row); }
} else {
datums[row].set_int32(data);
if (is_null) { datums[row].set_null(); }
}
break;
}
case ObIntType: {
int64_t data = data_store.temp_int64_vector_[row];
if (i_vec != NULL) {
i_vec->set_int(row, data);
if (is_null) { i_vec->set_null(row); }
} else {
datums[row].set_int(data);
if (is_null) { datums[row].set_null(); }
}
break;
}
case ObDoubleType: {
double data = data_store.temp_double_vector_[row];
if (i_vec != NULL) {
i_vec->set_double(row, data);
if (is_null) { i_vec->set_null(row); }
} else {
datums[row].set_double(data);
if (is_null) { datums[row].set_null(); }
}
break;
}
case ObNumberType: {
break;
}
case ObDecimalIntType: {
// have bug
// if decimal(20, 10)
// expr->res_buf_len_ = 16 is *not* equal to datums[x].len_ == 8
// where datums[x].len_ is set?
// i_vec->set_int(row,
// ObDataGenerator::get_instance().round_2_temp_store_[round_].first[expr_i].temp_int64_vector_[row]);
break;
}
case ObVarcharType:
case ObCharType: {
std::string tmp_str = data_store.temp_string_vector_[row];
int str_len = static_cast<int>(tmp_str.size());
if (i_vec != NULL) {
// vectorization 1.0 new operator
if (expr->get_format(eval_ctx_) == VEC_UNIFORM) {
if (is_null) {
i_vec->set_null(row);
} else {
i_vec->set_payload(row, tmp_str.data(), str_len);
}
} else {
if (is_null) { i_vec->set_null(row); }
if (expr->get_format(eval_ctx_) == VEC_DISCRETE) {
char **ptrs = expr->get_discrete_vector_ptrs(eval_ctx_);
int32_t *lens = expr->get_discrete_vector_lens(eval_ctx_);
ptrs[row] = expr->get_res_buf(eval_ctx_) + row * (expr->res_buf_len_);
lens[row] = str_len;
i_vec->set_payload(row, tmp_str.data(), str_len);
} else {
// VEC_CONTINUOUS
// offset[0] == 0 which is set in cg phase
vec_continuous_offset += str_len;
uint32_t *offset = expr->get_continuous_vector_offsets(eval_ctx_);
offset[row + 1] = vec_continuous_offset;
vec_continuous_data += tmp_str; // temporarily store data here
}
}
} else {
// vectorization 1.0 old operator
if (is_null) {
datums[row].set_null();
} else {
datums[row].set_string(tmp_str.data(), str_len);
}
}
break;
}
default: LOG_INFO("Can not generate random value so far for: ", K(expr->datum_meta_.get_type()));
}
}
if (expr->enable_rich_format() && expr->get_format(eval_ctx_) == VEC_CONTINUOUS && !vec_continuous_data.empty()) {
ObDynReserveBuf *drb = reinterpret_cast<ObDynReserveBuf *>(expr->get_continuous_vector_data(eval_ctx_));
// Todo: replace below logic when shengle handle continuous format memory
// in ObExpr::get_str_res_mem()
char *mem = NULL;
const int64_t alloc_size = next_pow2(vec_continuous_data.size());
if (OB_UNLIKELY(alloc_size > UINT32_MAX)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid argument", K(vec_continuous_data.size()), K(alloc_size), K(ret));
} else if (OB_ISNULL(mem = static_cast<char *>(eval_ctx_.alloc_expr_res(alloc_size)))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
LOG_WARN("allocate memory failed", K(ret), K(ret));
} else {
// When extend memory, the old memory can not free, because the old memory may
// still be referenced. see: ob_datum_cast.cpp::common_copy_string
if (0 == drb->len_) { drb->magic_ = ObDynReserveBuf::MAGIC_NUM; }
drb->len_ = alloc_size;
drb->mem_ = mem;
MEMCPY(drb->mem_, vec_continuous_data.data(), vec_continuous_data.size());
ObContinuousBase *cont_vec = static_cast<ObContinuousBase *>(expr->get_vector(eval_ctx_));
cont_vec->set_data(drb->mem_);
}
LOG_DEBUG("extend expr result memory", K(ret), K(vec_continuous_data.size()), K(alloc_size), KP(this), KP(mem));
}
return ret;
}
int ObFakeTableScanVecOp::get_random_data(int expr_i, int expr_count, const ObExpr *expr, const int round,
const int batch_size, const int len, bool &is_duplicate)
{
int ret = OB_SUCCESS;
uint64_t op_id = get_spec().get_id();
switch (expr->datum_meta_.get_type()) {
// why expr->datum_meta_.get_type() == ObInt32Type while expr->res_buf_len_ == 8 ?????
case ObInt32Type: {
ObDataGenerator::get_instance().generate_data<int>(op_id, expr_i, expr_count, round, batch_size, len, is_duplicate);
break;
}
case ObIntType: {
ObDataGenerator::get_instance().generate_data<int64_t>(op_id, expr_i, expr_count, round, batch_size, len,
is_duplicate);
break;
}
case ObDoubleType: {
ObDataGenerator::get_instance().generate_data<double>(op_id, expr_i, expr_count, round, batch_size, len,
is_duplicate);
break;
}
case ObNumberType: {
break;
}
case ObDecimalIntType: {
// have bug
// if decimal(20, 10)
// expr->res_buf_len_ = 16 is *not* equal to datums[x].len_ == 8
// where datums[x].len_ is set?
ObDataGenerator::get_instance().generate_data<int64_t>(op_id, expr_i, expr_count, round, batch_size, len,
is_duplicate);
break;
}
case ObVarcharType:
case ObCharType: {
ObDataGenerator::get_instance().generate_data<std::string>(op_id, expr_i, expr_count, round, batch_size, len,
is_duplicate);
break;
}
default: LOG_INFO("Can not generate random value so far for: ", K(expr->datum_meta_.get_type()));
}
return ret;
}
void ObFakeTableScanVecOp::set_random_skip(const int round, const int batch_size)
{
std::string generate_data_skips;
uint64_t op_id = get_spec().get_id();
if (ObDataGenerator::get_instance().op_2_round_2_skips_[op_id].count(round) == 0) {
std::uniform_int_distribution<int> u_i(0, 1);
bool if_skip;
for (int i = 0; i < batch_size; i++) {
if_skip =
ObDataGenerator::get_instance().zero_one_rand_by_probability(ObTestOpConfig::get_instance().skips_probability_);
generate_data_skips += std::to_string(!if_skip) + " ";
ObDataGenerator::get_instance().op_2_round_2_skips_[op_id][round].push_back(if_skip);
}
}
for (int i = 0; i < batch_size; i++) {
if (ObDataGenerator::get_instance().op_2_round_2_skips_[op_id][round][i] == true) { brs_.skip_->set(i); }
}
LOG_INFO("skips : ", K(generate_data_skips.data()));
return;
}
} // namespace sql
} // namespace oceanbase