[CP] [CP] Refine range filter selectivity

This commit is contained in:
xianyu-w 2024-06-17 08:55:46 +00:00 committed by ob-robot
parent b2e1da3455
commit bfed2bb77e
6 changed files with 234 additions and 18 deletions

View File

@ -12830,7 +12830,9 @@ int ObJoinOrder::init_est_sel_info_for_access_path(const uint64_t table_id,
global_part_ids,
scale_ratio,
last_analyzed,
is_stat_locked))) {
is_stat_locked,
table_partition_info_,
&table_meta_info_))) {
LOG_WARN("failed to add base table meta info", K(ret));
}
}

View File

@ -76,6 +76,8 @@ const double DEFAULT_AGG_EQ = 0.01;
// clob/blob like "xxx" 的默认选择率
const double DEFAULT_CLOB_LIKE_SEL = 0.05;
const double DEFAULT_ANTI_JOIN_SEL = 0.01;
// Default selectivity for the out-of-bounds part of a range predicate
// (modeled after SQL Server).
const double DEFAULT_OUT_OF_BOUNDS_SEL = 0.3;
const double DEFAULT_INEQ_JOIN_SEL = 0.05;
} // namespace common

View File

@ -84,6 +84,9 @@ int OptTableMeta::assign(const OptTableMeta &other)
ds_level_ = other.ds_level_;
stat_locked_ = other.stat_locked_;
distinct_rows_ = other.distinct_rows_;
table_partition_info_ = other.table_partition_info_;
base_meta_info_ = other.base_meta_info_;
real_rows_ = other.real_rows_;
if (OB_FAIL(all_used_parts_.assign(other.all_used_parts_))) {
LOG_WARN("failed to assign all used parts", K(ret));
@ -110,7 +113,9 @@ int OptTableMeta::init(const uint64_t table_id,
ObIArray<uint64_t> &column_ids,
ObIArray<int64_t> &all_used_global_parts,
const double scale_ratio,
const OptSelectivityCtx &ctx)
const OptSelectivityCtx &ctx,
const ObTablePartitionInfo *table_partition_info,
const ObTableMetaInfo *base_meta_info)
{
int ret = OB_SUCCESS;
const ObTableSchema *table_schema = NULL;
@ -123,6 +128,9 @@ int OptTableMeta::init(const uint64_t table_id,
stat_type_ = stat_type;
scale_ratio_ = scale_ratio;
micro_block_count_ = micro_block_count;
table_partition_info_ = table_partition_info;
base_meta_info_ = base_meta_info;
real_rows_ = -1.0;
if (OB_FAIL(all_used_parts_.assign(all_used_part_id))) {
LOG_WARN("failed to assign all used partition ids", K(ret));
} else if (OB_FAIL(all_used_tablets_.assign(all_used_tablets))) {
@ -223,6 +231,34 @@ int OptTableMeta::add_column_meta_no_dup(const uint64_t column_id,
return ret;
}
// Computes the relative growth of the table compared with the row count held in
// this meta: (real_rows_ - rows_) / rows_.
//
// @param ctx                  optimizer context, used for the default-stat check and
//                             passed to the full-table row count estimation.
// @param increase_rows_ratio  [out] always reset to 0.0 first; only set to a positive
//                             ratio when rows_ is above OB_DOUBLE_EPSINON and
//                             real_rows_ is larger than rows_.
// @return OB_SUCCESS, or the error reported by the row count estimation.
//
// real_rows_ acts as a lazily filled cache: a value >= 0 is reused as is, otherwise
// it is computed once here (hence the const_cast inside a const method).
int OptTableMeta::get_increase_rows_ratio(ObOptimizerContext &ctx, double &increase_rows_ratio) const
{
  int ret = OB_SUCCESS;
  double &cached_real_rows = const_cast<double &>(real_rows_);
  increase_rows_ratio = 0.0;
  if (!(real_rows_ >= 0)) {
    // Not computed yet.
    if (NULL == table_partition_info_ || NULL == base_meta_info_ ||
        !base_meta_info_->has_opt_stat_ || ctx.use_default_stat()) {
      // No base table information or no usable statistics: treat the table as unchanged.
      cached_real_rows = rows_;
    } else {
      // Estimate on a private copy with zeroed row counts so the shared base meta
      // info is left untouched.
      ObTableMetaInfo fresh_meta(ref_table_id_);
      fresh_meta.assign(*base_meta_info_);
      fresh_meta.table_row_count_ = 0.0;
      fresh_meta.row_count_ = 0.0;
      if (OB_FAIL(ObAccessPathEstimation::estimate_full_table_rowcount(ctx,
                                                                       *table_partition_info_,
                                                                       fresh_meta))) {
        LOG_WARN("failed to estimate full table rowcount", K(ret));
      } else {
        cached_real_rows = fresh_meta.table_row_count_;
      }
    }
  }
  if (OB_SUCC(ret) && rows_ > OB_DOUBLE_EPSINON && real_rows_ > rows_) {
    increase_rows_ratio = (real_rows_ - rows_) / rows_;
  }
  return ret;
}
const OptColumnMeta* OptTableMeta::get_column_meta(const uint64_t column_id) const
{
const OptColumnMeta* column_meta = NULL;
@ -280,7 +316,9 @@ int OptTableMetas::add_base_table_meta_info(OptSelectivityCtx &ctx,
ObIArray<int64_t> &all_used_global_parts,
const double scale_ratio,
int64_t last_analyzed,
bool is_stat_locked)
bool is_stat_locked,
const ObTablePartitionInfo *table_partition_info,
const ObTableMetaInfo *base_meta_info)
{
int ret = OB_SUCCESS;
ObSqlSchemaGuard *schema_guard = ctx.get_sql_schema_guard();
@ -293,7 +331,8 @@ int OptTableMetas::add_base_table_meta_info(OptSelectivityCtx &ctx,
LOG_WARN("failed to allocate place holder for table meta", K(ret));
} else if (OB_FAIL(table_meta->init(table_id, ref_table_id, table_type, rows, stat_type, micro_block_count,
*schema_guard, all_used_part_id, all_used_tablets,
column_ids, all_used_global_parts, scale_ratio, ctx))) {
column_ids, all_used_global_parts, scale_ratio, ctx,
table_partition_info, base_meta_info))) {
LOG_WARN("failed to init new tstat", K(ret));
} else {
table_meta->set_version(last_analyzed);
@ -936,6 +975,7 @@ int ObOptSelectivity::update_table_meta_info(const OptTableMetas &base_table_met
} else {
double origin_rows = table_meta->get_rows();
table_meta->set_rows(filtered_rows);
table_meta->clear_base_table_info();
if (filtered_rows >= origin_rows) {
// only update table rows
} else if (OB_FAIL(classify_quals(ctx, quals, all_predicate_sel, column_sel_infos))) {
@ -1104,13 +1144,14 @@ int ObOptSelectivity::get_column_range_sel(const OptTableMetas &table_metas,
const OptSelectivityCtx &ctx,
const ObColumnRefRawExpr &col_expr,
const ObRawExpr &qual,
const bool need_out_of_bounds,
double &selectivity)
{
int ret = OB_SUCCESS;
ObSEArray<ObRawExpr *, 1> quals;
if (OB_FAIL(quals.push_back(const_cast<ObRawExpr *>(&qual)))) {
LOG_WARN("failed to push back expr", K(ret));
} else if (OB_FAIL(get_column_range_sel(table_metas, ctx, col_expr, quals, selectivity))) {
} else if (OB_FAIL(get_column_range_sel(table_metas, ctx, col_expr, quals, need_out_of_bounds, selectivity))) {
LOG_WARN("failed to get column range selectivity", K(ret));
}
return ret;
@ -1120,6 +1161,7 @@ int ObOptSelectivity::get_column_range_sel(const OptTableMetas &table_metas,
const OptSelectivityCtx &ctx,
const ObColumnRefRawExpr &col_expr,
const ObIArray<ObRawExpr* > &quals,
const bool need_out_of_bounds,
double &selectivity)
{
int ret = OB_SUCCESS;
@ -1129,6 +1171,8 @@ int ObOptSelectivity::get_column_range_sel(const OptTableMetas &table_metas,
ObQueryRange query_range;
ObQueryRangeArray ranges;
ObSEArray<ColumnItem, 1> column_items;
bool use_hist = false;
ObOptColumnStatHandle handler;
if (OB_ISNULL(stmt)) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("get null stmt", K(ret), K(stmt));
@ -1140,7 +1184,6 @@ int ObOptSelectivity::get_column_range_sel(const OptTableMetas &table_metas,
} else {
selectivity = 0.0;
double not_null_sel = 0;
ObOptColumnStatHandle handler;
if (OB_FAIL(get_column_ndv_and_nns(table_metas, ctx, col_expr, NULL, &not_null_sel))) {
LOG_WARN("failed to get column ndv and nns", K(ret));
} else if (OB_FAIL(get_histogram_by_column(table_metas, ctx, tid, cid, handler))) {
@ -1160,6 +1203,7 @@ int ObOptSelectivity::get_column_range_sel(const OptTableMetas &table_metas,
LOG_WARN("failed to get range sel by histogram", K(ret));
} else {
selectivity *= not_null_sel;
use_hist = true;
LOG_TRACE("Succeed to get range density ", K(selectivity), K(not_null_sel));
}
} else {
@ -1186,7 +1230,28 @@ int ObOptSelectivity::get_column_range_sel(const OptTableMetas &table_metas,
}
LOG_TRACE("Get column range sel", K(selectivity), K(quals));
}
if (OB_SUCC(ret) && need_out_of_bounds &&
((ctx.get_compat_version() >= COMPAT_VERSION_4_2_1_BP7 && ctx.get_compat_version() < COMPAT_VERSION_4_2_2) ||
(ctx.get_compat_version() >= COMPAT_VERSION_4_2_4 && ctx.get_compat_version() < COMPAT_VERSION_4_3_0) ||
ctx.get_compat_version() >= COMPAT_VERSION_4_3_2)) {
ObObj min_value;
ObObj max_value;
min_value.set_min_value();
max_value.set_max_value();
if (use_hist) {
int64_t cnt = handler.stat_->get_histogram().get_bucket_size();
min_value = handler.stat_->get_histogram().get(0).endpoint_value_;
max_value = handler.stat_->get_histogram().get(cnt - 1).endpoint_value_;
} else {
if (OB_FAIL(get_column_min_max(table_metas, ctx, col_expr, min_value, max_value))) {
LOG_WARN("failed to get column min max", K(ret));
}
}
if (FAILEDx(refine_out_of_bounds_sel(table_metas, ctx, col_expr, ranges,
min_value, max_value, selectivity))) {
LOG_WARN("failed to refine out of bounds sel", K(ret));
}
}
return ret;
}
@ -1371,6 +1436,7 @@ int ObOptSelectivity::calc_column_range_selectivity(const OptTableMetas &table_m
selectivity))) {
LOG_WARN("failed to do calc range selectivity", K(ret));
} else {
selectivity = std::max(selectivity, 1.0 / std::max(ndv, 1.0));
selectivity *= not_null_sel;
}
} else {
@ -1506,6 +1572,115 @@ double ObOptSelectivity::revise_range_sel(double selectivity,
return revise_between_0_1(selectivity);
}
// Raises a range selectivity to account for rows that fall outside the column's
// known [min_val, max_val] bounds when the table has grown beyond the row count
// held in its table meta.
//
// @param table_metas  table metas used to look up the column's table and its not-null selectivity.
// @param ctx          selectivity context; supplies the optimizer context.
// @param col_expr     column the ranges apply to.
// @param ranges       query ranges extracted for the column.
// @param min_val      lower bound of the column's known values.
// @param max_val      upper bound of the column's known values.
// @param selectivity  [in,out] selectivity to refine; clamped by revise_between_0_1 when refined.
// @return OB_SUCCESS, or OB_ERR_UNEXPECTED on a null range, or an error from a callee.
//
// Nothing is changed when the table meta is missing, when either bound is still a
// min/max sentinel value, or when the increase ratio is below OB_DOUBLE_EPSINON.
// On failure `selectivity` is left untouched.
int ObOptSelectivity::refine_out_of_bounds_sel(const OptTableMetas &table_metas,
                                               const OptSelectivityCtx &ctx,
                                               const ObColumnRefRawExpr &col_expr,
                                               const ObQueryRangeArray &ranges,
                                               const ObObj &min_val,
                                               const ObObj &max_val,
                                               double &selectivity)
{
  int ret = OB_SUCCESS;
  uint64_t table_id = col_expr.get_table_id();
  const OptTableMeta *table_meta = table_metas.get_table_meta_by_table_id(table_id);
  double increase_rows_ratio = 0.0;
  double not_null_sel = 0;
  if (NULL == table_meta || min_val.is_min_value() || max_val.is_min_value() ||
      min_val.is_max_value() || max_val.is_max_value()) {
    // do nothing
  } else if (OB_FAIL(table_meta->get_increase_rows_ratio(ctx.get_opt_ctx(), increase_rows_ratio))) {
    LOG_WARN("failed to get extra rows", K(ret));
  } else if (increase_rows_ratio < OB_DOUBLE_EPSINON) {
    // do nothing
  } else if (OB_FAIL(get_column_ndv_and_nns(table_metas, ctx, col_expr, NULL, &not_null_sel))) {
    LOG_WARN("failed to get column ndv and nns", K(ret));
  } else {
    bool contain_inf = false;
    double out_of_bounds_sel = 0.0;
    for (int64_t i = 0; OB_SUCC(ret) && i < ranges.count(); ++i) {
      const ObNewRange *range = NULL;
      if (OB_ISNULL(range = ranges.at(i))) {
        ret = OB_ERR_UNEXPECTED;
        LOG_WARN("get null range", K(ret), K(i));
      } else if (range->is_whole_range() || range->empty() ||
                 1 != range->get_start_key().get_obj_cnt() ||
                 1 != range->get_end_key().get_obj_cnt()) {
        // do nothing
      } else {
        const ObObj &startobj = range->get_start_key().get_obj_ptr()[0];
        const ObObj &endobj = range->get_end_key().get_obj_ptr()[0];
        double tmp_sel = 0.0;
        if (startobj.is_null() && endobj.is_null()) {
          // do nothing
        } else if (startobj.is_min_value() || endobj.is_max_value() ||
                   startobj.is_null() || endobj.is_null()) {
          // Range is open on at least one side; handled with the default ratio below.
          contain_inf = true;
        } else if (OB_FAIL(get_single_range_out_of_bounds_sel(min_val, max_val, startobj, endobj, tmp_sel))) {
          LOG_WARN("failed to calc single out of bounds sel", K(ret));
        } else {
          out_of_bounds_sel += tmp_sel;
        }
      }
    }
    // Only touch the output (and report success) when every range was processed.
    if (OB_SUCC(ret)) {
      // The out-of-bounds share cannot exceed the share of newly added rows.
      selectivity += std::min(out_of_bounds_sel, increase_rows_ratio) * not_null_sel;
      if (contain_inf) {
        selectivity = std::max(selectivity, DEFAULT_OUT_OF_BOUNDS_SEL * increase_rows_ratio * not_null_sel);
      }
      selectivity = revise_between_0_1(selectivity);
      LOG_TRACE("succeed to refine out of bounds selectivity",
                K(selectivity), K(out_of_bounds_sel), K(contain_inf), K(increase_rows_ratio));
    }
  }
  return ret;
}
// Computes how much of a single range [start_val, end_val] lies outside the
// column's known bounds [min_val, max_val], expressed relative to the width of
// those bounds.
//
// @param min_val      lower bound of the column's known values.
// @param max_val      upper bound of the column's known values.
// @param start_val    start of the query range.
// @param end_val      end of the query range.
// @param selectivity  [out] reset to 0.0 first, then:
//                       - 0.0 when the bounds or the range have (near) zero width,
//                         or the range lies inside the bounds;
//                       - 1.0 when the range covers the whole bounds;
//                       - otherwise the width of the part below min or above max,
//                         divided by (max - min). This may exceed 1.0.
// @return OB_SUCCESS, or an error from string truncation / scalar conversion.
//
// All four values are converted to double scalars before being compared.
// NOTE(review): the range ends go through truncate_string_for_opt_stats first,
// presumably to match how strings are stored in statistics; confirm against that helper.
int ObOptSelectivity::get_single_range_out_of_bounds_sel(const ObObj &min_val,
                                                         const ObObj &max_val,
                                                         const ObObj &start_val,
                                                         const ObObj &end_val,
                                                         double &selectivity)
{
  int ret = OB_SUCCESS;
  ObObj *new_start = NULL;
  ObObj *new_end = NULL;
  ObObj min_scalar;
  ObObj max_scalar;
  ObObj start_scalar;
  ObObj end_scalar;
  ObArenaAllocator tmp_alloc("ObOptSel");
  selectivity = 0.0;
  if (OB_FAIL(ObDbmsStatsUtils::truncate_string_for_opt_stats(&start_val, tmp_alloc, new_start)) ||
      OB_FAIL(ObDbmsStatsUtils::truncate_string_for_opt_stats(&end_val, tmp_alloc, new_end))) {
    LOG_WARN("failed to convert valid obj for opt stats", K(ret), K(start_val), K(end_val),
                                                          KPC(new_start), KPC(new_end));
  } else if (OB_FAIL(ObOptEstObjToScalar::convert_objs_to_scalars(
                         &min_val, &max_val, new_start, new_end,
                         &min_scalar, &max_scalar, &start_scalar, &end_scalar))) {
    LOG_WARN("failed to convert objs to scalars", K(ret));
  } else {
    // Distinct names on purpose: the ObObj parameters must not be shadowed.
    const double stat_max = max_scalar.get_double();
    const double stat_min = min_scalar.get_double();
    const double range_start = start_scalar.get_double();
    const double range_end = end_scalar.get_double();
    if (stat_max - stat_min < OB_DOUBLE_EPSINON ||
        range_end - range_start < OB_DOUBLE_EPSINON) {
      // degenerate bounds or range: do nothing
    } else if (range_start <= stat_min && range_end >= stat_max) {
      // include the whole range
      selectivity = 1.0;
    } else if (range_start >= stat_min && range_end <= stat_max) {
      // within the bound
      selectivity = 0.0;
    } else if (range_start < stat_min) {
      // part of the range lies below the known minimum
      selectivity = (std::min(stat_min, range_end) - range_start) / (stat_max - stat_min);
    } else if (range_end > stat_max) {
      // part of the range lies above the known maximum
      selectivity = (range_end - std::max(stat_max, range_start)) / (stat_max - stat_min);
    }
  }
  return ret;
}
int ObOptSelectivity::check_column_in_current_level_stmt(const ObDMLStmt *stmt,
const ObRawExpr &expr)
{

View File

@ -75,8 +75,7 @@ class OptSelectivityCtx
dependency_type_(FilterDependencyType::INDEPENDENT)
{ }
ObOptimizerContext &get_opt_ctx() { return opt_ctx_; }
const ObOptimizerContext &get_opt_ctx() const { return opt_ctx_; }
ObOptimizerContext &get_opt_ctx() const { return const_cast<ObOptimizerContext &>(opt_ctx_); }
const ObDMLStmt *get_stmt() const { return stmt_; }
const ObLogPlan *get_plan() const { return plan_; }
@ -277,7 +276,10 @@ public:
ds_level_(ObDynamicSamplingLevel::NO_DYNAMIC_SAMPLING),
all_used_global_parts_(),
scale_ratio_(1.0),
distinct_rows_(0.0)
distinct_rows_(0.0),
table_partition_info_(NULL),
base_meta_info_(NULL),
real_rows_(-1.0)
{}
int assign(const OptTableMeta &other);
@ -293,7 +295,9 @@ public:
common::ObIArray<uint64_t> &column_ids,
ObIArray<int64_t> &all_used_global_parts,
const double scale_ratio,
const OptSelectivityCtx &ctx);
const OptSelectivityCtx &ctx,
const ObTablePartitionInfo *table_partition_info,
const ObTableMetaInfo *base_meta_info);
// int update_stat(const double rows, const bool can_reduce, const bool can_enlarge);
@ -341,9 +345,17 @@ public:
share::schema::ObTableType get_table_type() const { return table_type_; }
// Reports how far the table's estimated current row count (real_rows_) exceeds the
// row count held in this meta (rows_): (real_rows_ - rows_) / rows_.
// The output is 0.0 when real_rows_ is not larger than rows_ or rows_ is (near) zero.
// real_rows_ is computed on the first call and reused afterwards.
int get_increase_rows_ratio(ObOptimizerContext &ctx, double &increase_rows_ratio) const;
// Forgets the base table information: resets the cached real row count to its
// "not computed" value (-1.0) and drops both base table pointers.
void clear_base_table_info()
{
  real_rows_ = -1.0;
  base_meta_info_ = NULL;
  table_partition_info_ = NULL;
}
TO_STRING_KV(K_(table_id), K_(ref_table_id), K_(table_type), K_(rows), K_(stat_type), K_(ds_level),
K_(all_used_parts), K_(all_used_tablets), K_(pk_ids), K_(column_metas),
K_(all_used_global_parts), K_(scale_ratio), K_(stat_locked), K_(distinct_rows));
K_(all_used_global_parts), K_(scale_ratio), K_(stat_locked), K_(distinct_rows), K_(real_rows));
private:
uint64_t table_id_;
uint64_t ref_table_id_;
@ -365,6 +377,11 @@ private:
// only valid for child stmt meta of set distinct stmt
double distinct_rows_;
// only for base table
const ObTablePartitionInfo *table_partition_info_;
const ObTableMetaInfo *base_meta_info_;
double real_rows_;
};
struct OptSelectivityDSParam {
@ -397,7 +414,9 @@ public:
ObIArray<int64_t> &all_used_global_parts,
const double scale_ratio,
int64_t last_analyzed,
bool is_stat_locked);
bool is_stat_locked,
const ObTablePartitionInfo *table_partition_info,
const ObTableMetaInfo *base_meta_info);
int add_set_child_stmt_meta_info(const ObSelectStmt *parent_stmt,
const ObSelectStmt *child_stmt,
@ -517,6 +536,7 @@ public:
const OptSelectivityCtx &ctx,
const ObColumnRefRawExpr &col_expr,
const ObRawExpr &qual,
const bool need_out_of_bounds,
double &selectivity);
//param:As some expr, query range can't calc range, then range will be (min, max).
@ -526,6 +546,7 @@ public:
const OptSelectivityCtx &ctx,
const ObColumnRefRawExpr &col_expr,
const ObIArray<ObRawExpr* > &quals,
const bool need_out_of_bounds,
double &selectivity);
static int get_column_range_min_max(const OptSelectivityCtx &ctx,
@ -562,6 +583,20 @@ public:
bool include_start,
bool include_end);
static int refine_out_of_bounds_sel(const OptTableMetas &table_metas,
const OptSelectivityCtx &ctx,
const ObColumnRefRawExpr &col_expr,
const ObQueryRangeArray &ranges,
const ObObj &min_val,
const ObObj &max_val,
double &selectivity);
static int get_single_range_out_of_bounds_sel(const ObObj &min_val,
const ObObj &max_val,
const ObObj &start_val,
const ObObj &end_val,
double &selectivity);
static int check_column_in_current_level_stmt(const ObDMLStmt *stmt,
const ObRawExpr &expr);
static int column_in_current_level_stmt(const ObDMLStmt *stmt,

View File

@ -364,7 +364,7 @@ int ObCmpSelEstimator::get_range_cmp_sel(const OptTableMetas &table_metas,
const ObRawExpr *col_expr = left_expr->is_column_ref_expr() ? left_expr : right_expr;
if (OB_FAIL(ObOptSelectivity::get_column_range_sel(table_metas, ctx,
static_cast<const ObColumnRefRawExpr&>(*col_expr),
qual, selectivity))) {
qual, true, selectivity))) {
LOG_WARN("Failed to get column range sel", K(qual), K(ret));
}
} else if (T_OP_ROW == left_expr->get_expr_type() && T_OP_ROW == right_expr->get_expr_type()) {
@ -392,7 +392,7 @@ int ObCmpSelEstimator::get_range_cmp_sel(const OptTableMetas &table_metas,
const ObRawExpr *col_expr = (left_expr->is_column_ref_expr()) ? (left_expr) : (right_expr);
if (OB_FAIL(ObOptSelectivity::get_column_range_sel(table_metas, ctx,
static_cast<const ObColumnRefRawExpr&>(*col_expr),
qual, selectivity))) {
qual, true, selectivity))) {
LOG_WARN("failed to get column range sel", K(ret));
}
} else { /* no dothing */ }
@ -439,7 +439,7 @@ int ObBtwSelEstimator::get_btw_sel(const OptTableMetas &table_metas,
if (NULL != col_expr) {
if (OB_FAIL(ObOptSelectivity::get_column_range_sel(table_metas, ctx,
static_cast<const ObColumnRefRawExpr&>(*col_expr),
qual, selectivity))) {
qual, true, selectivity))) {
LOG_WARN("failed to get column range sel", K(ret));
}
}
@ -1611,7 +1611,7 @@ int ObLikeSelEstimator::get_sel(const OptTableMetas &table_metas,
LOG_WARN("unexpected expr", KPC(variable_));
} else if (OB_FAIL(ObOptSelectivity::get_column_range_sel(table_metas, ctx,
static_cast<const ObColumnRefRawExpr&>(*variable_),
qual, selectivity))) {
qual, false, selectivity))) {
LOG_WARN("Failed to get column range selectivity", K(ret));
}
} else if (is_lob_storage(variable_->get_data_type())) {
@ -1828,7 +1828,8 @@ int ObRangeSelEstimator::get_sel(const OptTableMetas &table_metas,
if (OB_ISNULL(column_expr_) || OB_UNLIKELY(range_exprs_.empty())) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("unexpected expr", KPC(this));
} else if (OB_FAIL(ObOptSelectivity::get_column_range_sel(table_metas, ctx, *column_expr_, range_exprs_, selectivity))) {
} else if (OB_FAIL(ObOptSelectivity::get_column_range_sel(
table_metas, ctx, *column_expr_, range_exprs_, true, selectivity))) {
LOG_WARN("failed to calc qual selectivity", KPC(column_expr_), K(range_exprs_), K(ret));
} else {
selectivity = ObOptSelectivity::revise_between_0_1(selectivity);

View File

@ -211,6 +211,7 @@ struct ObGlobalHint {
//#define COMPAT_VERSION_4_2_1_BP3 (oceanbase::common::cal_version(4, 2, 1, 3))
#define COMPAT_VERSION_4_2_1_BP4 (oceanbase::common::cal_version(4, 2, 1, 4))
#define COMPAT_VERSION_4_2_1_BP5 (oceanbase::common::cal_version(4, 2, 1, 5))
#define COMPAT_VERSION_4_2_1_BP7 (oceanbase::common::cal_version(4, 2, 1, 7))
#define COMPAT_VERSION_4_2_1_BP8 (oceanbase::common::cal_version(4, 2, 1, 8))
#define COMPAT_VERSION_4_2_2 (oceanbase::common::cal_version(4, 2, 2, 0))
#define COMPAT_VERSION_4_2_3 (oceanbase::common::cal_version(4, 2, 3, 0))