diff --git a/deps/oblib/src/lib/charset/ob_charset.cpp b/deps/oblib/src/lib/charset/ob_charset.cpp index 5a0b96282..58e41f88b 100644 --- a/deps/oblib/src/lib/charset/ob_charset.cpp +++ b/deps/oblib/src/lib/charset/ob_charset.cpp @@ -16,6 +16,7 @@ #include "lib/ob_define.h" #include "lib/worker.h" #include "common/ob_common_utility.h" +#include "lib/charset/str_uca_type.h" namespace oceanbase { @@ -1061,7 +1062,8 @@ int ObCharset::like_range(ObCollationType collation_type, char *min_str, size_t *min_str_len, char *max_str, - size_t *max_str_len) + size_t *max_str_len, + size_t *prefix_len /*= NULL*/) { int ret = OB_SUCCESS; if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID || @@ -1099,6 +1101,7 @@ int ObCharset::like_range(ObCollationType collation_type, // 上面的修改会引发这样的问题:'a\0' 会不在范围内,因为mysql的utf8特性使得'a\0' < 'a',所以范围不能这么修改 // 具体的修正还是由存储层来做 size_t res_size = *min_str_len < *max_str_len ? *min_str_len : *max_str_len; + size_t pre_len = 0; if (OB_ISNULL(cs->coll)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("unexpected error. invalid argument(s)", K(cs), K(cs->coll)); @@ -1112,8 +1115,11 @@ int ObCharset::like_range(ObCollationType collation_type, min_str, max_str, min_str_len, - max_str_len)) { + max_str_len, + &pre_len)) { ret = OB_EMPTY_RANGE; + } else if (prefix_len != NULL) { + *prefix_len = pre_len; } else { // *min_str_len = real_len; } @@ -3611,6 +3617,20 @@ bool ObCharset::is_cs_unicode(ObCollationType collation_type) return is_cs_unicode; } +bool ObCharset::is_cs_uca(ObCollationType collation_type) +{ + bool is_cs_uca = false; + if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID || + collation_type >= CS_TYPE_MAX) || + OB_ISNULL(ObCharset::charset_arr[collation_type])) { + LOG_WARN_RET(OB_INVALID_ARGUMENT, "unexpected error. 
invalid argument(s)", K(ret), K(collation_type), K(lbt())); + } else { + ObCharsetInfo *cs = static_cast(ObCharset::charset_arr[collation_type]); + is_cs_uca = (cs->uca != NULL) && (cs->uca->version == UCA_V900); + } + return is_cs_uca; +} + int ObCharset::get_replace_character(ObCollationType collation_type, int32_t &replaced_char_unicode) { int ret = OB_SUCCESS; diff --git a/deps/oblib/src/lib/charset/ob_charset.h b/deps/oblib/src/lib/charset/ob_charset.h index 11c91a51d..59c03cd3e 100644 --- a/deps/oblib/src/lib/charset/ob_charset.h +++ b/deps/oblib/src/lib/charset/ob_charset.h @@ -513,7 +513,8 @@ public: char *min_str, size_t *min_str_len, char *max_str, - size_t *max_str_len); + size_t *max_str_len, + size_t *prefix_len = NULL); static size_t strlen_char(ObCollationType collation_type, const char *str, int64_t str_len); @@ -717,6 +718,7 @@ public: static bool is_cs_nonascii(ObCollationType collation_type); static bool is_cs_unicode(ObCollationType collation_type); + static bool is_cs_uca(ObCollationType collation_type); static int get_replace_character(ObCollationType collation_type, int32_t &replaced_char_unicode); static bool is_cjk_charset(ObCollationType collation_type); static bool is_valid_connection_collation(ObCollationType collation_type); diff --git a/deps/oblib/src/lib/charset/ob_ctype.h b/deps/oblib/src/lib/charset/ob_ctype.h index 4a517c1db..da2331185 100644 --- a/deps/oblib/src/lib/charset/ob_ctype.h +++ b/deps/oblib/src/lib/charset/ob_ctype.h @@ -304,12 +304,14 @@ typedef struct ObCollationHandler //size_t (*strnxfrmlen)(const struct ObCharsetInfo *, size_t); // creates a LIKE range, for optimizer,query range模块使用到了 + // prefix_len should return **byte** length before the first '%' bool (*like_range)(const struct ObCharsetInfo *, const char *s, size_t s_length, pchar w_prefix, pchar w_one, pchar w_many, size_t res_length, char *min_str, char *max_str, - size_t *min_len, size_t *max_len); + size_t *min_len, size_t *max_len, + size_t *prefix_len); 
// wildcard comparison, for LIKE int (*wildcmp)(const struct ObCharsetInfo *, const char *str,const char *str_end, @@ -409,8 +411,8 @@ struct ObCharsetInfo #define ob_strnxfrm(cs, d, dl, s, sl) \ ((cs)->coll->strnxfrm((cs), (d), (dl), (dl), (s), (sl), MY_STRXFRM_PAD_WITH_SPACE)) #define ob_strnncoll(s, a, b, c, d) ((s)->coll->strnncoll((s), (a), (b), (c), (d), 0)) -#define ob_like_range(s, a, b, c, d, e, f, g, h, i, j) \ - ((s)->coll->like_range((s), (a), (b), (c), (d), (e), (f), (g), (h), (i), (j))) +#define ob_like_range(s, a, b, c, d, e, f, g, h, i, j, k) \ + ((s)->coll->like_range((s), (a), (b), (c), (d), (e), (f), (g), (h), (i), (j), (k))) #define ob_wildcmp(cs,s,se,w,we,e,o,m) ((cs)->coll->wildcmp((cs),(s),(se),(w),(we),(e),(o),(m))) #define ob_strcasecmp(s, a, b) ((s)->coll->strcasecmp((s), (a), (b))) #define ob_charpos(cs, b, e, num) (cs)->cset->charpos((cs), (const char*) (b), (const char *)(e), (num)) @@ -551,11 +553,12 @@ size_t ob_scan_8bit(const ObCharsetInfo *cs, const char *b, const char *e, /* For 8-bit character set */ bool ob_like_range_simple(const ObCharsetInfo *cs, - const char *ptr, size_t ptr_length, - pbool escape, pbool w_one, pbool w_many, - size_t res_length, - char *min_str, char *max_str, - size_t *min_length, size_t *max_length); + const char *ptr, size_t ptr_length, + pbool escape, pbool w_one, pbool w_many, + size_t res_length, + char *min_str, char *max_str, + size_t *min_length, size_t *max_length, + size_t *prefix_length); bool ob_propagate_simple(const ObCharsetInfo *cs, const unsigned char *str, size_t len); @@ -576,7 +579,8 @@ bool ob_like_range_mb(const ObCharsetInfo *cs, pbool escape, pbool w_one, pbool w_many, size_t res_length, char *min_str,char *max_str, - size_t *min_length,size_t *max_length); + size_t *min_length,size_t *max_length, + size_t *prefix_length); int ob_wildcmp_mb(const ObCharsetInfo *cs, const char *str,const char *str_end, @@ -675,7 +679,7 @@ bool ob_like_range_generic(const ObCharsetInfo *cs, const char 
*ptr, size_t ptr_length, char escape, char w_one, char w_many, size_t res_length, char *min_str, char *max_str, size_t *min_length, - size_t *max_length); + size_t *max_length, size_t *prefix_length); size_t ob_strnxfrm_unicode(const ObCharsetInfo *cs, unsigned char *dst, size_t dstlen, unsigned int nweights, diff --git a/deps/oblib/src/lib/charset/ob_ctype_mb.cc b/deps/oblib/src/lib/charset/ob_ctype_mb.cc index 66a64ecec..d555f0b6c 100644 --- a/deps/oblib/src/lib/charset/ob_ctype_mb.cc +++ b/deps/oblib/src/lib/charset/ob_ctype_mb.cc @@ -60,13 +60,15 @@ bool ob_like_range_mb_help(const ObCharsetInfo *cs, size_t res_length, char **min_str_,char **max_str_, char **min_org_, char **min_end_, - size_t *min_length,size_t *max_length, char **max_end_) + size_t *min_length,size_t *max_length, + char **max_end_, size_t *prefix_length) { char *min_str = *min_str_; char *max_str = *max_str_; char *min_end = *min_end_; char *max_end = *max_end_; char *min_org = *min_org_; + *prefix_length = (size_t) (min_str - min_org); *min_length = ((!!(cs->state & OB_CS_BINSORT) || cs->pad_attribute == NO_PAD) ? 
(size_t) (min_str - min_org) : res_length); *max_length = res_length; do { @@ -87,7 +89,8 @@ bool ob_like_range_mb(const ObCharsetInfo *cs, pbool escape_char, pbool w_one, pbool w_many, size_t res_length, char *min_str,char *max_str, - size_t *min_length,size_t *max_length) + size_t *min_length,size_t *max_length, + size_t *prefix_length) { unsigned int mb_len; const char *end= ptr + ptr_length; @@ -101,7 +104,7 @@ bool ob_like_range_mb(const ObCharsetInfo *cs, if (*ptr == escape_char && ptr+1 != end) { ptr++; } else if (*ptr == w_one || *ptr == w_many) { - return ob_like_range_mb_help(cs,res_length, &min_str,&max_str, &min_org, &min_end, min_length, max_length, &max_end); + return ob_like_range_mb_help(cs,res_length, &min_str,&max_str, &min_org, &min_end, min_length, max_length, &max_end, prefix_length); } mb_len= ob_ismbchar(cs, ptr, end); if ( mb_len > 1) { @@ -116,11 +119,11 @@ bool ob_like_range_mb(const ObCharsetInfo *cs, if (contractions && ptr + 1 < end && ob_uca_can_be_contraction_head(contractions, (unsigned char) *ptr)) { if (ptr[1] == w_one || ptr[1] == w_many) { - return ob_like_range_mb_help(cs,res_length, &min_str,&max_str, &min_org, &min_end, min_length, max_length, &max_end); + return ob_like_range_mb_help(cs,res_length, &min_str,&max_str, &min_org, &min_end, min_length, max_length, &max_end, prefix_length); } else if (ob_uca_can_be_contraction_tail(contractions, (unsigned char) ptr[1]) && ob_uca_contraction2_weight(contractions, (unsigned char) ptr[0], ptr[1])) { if (max_char_len == 1 || min_str + 1 >= min_end) { - return ob_like_range_mb_help(cs,res_length, &min_str,&max_str, &min_org, &min_end, min_length, max_length, &max_end); + return ob_like_range_mb_help(cs,res_length, &min_str,&max_str, &min_org, &min_end, min_length, max_length, &max_end, prefix_length); } max_char_len--; *min_str++= *max_str++= *ptr++; @@ -130,7 +133,7 @@ bool ob_like_range_mb(const ObCharsetInfo *cs, } } - *min_length= *max_length = (size_t) (min_str - min_org); + 
*min_length= *max_length = *prefix_length = (size_t) (min_str - min_org); while (min_end != min_str) { *min_str++= *max_str++= ' '; } diff --git a/deps/oblib/src/lib/charset/ob_ctype_simple.cc b/deps/oblib/src/lib/charset/ob_ctype_simple.cc index edc3c210f..b38c7ccb1 100644 --- a/deps/oblib/src/lib/charset/ob_ctype_simple.cc +++ b/deps/oblib/src/lib/charset/ob_ctype_simple.cc @@ -688,7 +688,8 @@ bool ob_like_range_simple(const ObCharsetInfo *cs, pbool escape_char, pbool w_one, pbool w_many, size_t res_len, char *min_str,char *max_str, - size_t *min_len, size_t *max_len) + size_t *min_len, size_t *max_len, + size_t *prefix_len) { const char *end= ptr + ptr_len; char *min_org=min_str; @@ -705,6 +706,7 @@ bool ob_like_range_simple(const ObCharsetInfo *cs, *max_str++= (char) cs->max_sort_char; continue; } else if (*ptr == w_many) { + *prefix_len = min_str - min_org; *min_len= ((cs->state & OB_CS_BINSORT) ? (size_t) (min_str - min_org) : res_len); @@ -718,7 +720,7 @@ bool ob_like_range_simple(const ObCharsetInfo *cs, *min_str++= *max_str++ = *ptr; } - *min_len= *max_len = (size_t) (min_str - min_org); + *min_len= *max_len= *prefix_len = (size_t) (min_str - min_org); while (min_str != min_end) { *min_str++= *max_str++ = ' '; } diff --git a/deps/oblib/src/lib/charset/ob_ctype_utf16.cc b/deps/oblib/src/lib/charset/ob_ctype_utf16.cc index 510cd59a5..91c4555a8 100644 --- a/deps/oblib/src/lib/charset/ob_ctype_utf16.cc +++ b/deps/oblib/src/lib/charset/ob_ctype_utf16.cc @@ -997,7 +997,8 @@ ob_like_range_generic(const ObCharsetInfo *cs, char escape_char, char w_one, char w_many, size_t res_length, char *min_str,char *max_str, - size_t *min_length,size_t *max_length) + size_t *min_length,size_t *max_length, + size_t *prefix_length) { const char *min_org = min_str; const char *max_org = max_str; @@ -1058,6 +1059,7 @@ ob_like_range_generic(const ObCharsetInfo *cs, continue; } } else if ((ob_wc_t) w_many == wc) { + *prefix_length = (size_t) (min_str - min_org); *min_length= 
((cs->state & OB_CS_BINSORT) ? (size_t) (min_str - min_org) : res_length); *max_length= res_length; goto PAD_MIN_MAX; @@ -1111,6 +1113,7 @@ ob_like_range_generic(const ObCharsetInfo *cs, } PAD_SET_LEN: + *prefix_length = (size_t) (min_str - min_org); *min_length= (size_t) (min_str - min_org); *max_length= (size_t) (max_str - max_org); diff --git a/deps/oblib/unittest/lib/charset/test_charset.cpp b/deps/oblib/unittest/lib/charset/test_charset.cpp index f4ec0fe77..8314aec24 100644 --- a/deps/oblib/unittest/lib/charset/test_charset.cpp +++ b/deps/oblib/unittest/lib/charset/test_charset.cpp @@ -908,9 +908,9 @@ TEST_F(TestCharset, basic_collation_handler_test) if (OB_NOT_NULL(cs->coll->like_range)) { char temp1[100]; char temp2[100]; - size_t len1, len2; + size_t len1, len2, prefix_len; fprintf(stdout, ">> like_range = %d for text = \"%s\", min = %.*s, max = %.*s\n", - cs->coll->like_range(cs, str, end-str, '\\', '_', '%', 100, temp1, temp2, &len1, &len2), utf8_str, + cs->coll->like_range(cs, str, end-str, '\\', '_', '%', 100, temp1, temp2, &len1, &len2, &prefix_len), utf8_str, (int)len1, temp1, (int)len2, temp2); } if (OB_NOT_NULL(cs->coll->wildcmp)) { diff --git a/src/sql/engine/expr/ob_expr_inner_decode_like.cpp b/src/sql/engine/expr/ob_expr_inner_decode_like.cpp index a6e55e41d..334603522 100644 --- a/src/sql/engine/expr/ob_expr_inner_decode_like.cpp +++ b/src/sql/engine/expr/ob_expr_inner_decode_like.cpp @@ -111,11 +111,20 @@ int ObExprInnerDecodeLike::eval_inner_decode_like(const ObExpr &expr, ObEvalCtx LOG_WARN("failed to check escape length", K(escape_str), K(escape_str.length())); LOG_USER_ERROR(OB_INVALID_ARGUMENT, "ESCAPE"); } else { + // For a pattern like 'aaa%' that ends with `%`, we will extract a precise range with some special handling: + // We need to fill the end key of the like range with the maximum character + // up to the target column's length to match the semantics of `%`. 
+ // However, when the target column length is less than the effective prefix length of the pattern, + // the pattern gets truncated, resulting in an imprecise range and incorrect results. + // So, we need to ensure that the effective prefix of the pattern is not truncated + // to guarantee that the range is always precise. + int32_t range_str_len = col_len; //convert character counts to len in bytes - col_len = static_cast(col_len * mbmaxlen); - size_t min_str_len = col_len; - size_t max_str_len = col_len; + range_str_len = static_cast(range_str_len * mbmaxlen); + size_t min_str_len = range_str_len; + size_t max_str_len = range_str_len; size_t res_len = 0; + size_t prefix_len = 0; int32_t start_flag = is_start->get_int(); common::ObArenaAllocator &temp_allocator = tmp_alloc_g.get_allocator(); char *min_str_buf = NULL; @@ -133,26 +142,59 @@ int ObExprInnerDecodeLike::eval_inner_decode_like(const ObExpr &expr, ObEvalCtx static_cast(min_str_buf), &min_str_len, static_cast(max_str_buf), - &max_str_len))) { + &max_str_len, + &prefix_len))) { ret = OB_ERR_UNEXPECTED; LOG_WARN("calc like range failed", K(ret), K(pattern_str), K(escape_str), K(cs_type)); } else { - ObExprStrResAlloc res_alloc(expr, ctx); - char *buf = NULL; - if (is_start->get_int() == 1) { - res_buf = min_str_buf; - res_len = min_str_len; - } else { - res_buf = max_str_buf; - res_len = max_str_len; + if (prefix_len >= col_len && ObCharset::strlen_char(cs_type, min_str_buf, prefix_len) >= col_len) { + int32_t pattern_prefix_len = 0; // strlen_char of prefix + if (OB_FAIL(get_pattern_prefix_len(cs_type, + escape_str, + pattern_str, + pattern_prefix_len))) { + LOG_WARN("failed to get pattern prefix len", K(ret), K(pattern_str), K(escape_str)); + } else { + range_str_len = max(col_len, pattern_prefix_len); + range_str_len = static_cast(range_str_len * mbmaxlen); + min_str_len = range_str_len; + max_str_len = range_str_len; + if (OB_ISNULL(min_str_buf = (char*)temp_allocator.alloc(min_str_len))) { + ret = 
OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("alloc memory failed", K(min_str_len)); + } else if (OB_ISNULL(max_str_buf = (char*)temp_allocator.alloc(max_str_len))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("alloc memory failed", K(max_str_len)); + } else if (OB_FAIL(ObCharset::like_range(cs_type, + pattern_str, + *(escape_str.ptr()), + static_cast(min_str_buf), + &min_str_len, + static_cast(max_str_buf), + &max_str_len))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("calc like range failed", K(ret), K(pattern_str), K(escape_str), K(cs_type)); + } + } } - buf = (char*)res_alloc.alloc(res_len); - if (OB_ISNULL(buf)) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_WARN("alloc memory failed", K(ret), K(min_str_len)); - } else { - MEMCPY(buf, res_buf, res_len); - expr_datum.set_string(buf, res_len); + if (OB_SUCC(ret)) { + ObExprStrResAlloc res_alloc(expr, ctx); + char *buf = NULL; + if (is_start->get_int() == 1) { + res_buf = min_str_buf; + res_len = min_str_len; + } else { + res_buf = max_str_buf; + res_len = max_str_len; + } + buf = (char*)res_alloc.alloc(res_len); + if (OB_ISNULL(buf)) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("alloc memory failed", K(ret), K(min_str_len)); + } else { + MEMCPY(buf, res_buf, res_len); + expr_datum.set_string(buf, res_len); + } } } } @@ -226,5 +268,46 @@ int ObExprInnerDecodeLike::cast_like_obj_if_needed(ObEvalCtx &ctx, const ObExpr } return ret; } + +int ObExprInnerDecodeLike::get_pattern_prefix_len(const ObCollationType &cs_type, + const ObString &escape_str, + const ObString &pattern_str, + int32_t &pattern_prefix_len) +{ + int ret = OB_SUCCESS; + int64_t mbmaxlen = 1; + pattern_prefix_len = 0; + if (OB_NOT_NULL(pattern_str.ptr()) && OB_NOT_NULL(escape_str.ptr()) && escape_str.length() == 1 && + cs_type != CS_TYPE_INVALID && cs_type < CS_TYPE_MAX) { + if (OB_FAIL(ObCharset::get_mbmaxlen_by_coll(cs_type, mbmaxlen))) { + LOG_WARN("fail to get mbmaxlen", K(ret), K(cs_type)); + } else { + ObArenaAllocator allocator; + size_t pattern_len = 
pattern_str.length(); + pattern_len = static_cast(pattern_len * mbmaxlen); + size_t min_str_len = pattern_len; + size_t max_str_len = pattern_len; + size_t prefix_len = pattern_len; + char *min_str_buf = NULL; + char *max_str_buf = NULL; + if (OB_ISNULL(min_str_buf = (char *)allocator.alloc(min_str_len))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("no enough memory", K(ret), K(pattern_len)); + } else if (OB_ISNULL(max_str_buf = (char *)allocator.alloc(max_str_len))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("no enough memory", K(ret), K(pattern_len)); + } else if (OB_FAIL(ObCharset::like_range(cs_type, pattern_str, *(escape_str.ptr()), + min_str_buf, &min_str_len, + max_str_buf, &max_str_len, + &prefix_len))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("failed to retrive like range", K(ret)); + } else { + pattern_prefix_len = ObCharset::strlen_char(cs_type, min_str_buf, prefix_len); + } + } + } + return ret; +} } } \ No newline at end of file diff --git a/src/sql/engine/expr/ob_expr_inner_decode_like.h b/src/sql/engine/expr/ob_expr_inner_decode_like.h index c4a4ba746..d7177834a 100644 --- a/src/sql/engine/expr/ob_expr_inner_decode_like.h +++ b/src/sql/engine/expr/ob_expr_inner_decode_like.h @@ -40,6 +40,11 @@ public: private: static int cast_like_obj_if_needed(ObEvalCtx &ctx, const ObExpr &pattern_expr, ObDatum *pattern_datum, const ObExpr &dst_expr, ObDatum * &cast_datum); + // get prefix string (without wildcards) length of like pattern + static int get_pattern_prefix_len(const ObCollationType &cs_type, + const ObString &escape_str, + const ObString &pattern_str, + int32_t &pattern_prefix_len); DISALLOW_COPY_AND_ASSIGN(ObExprInnerDecodeLike) const; }; } // namespace sql diff --git a/src/sql/rewrite/ob_query_range.cpp b/src/sql/rewrite/ob_query_range.cpp index 340bad926..dfce4a385 100644 --- a/src/sql/rewrite/ob_query_range.cpp +++ b/src/sql/rewrite/ob_query_range.cpp @@ -8424,6 +8424,9 @@ int ObQueryRange::get_like_range(const ObObj &pattern, void 
*max_str_buf = NULL; int32_t col_len = out_key_part.pos_.column_type_.get_accuracy().get_length(); ObCollationType cs_type = out_key_part.pos_.column_type_.get_collation_type(); + int32_t pattern_prefix_len = 0; + int32_t range_str_len = 0; + size_t prefix_len = 0; size_t min_str_len = 0; size_t max_str_len = 0; ObObj pattern_buf_obj; @@ -8474,6 +8477,7 @@ int ObQueryRange::get_like_range(const ObObj &pattern, } else if (escape_str.empty()) { escape_str.assign_ptr("\\", 1); } else { /* do nothing */ } + if (OB_FAIL(ret)) { // do nothing; } else if (OB_FAIL(ObCharset::get_mbmaxlen_by_coll(cs_type, mbmaxlen))) { @@ -8485,13 +8489,26 @@ int ObQueryRange::get_like_range(const ObObj &pattern, ret = OB_INVALID_ARGUMENT; LOG_WARN("failed to check escape length", K(escape_str), K(escape_str.length())); LOG_USER_ERROR(OB_INVALID_ARGUMENT, "ESCAPE"); - } else { } + } else if (OB_FAIL(get_pattern_prefix_len(cs_type, + escape_str, + pattern_str, + pattern_prefix_len))) { + LOG_WARN("failed to get pattern prefix len", K(ret), K(pattern_str), K(escape_str)); + } if (OB_SUCC(ret)) { + // For a pattern like 'aaa%' that ends with `%`, we will extract a precise range with some special handling: + // We need to fill the end key of the like range with the maximum character + // up to the target column's length to match the semantics of `%`. + // However, when the target column length is less than the effective prefix length of the pattern, + // the pattern gets truncated, resulting in an imprecise range and incorrect results. + // So, we need to ensure that the effective prefix of the pattern is not truncated + // to guarantee that the range is always precise. 
+ range_str_len = col_len; //convert character counts to len in bytes - col_len = static_cast(col_len * mbmaxlen); - min_str_len = col_len; - max_str_len = col_len; + range_str_len = static_cast(range_str_len * mbmaxlen); + min_str_len = range_str_len; + max_str_len = range_str_len; if (OB_ISNULL(min_str_buf = allocator_.alloc(min_str_len))) { ret = OB_ALLOCATE_MEMORY_FAILED; LOG_ERROR("alloc memory failed", K(min_str_len)); @@ -8504,7 +8521,8 @@ int ObQueryRange::get_like_range(const ObObj &pattern, static_cast(min_str_buf), &min_str_len, static_cast(max_str_buf), - &max_str_len))) { + &max_str_len, + &prefix_len))) { //set whole range out_key_part.normal_keypart_->start_.set_min_value(); out_key_part.normal_keypart_->end_.set_max_value(); @@ -8514,25 +8532,57 @@ int ObQueryRange::get_like_range(const ObObj &pattern, out_key_part.normal_keypart_->always_true_ = true; ret = OB_SUCCESS; } else { - ObObj &start = out_key_part.normal_keypart_->start_; - ObObj &end = out_key_part.normal_keypart_->end_; - start.set_collation_type(out_key_part.pos_.column_type_.get_collation_type()); - start.set_string(out_key_part.pos_.column_type_.get_type(), - static_cast(min_str_buf), static_cast(min_str_len)); - end.set_collation_type(out_key_part.pos_.column_type_.get_collation_type()); - end.set_string(out_key_part.pos_.column_type_.get_type(), - static_cast(max_str_buf), static_cast(max_str_len)); - out_key_part.normal_keypart_->include_start_ = true; - out_key_part.normal_keypart_->include_end_ = true; - out_key_part.normal_keypart_->always_false_ = false; - out_key_part.normal_keypart_->always_true_ = false; + if (prefix_len >= col_len && ObCharset::strlen_char(cs_type, static_cast(min_str_buf), prefix_len) >= col_len) { + int32_t pattern_prefix_len = 0; // strlen_char of prefix + if (OB_FAIL(get_pattern_prefix_len(cs_type, + escape_str, + pattern_str, + pattern_prefix_len))) { + LOG_WARN("failed to get pattern prefix len", K(ret), K(pattern_str), K(escape_str)); + } else { + 
range_str_len = max(col_len, pattern_prefix_len); + range_str_len = static_cast(range_str_len * mbmaxlen); + min_str_len = range_str_len; + max_str_len = range_str_len; + if (OB_ISNULL(min_str_buf = allocator_.alloc(min_str_len))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("alloc memory failed", K(min_str_len)); + } else if (OB_ISNULL(max_str_buf = allocator_.alloc(max_str_len))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("alloc memory failed", K(max_str_len)); + } else if (OB_FAIL(ObCharset::like_range(cs_type, + pattern_str, + *(escape_str.ptr()), + static_cast(min_str_buf), + &min_str_len, + static_cast(max_str_buf), + &max_str_len))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("calc like range failed", K(ret), K(pattern_str), K(escape_str), K(cs_type)); + } + } + } + if (OB_SUCC(ret)) { + ObObj &start = out_key_part.normal_keypart_->start_; + ObObj &end = out_key_part.normal_keypart_->end_; + start.set_collation_type(out_key_part.pos_.column_type_.get_collation_type()); + start.set_string(out_key_part.pos_.column_type_.get_type(), + static_cast(min_str_buf), static_cast(min_str_len)); + end.set_collation_type(out_key_part.pos_.column_type_.get_collation_type()); + end.set_string(out_key_part.pos_.column_type_.get_type(), + static_cast(max_str_buf), static_cast(max_str_len)); + out_key_part.normal_keypart_->include_start_ = true; + out_key_part.normal_keypart_->include_end_ = true; + out_key_part.normal_keypart_->always_false_ = false; + out_key_part.normal_keypart_->always_true_ = false; - /// check if is precise - if (NULL != query_range_ctx_) { - query_range_ctx_->cur_expr_is_precise_ = - ObQueryRange::check_like_range_precise(pattern_str, - static_cast(max_str_buf), - max_str_len, *(escape_str.ptr())); + /// check if is precise + if (NULL != query_range_ctx_) { + query_range_ctx_->cur_expr_is_precise_ = + ObQueryRange::check_like_range_precise(pattern_str, + static_cast(max_str_buf), + max_str_len, *(escape_str.ptr())); + } } } if (NULL != min_str_buf) 
{ @@ -9611,7 +9661,8 @@ int ObQueryRange::is_precise_like_range(const ObObjParam &pattern, char escape, if (pattern.is_string_type()) { ObString pattern_str = pattern.get_string(); if (cs_type == CS_TYPE_INVALID || cs_type >= CS_TYPE_MAX) { - }else if (OB_FAIL(ObCharset::get_mbmaxlen_by_coll(cs_type, mbmaxlen))) { + } else if (ObCharset::is_cs_uca(cs_type)) { + } else if (OB_FAIL(ObCharset::get_mbmaxlen_by_coll(cs_type, mbmaxlen))) { LOG_WARN("fail to get mbmaxlen", K(ret), K(cs_type), K(escape)); } else { ObArenaAllocator allocator; @@ -9646,6 +9697,47 @@ int ObQueryRange::is_precise_like_range(const ObObjParam &pattern, char escape, return ret; } +int ObQueryRange::get_pattern_prefix_len(const ObCollationType &cs_type, + const ObString &escape_str, + const ObString &pattern_str, + int32_t &pattern_prefix_len) +{ + int ret = OB_SUCCESS; + int64_t mbmaxlen = 1; + pattern_prefix_len = 0; + if (OB_NOT_NULL(pattern_str.ptr()) && OB_NOT_NULL(escape_str.ptr()) && escape_str.length() == 1 && + cs_type != CS_TYPE_INVALID && cs_type < CS_TYPE_MAX) { + if (OB_FAIL(ObCharset::get_mbmaxlen_by_coll(cs_type, mbmaxlen))) { + LOG_WARN("fail to get mbmaxlen", K(ret), K(cs_type)); + } else { + ObArenaAllocator allocator; + size_t pattern_len = pattern_str.length(); + pattern_len = static_cast(pattern_len * mbmaxlen); + size_t prefix_len = pattern_len; + size_t min_str_len = pattern_len; + size_t max_str_len = pattern_len; + char *min_str_buf = NULL; + char *max_str_buf = NULL; + if (OB_ISNULL(min_str_buf = (char *)allocator.alloc(min_str_len))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("no enough memory", K(ret), K(pattern_len)); + } else if (OB_ISNULL(max_str_buf = (char *)allocator.alloc(max_str_len))) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_WARN("no enough memory", K(ret), K(pattern_len)); + } else if (OB_FAIL(ObCharset::like_range(cs_type, pattern_str, *(escape_str.ptr()), + min_str_buf, &min_str_len, + max_str_buf, &max_str_len, + &prefix_len))) { + ret = 
OB_ERR_UNEXPECTED; + LOG_WARN("failed to retrive like range", K(ret)); + } else { + pattern_prefix_len = ObCharset::strlen_char(cs_type, min_str_buf, prefix_len); + } + } + } + return ret; +} + int ObQueryRange::get_calculable_expr_val(const ObRawExpr *expr, ObObj &val, bool &is_valid, diff --git a/src/sql/rewrite/ob_query_range.h b/src/sql/rewrite/ob_query_range.h index 6164b40bd..dbe723c43 100644 --- a/src/sql/rewrite/ob_query_range.h +++ b/src/sql/rewrite/ob_query_range.h @@ -554,6 +554,11 @@ public: int64_t &range_prefix_count, bool &contain_always_false) const; virtual bool is_fast_nlj_range() const { return false; } + // get prefix string (without wildcards) length of like pattern + static int get_pattern_prefix_len(const ObCollationType &cs_type, + const ObString &escape_str, + const ObString &pattern_str, + int32_t &pattern_prefix_len); private: int init_query_range_ctx(common::ObIAllocator &allocator,