/** * Copyright (c) 2021 OceanBase * OceanBase CE is licensed under Mulan PubL v2. * You can use this software according to the terms and conditions of the Mulan PubL v2. * You may obtain a copy of Mulan PubL v2 at: * http://license.coscl.org.cn/MulanPubL-2.0 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. * See the Mulan PubL v2 for more details. */ #define USING_LOG_PREFIX SQL_ENG #include "sql/engine/expr/ob_expr_like.h" //#include "sql/engine/expr/ob_expr_promotion_util.h" #include "sql/engine/ob_exec_context.h" #include "share/object/ob_obj_cast.h" #include "lib/oblog/ob_log.h" #include "sql/session/ob_sql_session_info.h" #include "sql/engine/expr/ob_expr_lob_utils.h" namespace oceanbase { using namespace common; namespace common { OB_DECLARE_AVX2_SPECIFIC_CODE( class StringSearcher { private: static constexpr int AVX2_SIZE = sizeof(__m256i); public: StringSearcher() : pattern_(nullptr), pattern_end_(nullptr), pattern_len_(0) {} inline int init(const char *pattern, size_t len) { int ret = OB_SUCCESS; if (nullptr == pattern || 0 == len) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument. pattern is null.", K(ret), K(pattern), K(len)); } else { pattern_ = pattern; pattern_end_ = pattern_ + len; pattern_len_ = len; first_ = *pattern; vfirst_ = _mm256_set1_epi8(first_); if (2 <= pattern_len_) { last_ = *(pattern_end_ - 1); vlast_ = _mm256_set1_epi8(last_); } } return ret; } inline const char *get_pattern() { return pattern_; } inline const char *get_patterne_end() { return pattern_end_; } inline size_t get_pattern_length() { return pattern_len_; } public: // Determines if `pattern_` is a substring of `text`. inline int is_substring(const char *text, const char *text_end, bool &res) const { int ret = OB_SUCCESS; res = false; const char *text_cur = text; // `pattern_` will not be null because it is prepared in `set_instr_info()`. if (nullptr == pattern_ || 0 == pattern_len_) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument. pattern_ is null.", K(ret), K(pattern_), K(pattern_len_)); } else if (text == text_end) { // `text` is NULL, so `res` will be false. } else if (1 == pattern_len_) { // Here is the quick path when `pattern_len_` is 1. // All elements of `vfirst_`(__m128i) are first byte of `pattern_`. // We will compare `text` and `vfirst_` using avx(__mm256i) in each iteration. // Align the end of `text` based on `AVX2_SIZE`. const char *avx_end = text + ((text_end - text) & ~(AVX2_SIZE - 1)); for (; text_cur < avx_end; text_cur += AVX2_SIZE) { __m256i first_block = _mm256_loadu_si256(reinterpret_cast(text_cur)); __m256i first_cmp = _mm256_cmpeq_epi8(first_block, vfirst_); uint32_t mask = _mm256_movemask_epi8(first_cmp); if (0 != mask) { res = true; break; } } } else { // Here is the common path when `pattern_len_` is greater than 1. // First, find positions in `text` that are equal to the first and last byte of `pattern_` // at same time using avx(__mm256i). // Then, use func `memequal_opt` to compare middle bytes of `pattern_` and corresponding // bytes of `text`. // Align the end of `text` based on `AVX2_SIZE` and `pattern_len_`. const char *avx_end = text + ((text_end - (text + pattern_len_ - 1)) & ~(AVX2_SIZE - 1)); for (; !res && text_cur < avx_end; text_cur += AVX2_SIZE) { const char *last_cur = text_cur + pattern_len_ - 1; __m256i first_block = _mm256_loadu_si256(reinterpret_cast(text_cur)); __m256i last_block = _mm256_loadu_si256(reinterpret_cast(last_cur)); __m256i first_cmp = _mm256_cmpeq_epi8(first_block, vfirst_); __m256i last_cmp = _mm256_cmpeq_epi8(last_block, vlast_); uint32_t mask = _mm256_movemask_epi8(_mm256_and_si256(first_cmp, last_cmp)); while (mask != 0) { int offset = __builtin_ctz(mask); // The first and the last bytes match, so we don't need to compare them again. if (2 == pattern_len_ || memequal_opt(text_cur + offset + 1, pattern_ + 1, pattern_len_ - 2)) { res = true; break; } mask &= (mask - 1); } } } // Handle the tail of text. if (!res && text_end - text_cur >= pattern_len_) { res = NULL != MEMMEM(text_cur, text_end - text_cur, pattern_, pattern_len_); } return ret; } // Determines if `text` starts with `pattern_`. inline int start_with(const char *text, const char *text_end, bool &res) const { int ret = OB_SUCCESS; // pattern_ will not be null because it is prepared in set_instr_info(). if (nullptr == pattern_ || 0 == pattern_len_) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument. pattern_ is null.", K(ret), K(pattern_), K(pattern_len_)); } else if (pattern_len_ > text_end - text) { res = false; } else { res = memequal_opt(text, pattern_, pattern_len_); } return ret; } // Determines if `text` ends with `pattern_`. inline int end_with(const char *text, const char *text_end, bool &res) const { int ret = OB_SUCCESS; // pattern_ will not be null because it is prepared in set_instr_info(). if (nullptr == pattern_ || 0 == pattern_len_) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument. pattern_ is null.", K(ret), K(pattern_), K(pattern_len_)); } else if (pattern_len_ > text_end - text) { res = false; } else { res = memequal_opt(text_end - pattern_len_, pattern_, pattern_len_); } return ret; } // Determines if `text` equals with `pattern_`. inline int equal(const char *text, const char *text_end, bool &res) const { int ret = OB_SUCCESS; // pattern_ will not be null because it is prepared in set_instr_info(). if (nullptr == pattern_ || 0 == pattern_len_) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument. pattern_ is null.", K(ret), K(pattern_), K(pattern_len_)); } else if (pattern_len_ != text_end - text) { res = false; } else { res = memequal_opt(text, pattern_, pattern_len_); } return res; } inline bool memequal_opt(const char *s1, const char *s2, size_t n) const { switch (n) { case 1: return *s1 == *s2; case 2: return memequal_plain(s1, s2); case 3: return memequal_plain(s1, s2) && memequal_plain(s1 + 2, s2 + 2); case 4: return memequal_plain(s1, s2); case 5: return memequal_plain(s1, s2) && memequal_plain(s1 + 4, s2 + 4); case 6: return memequal_plain(s1, s2) && memequal_plain(s1 + 4, s2 + 4); case 7: return memequal_plain(s1, s2) && memequal_plain(s1 + 4, s2 + 4) && memequal_plain(s1 + 6, s2 + 6); case 8: return memequal_plain(s1, s2); default: break; } if (n <= 16) { return memequal_plain(s1, s2) && memequal_plain(s1 + n - 8, s2 + n - 8); } while (n >= 64) { if (memequal_sse<4>(s1, s2)) { s1 += 64; s2 += 64; n -= 64; } else { return false; } } switch (n / 16) { case 3: if (!memequal_sse<1>(s1 + 32, s2 + 32)) { return false; } case 2: if (!memequal_sse<1>(s1 + 16, s2 + 16)) { return false; } case 1: if (!memequal_sse<1>(s1, s2)) { return false; } } return memequal_sse<1>(s1 + n - 16, s2 + n - 16); } // compare the values of two int8_t, int16_t or other comparable plain types. template OB_INLINE bool memequal_plain(const char *p1, const char *p2) const { return *reinterpret_cast(p1) == *reinterpret_cast(p2); } // compare two values by sse, cnt means the count of __m128i to compare. template OB_INLINE bool memequal_sse(const char *p1, const char *p2) const { if (cnt == 1) { return 0xFFFF == _mm_movemask_epi8( _mm_cmpeq_epi8( _mm_loadu_si128(reinterpret_cast(p1)), _mm_loadu_si128(reinterpret_cast(p2)))); } if (cnt == 4) { return 0xFFFF == _mm_movemask_epi8( _mm_and_si128( _mm_and_si128( _mm_cmpeq_epi8( _mm_loadu_si128(reinterpret_cast(p1)), _mm_loadu_si128(reinterpret_cast(p2))), _mm_cmpeq_epi8( _mm_loadu_si128(reinterpret_cast(p1) + 1), _mm_loadu_si128(reinterpret_cast(p2) + 1))), _mm_and_si128( _mm_cmpeq_epi8( _mm_loadu_si128(reinterpret_cast(p1) + 2), _mm_loadu_si128(reinterpret_cast(p2) + 2)), _mm_cmpeq_epi8( _mm_loadu_si128(reinterpret_cast(p1) + 3), _mm_loadu_si128(reinterpret_cast(p2) + 3))))); } } private: // string to be searched for const char *pattern_; const char *pattern_end_; size_t pattern_len_; // first or last byte of `pattern_` uint8_t first_; uint8_t last_; // vector filled `first_` or `last_` __m256i vfirst_; __m256i vlast_; }; ) } namespace sql { #define PERCENT_SIGN_START(mode) (START_WITH_PERCENT_SIGN == mode || START_END_WITH_PERCENT_SIGN == mode) #define PERCENT_SIGN_END(mode) (END_WITH_PERCENT_SIGN == mode || START_END_WITH_PERCENT_SIGN == mode) int ObExprLike::InstrInfo::record_pattern(char *&pattern_buf, const ObString &pattern) { int ret = OB_SUCCESS; if (OB_LIKELY(pattern.length() <= instr_buf_length_)) { pattern_buf = instr_buf_; } else if (OB_ISNULL(allocator_)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("allocator is null", K(ret)); } else if (OB_ISNULL(pattern_buf = (char*)(allocator_->alloc(sizeof(char) * pattern.length() * 2)))) { ret = OB_ALLOCATE_MEMORY_FAILED; LOG_WARN("No more memories", K(ret)); } else { instr_buf_ = pattern_buf; instr_buf_length_ = pattern.length() * 2; } if (OB_SUCC(ret)) { MEMCPY(pattern_buf, pattern.ptr(), pattern.length()); } return ret; } int ObExprLike::InstrInfo::add_instr_info(const char *start, const uint32_t length) { int ret = OB_SUCCESS; if (OB_LIKELY(instr_cnt_ < instr_info_buf_size_)) { instr_starts_[instr_cnt_] = start; instr_lengths_[instr_cnt_] = length; instr_cnt_++; } else { const uint32_t init_buf_size = 8; const uint32_t new_buf_size = MAX(init_buf_size, instr_info_buf_size_ * 2); const char **new_instr_starts = NULL; uint32_t *new_instr_lengths = NULL; if (OB_ISNULL(new_instr_starts = static_cast(allocator_->alloc(sizeof(char *) * new_buf_size))) || OB_ISNULL(new_instr_lengths = static_cast(allocator_->alloc(sizeof(uint32_t) * new_buf_size)))) { ret = OB_ALLOCATE_MEMORY_FAILED; LOG_WARN("allocator memory failed", K(ret), K(new_instr_starts), K(new_buf_size)); } else { MEMCPY(new_instr_starts, instr_starts_, sizeof(char *) * instr_cnt_); MEMCPY(new_instr_lengths, instr_lengths_, sizeof(uint32_t) * instr_cnt_); instr_info_buf_size_ = new_buf_size; instr_starts_ = new_instr_starts; instr_lengths_ = new_instr_lengths; instr_starts_[instr_cnt_] = start; instr_lengths_[instr_cnt_] = length; instr_cnt_++; } } return ret; } ObExprLike::ObExprLike(ObIAllocator &alloc) : ObFuncExprOperator(alloc, T_OP_LIKE, N_LIKE, 3, VALID_FOR_GENERATED_COL, NOT_ROW_DIMENSION), is_pattern_literal_(false), is_text_literal_(true), is_escape_literal_(false), like_id_(-1) { need_charset_convert_ = false; } ObExprLike::~ObExprLike() { } int ObExprLike::assign(const ObExprOperator &other) { int ret = OB_SUCCESS; const ObExprLike *tmp_other = dynamic_cast(&other); if (OB_UNLIKELY(NULL == tmp_other)) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument. wrong type for other", K(ret), K(other)); } else if (OB_LIKELY(this != tmp_other)) { if (OB_FAIL(ObFuncExprOperator::assign(other))) { LOG_WARN("copy in Base class ObFuncExprOperator failed", K(ret)); } else { this->is_pattern_literal_ = tmp_other->is_pattern_literal_; this->is_text_literal_ = tmp_other->is_text_literal_; this->is_escape_literal_ = tmp_other->is_escape_literal_; this->like_id_ = tmp_other->like_id_; } } return ret; } // Oracle mode, the following character of escape character only support _ and % and self, or report error // check valid must be process first, even parttern or text is null, it will report error // eg: select 1 from dual where null like 'a' escape ''; // like: select 1 from t1 where '_%a' like 'a_a%aa' escape 'a'; --ok // select 1 from t1 where '_%' like 'aba%' escape 'a'; --error, ab is invalid // ORA-01424: missing or illegal character following the escape character template int ObExprLike::check_pattern_valid(const T &pattern, const T &escape, const ObCollationType escape_coll, ObCollationType coll_type, ObExecContext *exec_ctx, const uint64_t like_id, const bool check_optimization) { int ret = OB_SUCCESS; int32_t escape_wc = 0; const ObCharsetInfo *cs = NULL; ObString escape_val = escape.get_string(); ObString pattern_val = pattern.get_string(); ObExprLikeContext *like_ctx = NULL; if (is_static_engine) { // check_optimizaiton is true, only if pattern and escape are const. if (check_optimization && NULL == (like_ctx = static_cast (exec_ctx->get_expr_op_ctx(like_id)))) { if (OB_FAIL(exec_ctx->create_expr_op_ctx(like_id, like_ctx))) { LOG_WARN("failed to create operator ctx", K(ret), K(like_id)); } else { like_ctx->instr_info_.set_allocator(exec_ctx->get_allocator()); } } } else if (NULL != exec_ctx) { // When text, pattern and escape are all const, report error when create op ctx // If it's error, then don't optimize check int tmp_ret = OB_SUCCESS; if (NULL == (like_ctx = static_cast (exec_ctx->get_expr_op_ctx(like_id)))) { if (OB_SUCCESS != (tmp_ret = exec_ctx->create_expr_op_ctx(like_id, like_ctx))) { LOG_DEBUG("failed to create operator ctx", K(ret), K(like_id)); } else { like_ctx->instr_info_.set_allocator(exec_ctx->get_allocator()); } } } if (OB_FAIL(ret)) { LOG_WARN("failed to check pattern", K(ret), K(like_id)); } else if (!lib::is_oracle_mode()) { //don't check in mysql mode } else if (NULL != like_ctx && checked_already(*like_ctx, pattern.is_null(), pattern_val, escape.is_null(), escape_val)) { // skip check if pattern and escape are same as checked last time. //select * from t1 where exist (select * from t2 where 'abc' like t1.c1 escape t1.c2); //pattern t1.c1 and escape t1.c2 are const in subquery, but they may change. like_ctx->same_as_last = true; } else if (escape.is_null() || 1 != escape_val.length()) { ret = OB_ERR_INVALID_ESCAPE_CHAR_LENGTH; LOG_WARN("escape character must be character string of length 1", K(escape_val), K(ret)); } else if (OB_FAIL(calc_escape_wc(escape_coll, escape_val, escape_wc))) { LOG_WARN("fail to calc escape wc", K(escape_val), K(escape_coll)); } else if (OB_UNLIKELY(OB_ISNULL(cs = ObCharset::get_charset(coll_type)) || OB_ISNULL(cs->cset))) { ret = OB_ERR_UNEXPECTED; LOG_ERROR("unexpected error. invalid argument(s)",K(coll_type)); } else if (!pattern.is_null()) { const char *buf_start = pattern_val.ptr(); const char *buf_end = pattern_val.ptr() + pattern_val.length(); int error = 0; int32_t char_len = 0; bool is_valid = true; bool is_char_escape = false; bool pre_char_is_escape = false; while (OB_SUCC(ret) && buf_start < buf_end && is_valid) { char_len = static_cast(cs->cset->well_formed_len(cs, buf_start, buf_end, 1, &error)); if (OB_UNLIKELY(0 != error)) { ret = OB_ERR_INVALID_CHARACTER_STRING; LOG_WARN("well_formed_len failed. invalid char", K(buf_start), K(pattern_val), K(char_len)); } else if (OB_FAIL(is_escape(coll_type, buf_start, char_len, escape_wc, is_char_escape))) { LOG_WARN("fail to judge escape", K(escape_val), K(escape_coll)); } else if (is_char_escape) { // 连续两个escape char, like: select 1 from t1 where 'a' like 'aa' escape 'a'; -- it's ok if (pre_char_is_escape) { pre_char_is_escape = false; is_char_escape = false; } else { pre_char_is_escape = true; is_char_escape = false; } } else if (pre_char_is_escape) { // If pre char is escape char, then the following char must be '_' or '%' // Eg: select 1 from t1 where 'a' like 'a_a%' escape 'a'; -- it's ok ObString percent_str = ObCharsetUtils::get_const_str(coll_type, '%'); ObString underline_str = ObCharsetUtils::get_const_str(coll_type, '_'); const ObString pattern_char = ObString(char_len, buf_start); if (0 == pattern_char.compare(percent_str) || 0 == pattern_char.compare(underline_str)) { // it's ok } else { ret = OB_ERR_INVALID_CHAR_FOLLOWING_ESCAPE_CHAR; LOG_WARN("missing or illegal character following the escape character", K(escape_val), K(pattern_val), K(pattern_char), K(ret)); } pre_char_is_escape = false; } buf_start += char_len; }//end while if (pre_char_is_escape) { // Last character is escape character // // Eg: select 1 from t1 where 'a' like 'a_a' escape 'a'; -- it's error ret = OB_ERR_INVALID_CHAR_FOLLOWING_ESCAPE_CHAR; LOG_WARN("missing or illegal character following the escape character", K(escape_val), K(pattern_val), K(ret)); } if (OB_SUCC(ret) && NULL != like_ctx) { record_last_check(*like_ctx, pattern_val, escape_val, &exec_ctx->get_allocator()); } } return ret; } int ObExprLike::calc_result_type3(ObExprResType &type, ObExprResType &type1, ObExprResType &type2, ObExprResType &type3, ObExprTypeCtx &type_ctx) const { UNUSED(type_ctx); int ret = OB_SUCCESS; if (!type1.is_null() && !type2.is_null() && !type3.is_null() && (!is_type_valid(type1.get_type()) || !is_type_valid(type2.get_type()) || !is_type_valid(type3.get_type()))) { ret = OB_INVALID_ARGUMENT; LOG_WARN("the param is not castable", K(type1), K(type2), K(type3), K(ret)); } else if (OB_NOT_NULL(type_ctx.get_session()) && lib::is_oracle_mode()) { ObSEArray str_params; ObExprResType tmp_result_type; OZ(str_params.push_back(&type1)); OZ(aggregate_string_type_and_charset_oracle(*type_ctx.get_session(), str_params, tmp_result_type)); OZ(str_params.push_back(&type2)); OZ(deduce_string_param_calc_type_and_charset(*type_ctx.get_session(), tmp_result_type, str_params)); type3.set_calc_type(ObVarcharType); type3.set_calc_collation_type(type_ctx.get_session()->get_nls_collation()); type.set_int(); type.set_calc_type(type1.get_calc_type()); type.set_calc_collation_type(type1.get_calc_collation_type()); type.set_scale(DEFAULT_SCALE_FOR_INTEGER); type.set_precision(DEFAULT_PRECISION_FOR_BOOL); } else { type.set_int(); ObObjMeta types[2] = {type1, type2}; type.set_calc_type(ObVarcharType); type.set_scale(DEFAULT_SCALE_FOR_INTEGER); type.set_precision(DEFAULT_PRECISION_FOR_BOOL); if (!ob_is_text_tc(type1.get_type())) { type1.set_calc_type(ObVarcharType); } if (!ob_is_text_tc(type2.get_type())) { type2.set_calc_type(ObVarcharType); } type3.set_calc_type(ObVarcharType); type3.set_calc_collation_type(type3.get_collation_type()); if (lib::is_oracle_mode()) { if (OB_ISNULL(type_ctx.get_session())) { ret = OB_ERR_UNEXPECTED; LOG_WARN("session is null", K(ret)); } else { type.set_calc_collation_type(type_ctx.get_session()->get_nls_collation()); } } else { ret = aggregate_charsets_for_comparison(type.get_calc_meta(), types, 2, type_ctx); } type1.set_calc_collation_type(type.get_calc_collation_type()); type2.set_calc_collation_type(type.get_calc_collation_type()); type1.set_calc_collation_level(type.get_calc_collation_level()); type2.set_calc_collation_level(type.get_calc_collation_level()); ObExprOperator::calc_result_flag2(type, type1, type2); // ESCAPE is ignored } return ret; } int ObExprLike::set_instr_info(ObIAllocator *exec_allocator, const ObCollationType cs_type, const ObString &pattern, const ObString &escape, const ObCollationType escape_coll, ObExprLikeContext &like_ctx) { //If you feel tough to understand this func, //please feel free to refer here for more details : // int ret = OB_SUCCESS; like_ctx.instr_info_.reuse(); const ObCharsetInfo *cs = NULL; char *pattern_buf = nullptr; ObIAllocator *exec_cal_buf = exec_allocator; InstrInfo &instr_info = like_ctx.instr_info_; if (cs_type != CS_TYPE_UTF8MB4_BIN) { //we optimize the case in which cs_type == CS_TYPE_UTF8MB4_BIN only //just let it go } else if (OB_UNLIKELY(OB_ISNULL(cs = ObCharset::get_charset(cs_type)) || OB_ISNULL(cs->cset))) { ret = OB_ERR_UNEXPECTED; LOG_ERROR("unexpected error. invalid argument(s)",K(cs_type), K(pattern), K(escape)); } else if (OB_UNLIKELY(pattern.empty())) { //do nothing.just let it go. } else if (OB_ISNULL(exec_cal_buf)) { ret = OB_INVALID_ARGUMENT; LOG_WARN("Failed to get exec cal buf", K(ret)); } else { int32_t escape_wc = 0; if (OB_FAIL(instr_info.record_pattern(pattern_buf, pattern))) { LOG_WARN("record pattern failed", K(ret)); } else if (OB_FAIL(calc_escape_wc(escape_coll, escape, escape_wc))) { LOG_WARN("calc escape wc failed", K(ret), K(escape_coll), K(escape)); } else { //iterate pattern now const char *buf_start = pattern_buf; const char *buf_end = pattern_buf + pattern.length(); int error = 0; int32_t char_len = 0; bool is_char_escape = false; bool use_instr_mode = true; const char *instr_start = NULL; uint32_t instr_len = 0; bool percent_sign_exist = false; while (OB_SUCC(ret) && buf_start < buf_end && use_instr_mode) { char_len = static_cast(cs->cset->well_formed_len(cs, buf_start, buf_end, 1, &error)); is_char_escape = false; if (OB_UNLIKELY(0 != error)) { ret = OB_ERR_INVALID_CHARACTER_STRING; LOG_WARN("well_formed_len failed. invalid char", K(cs_type), K(buf_start), K(pattern), K(char_len)); } else if (OB_FAIL(is_escape(cs_type, buf_start, char_len, escape_wc, is_char_escape))) { LOG_WARN("check is escape failed", K(ret), K(escape_coll)); } else if (is_char_escape || (1 == char_len && '_' == *buf_start)) { //when there are "_" or escape in pattern //the case can not be optimized. use_instr_mode = false; // since cs_type is CS_TYPE_UTF8MB4_BIN, length of '%' must be 1. } else if ((1 == char_len && '%' == *buf_start)) { //percent sign percent_sign_exist = true; if (OB_LIKELY(instr_len > 0)) { if (OB_FAIL(instr_info.add_instr_info(instr_start, instr_len))) { LOG_WARN("add instr info failed", K(ret)); } instr_info.instr_total_length_ += instr_len; instr_len = 0; } buf_start += char_len; } else { //non-percent char if (0 == instr_len) { instr_start = buf_start; } buf_start += char_len; instr_len += char_len; } }//end while if (OB_SUCC(ret) && use_instr_mode && percent_sign_exist) { bool end_with_percent_sign = true; if (instr_len > 0) { // record last instr end_with_percent_sign = false; instr_info.instr_total_length_ += instr_len; if (OB_FAIL(instr_info.add_instr_info(instr_start, instr_len))) { LOG_WARN("add instr info failed", K(ret)); } } if (OB_UNLIKELY(instr_info.empty())) { instr_info.instr_mode_ = ALL_PERCENT_SIGN; } else { bool start_with_percent_sign = instr_info.instr_starts_[0] != pattern_buf; instr_info.instr_mode_ = start_with_percent_sign ? (end_with_percent_sign ? START_END_WITH_PERCENT_SIGN : START_WITH_PERCENT_SIGN) : (end_with_percent_sign ? END_WITH_PERCENT_SIGN : MIDDLE_PERCENT_SIGN); } }//end deduce instrmode }//end else } #if OB_USE_MULTITARGET_CODE // optimize for special patterns if (OB_SUCC(ret)) { if (1 == instr_info.instr_cnt_ && common::is_arch_supported(ObTargetArch::AVX2)) { void *buf = nullptr; if (OB_ISNULL(buf = exec_allocator->alloc(sizeof(StringSearcher)))) { ret = OB_ALLOCATE_MEMORY_FAILED; LOG_WARN("failed to allocator memory", K(ret)); } else if (FALSE_IT(like_ctx.string_searcher_ = new (buf) StringSearcher())) { // do nothing } else if (OB_FAIL(reinterpret_cast(like_ctx.string_searcher_)->init( instr_info.instr_starts_[0], instr_info.instr_lengths_[0]))) { LOG_WARN("failed to init string_searcher_", K(ret)); } } } #endif LOG_DEBUG("end set instr info", K(cs_type), K(pattern), K(escape), K(escape_coll), K(instr_info)); return ret; } template int ObExprLike::calc_with_instr_mode(T &result, const ObCollationType cs_type, const ObString &text, const ObExprLikeContext &like_ctx) { int ret = OB_SUCCESS; const InstrInfo instr_info = like_ctx.instr_info_; void *string_searcher = like_ctx.string_searcher_; const int32_t text_len = text.length(); if (OB_UNLIKELY(cs_type != CS_TYPE_UTF8MB4_BIN)) { ret = OB_INVALID_ARGUMENT; LOG_ERROR("invalid argument(s)", K(ret), K(cs_type), K(text)); } else if (OB_UNLIKELY(instr_info.empty())) { result.set_int(1); } else if (OB_UNLIKELY(text_len < instr_info.instr_total_length_)) { result.set_int(0); } else { int64_t res = 0; switch(instr_info.instr_mode_) { case START_WITH_PERCENT_SIGN: { res = match_with_instr_mode(text, instr_info, string_searcher); break; } case START_END_WITH_PERCENT_SIGN: { res = match_with_instr_mode(text, instr_info, string_searcher); break; } case END_WITH_PERCENT_SIGN: { res = match_with_instr_mode(text, instr_info, string_searcher); break; } case MIDDLE_PERCENT_SIGN: { res = match_with_instr_mode(text, instr_info, string_searcher); break; } default: { ret = OB_ERR_UNEXPECTED; LOG_ERROR("unexpected instr mode", K(ret), K(instr_info.instr_mode_), K(text)); break; } } if (OB_FAIL(ret)) { LOG_WARN("match with instr mode failed", K(ret), K(instr_info.instr_mode_), K(text)); } else { result.set_int(res); } } return ret; } int ObExprLike::calc_escape_wc(const ObCollationType escape_coll, const ObString &escape, int32_t &escape_wc) { int ret = OB_SUCCESS; size_t length = ObCharset::strlen_char(escape_coll, escape.ptr(), escape.length()); if (0 == length) { escape_wc = 0; } else if (1 != length) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument to ESCAPE", K(escape), K(length), K(ret)); } else if (OB_FAIL(ObCharset::mb_wc(escape_coll, escape, escape_wc))) { LOG_WARN("failed to convert escape to wc", K(ret), K(escape), K(escape_coll), K(escape_wc)); ret = OB_INVALID_ARGUMENT; } return ret; } int ObExprLike::is_escape(const ObCollationType cs_type, const char *buf_start, int32_t char_len, int32_t escape_wc, bool &res) { int ret = OB_SUCCESS; res = false; //once is_escape is called //we have to construct and destruct the string. //while, note that is_escape will not be called too frequently //so, never mind it ObString string(char_len, buf_start); int32_t wc = 0; if (OB_FAIL(ObCharset::mb_wc(cs_type, string, wc))) { LOG_WARN("failed to get wc", K(ret), K(string), K(cs_type)); ret = OB_INVALID_ARGUMENT; } else { res = (wc == escape_wc); } return ret; } OB_DEF_SERIALIZE(ObExprLike) { int ret = OB_SUCCESS; BASE_SER((ObExprLike, ObFuncExprOperator)); OB_UNIS_ENCODE(is_pattern_literal_); OB_UNIS_ENCODE(is_text_literal_); OB_UNIS_ENCODE(is_escape_literal_); OB_UNIS_ENCODE(like_id_); return ret; } OB_DEF_DESERIALIZE(ObExprLike) { int ret = OB_SUCCESS; BASE_DESER((ObExprLike, ObFuncExprOperator)); is_pattern_literal_ = false; is_text_literal_ = true; is_escape_literal_ = false; like_id_ = -1; OB_UNIS_DECODE(is_pattern_literal_); OB_UNIS_DECODE(is_text_literal_); OB_UNIS_DECODE(is_escape_literal_); OB_UNIS_DECODE(like_id_); return ret; } OB_DEF_SERIALIZE_SIZE(ObExprLike) { int64_t len = 0; BASE_ADD_LEN((ObExprLike, ObFuncExprOperator)); OB_UNIS_ADD_LEN(is_pattern_literal_); OB_UNIS_ADD_LEN(is_text_literal_); OB_UNIS_ADD_LEN(is_escape_literal_); OB_UNIS_ADD_LEN(like_id_); return len; } int ObExprLike::cg_expr(ObExprCGCtx &op_cg_ctx, const ObRawExpr &raw_expr, ObExpr &rt_expr) const { UNUSED(op_cg_ctx); int ret = OB_SUCCESS; const ObRawExpr *text_expr = NULL; const ObRawExpr *pattern_expr = NULL; const ObRawExpr *escape_expr = NULL; if (OB_UNLIKELY(3 != raw_expr.get_param_count())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("like op should have 3 arguments", K(raw_expr.get_param_count())); } else if (OB_ISNULL(text_expr = raw_expr.get_param_expr(0)) || OB_ISNULL(pattern_expr = raw_expr.get_param_expr(1)) || OB_ISNULL(escape_expr = raw_expr.get_param_expr(2))) { ret = OB_ERR_UNEXPECTED; LOG_ERROR("null pointer", K(text_expr), K(pattern_expr), K(escape_expr)); } else if (rt_expr.arg_cnt_ != 3 || OB_ISNULL(rt_expr.args_)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("like expr should have 3 arguments", K(ret), K(rt_expr.arg_cnt_), K(rt_expr.args_)); } else if (OB_ISNULL(rt_expr.args_[0]) || OB_ISNULL(rt_expr.args_[1]) || OB_ISNULL(rt_expr.args_[2])) { ret = OB_ERR_UNEXPECTED; LOG_WARN("child is null", K(ret), K(rt_expr.args_[0]), K(rt_expr.args_[1]), K(rt_expr.args_[2])); } else if (OB_UNLIKELY(!((ob_is_string_tc(rt_expr.args_[0]->datum_meta_.type_) || ob_is_text_tc(rt_expr.args_[0]->datum_meta_.type_) || ObNullType == rt_expr.args_[0]->datum_meta_.type_)))) { ret = OB_ERR_UNEXPECTED; LOG_WARN("unexpected param type", K(ret), K(rt_expr.args_[0]->datum_meta_)); } else if (OB_UNLIKELY(!(ob_is_string_tc(rt_expr.args_[1]->datum_meta_.type_) || ob_is_text_tc(rt_expr.args_[1]->datum_meta_.type_) || ObNullType == rt_expr.args_[1]->datum_meta_.type_))) { ret = OB_ERR_UNEXPECTED; LOG_WARN("unexpected param type", K(ret), K(rt_expr.args_[1]->datum_meta_)); } else if (OB_UNLIKELY(!(ObVarcharType == rt_expr.args_[2]->datum_meta_.type_ || ObNullType == rt_expr.args_[2]->datum_meta_.type_))) { ret = OB_ERR_UNEXPECTED; LOG_WARN("unexpected param type", K(ret), K(rt_expr.args_[2]->datum_meta_)); } else { //Do optimization even if pattern_expr/escape is pushdown parameter, pattern and escape are //checked whether the same as last time which is recorded in like_ctx for each row in execution. bool pattern_literal = pattern_expr->is_const_expr(); bool escape_literal = escape_expr->is_const_expr(); //do check and match optimization only if extra_ is 1. if (pattern_literal && escape_literal) { rt_expr.extra_ = 1; } else { rt_expr.extra_ = 0; } rt_expr.eval_func_ = ObExprLike::like_varchar; // Since pattern and escape are both literal in TPCH, only support vectorized eval with literal // pattern and escape now. // In the full vectorized implement of like expr, like_ctx will be useless. if (text_expr->is_vectorize_result() && !rt_expr.args_[1]->is_batch_result() && !rt_expr.args_[2]->is_batch_result()) { rt_expr.eval_batch_func_ = ObExprLike::eval_like_expr_batch_only_text_vectorized; rt_expr.eval_vector_func_ = ObExprLike::eval_like_expr_vector_only_text_vectorized; } } return ret; } template void ObExprLike::record_last_check(ObExprLikeContext &like_ctx, const ObString pattern_val, const ObString escape_val, ObIAllocator *buf_alloc) { if (is_static_engine) { const uint32_t init_len = 16; like_ctx.same_as_last = false; like_ctx.last_pattern_len_ = pattern_val.length(); if (pattern_val.length() > like_ctx.pattern_buf_len_) { if(0 == like_ctx.pattern_buf_len_) { like_ctx.pattern_buf_len_ = init_len; } while (pattern_val.length() > like_ctx.pattern_buf_len_) { like_ctx.pattern_buf_len_ *= 2; } like_ctx.last_pattern_ = (char*) (buf_alloc->alloc(sizeof(char) * like_ctx.pattern_buf_len_)); } MEMCPY(like_ctx.last_pattern_, pattern_val.ptr(), pattern_val.length()); like_ctx.last_escape_len_ = escape_val.length(); if (escape_val.length() > like_ctx.escape_buf_len_) { if(0 == like_ctx.escape_buf_len_) { like_ctx.escape_buf_len_ = init_len; } while (escape_val.length() > like_ctx.escape_buf_len_) { like_ctx.escape_buf_len_ *= 2; } like_ctx.last_escape_ = (char*) (buf_alloc->alloc(sizeof(char) * like_ctx.escape_buf_len_)); } MEMCPY(like_ctx.last_escape_, escape_val.ptr(), escape_val.length()); } else { like_ctx.set_checked(); } } template bool ObExprLike::checked_already(const ObExprLikeContext &like_ctx, bool null_pattern, const ObString pattern_val, bool null_escape, const ObString escape_val) { bool res = false; if (is_static_engine) { res = !null_pattern && !null_escape && escape_val.length() == like_ctx.last_escape_len_ && pattern_val.length() == like_ctx.last_pattern_len_ && 0 == MEMCMP(escape_val.ptr(), like_ctx.last_escape_, escape_val.length()) && 0 == MEMCMP(pattern_val.ptr(), like_ctx.last_pattern_, pattern_val.length()); } else { res = like_ctx.is_checked(); } LOG_DEBUG("like check already end", K(null_pattern), K(pattern_val), K(escape_val), K(res)); return res; } int ObExprLike::like_varchar_inner(const ObExpr &expr, ObEvalCtx &ctx, ObDatum &expr_datum, ObDatum &text, ObDatum &pattern, ObDatum &escape) { int ret = OB_SUCCESS; const bool do_optimization = expr.extra_; uint64_t like_id = static_cast(expr.expr_ctx_id_); const ObCollationType escape_coll = expr.args_[2]->datum_meta_.cs_type_; const ObCollationType coll_type = expr.args_[1]->datum_meta_.cs_type_; if (OB_FAIL(check_pattern_valid(pattern, escape, escape_coll, coll_type, &ctx.exec_ctx_, like_id, do_optimization))) { LOG_WARN("fail to check pattern string", K(pattern), K(escape), K(coll_type)); } else if (text.is_null() || pattern.is_null()) { expr_datum.set_null(); } else { ObString text_val = text.get_string(); ObString pattern_val = pattern.get_string(); ObString escape_val; if (escape.is_null() || escape.get_string().empty()) { bool is_no_backslash_escapes = false; IS_NO_BACKSLASH_ESCAPES(ctx.exec_ctx_.get_my_session()->get_sql_mode(), is_no_backslash_escapes); if (!is_no_backslash_escapes) { escape_val.assign_ptr("\\", 1); } } else { escape_val = escape.get_string(); } if (do_optimization && like_id != OB_INVALID_ID && (!text_val.empty()) && (!pattern_val.empty())) { ObExprLikeContext *like_ctx = NULL; if (NULL == (like_ctx = static_cast (ctx.exec_ctx_.get_expr_op_ctx(like_id)))) { ret = OB_ERR_UNEXPECTED; //like context should be created while checking validation. LOG_WARN("like context is null", K(ret), K(like_id)); } if (OB_SUCC(ret)) { if (OB_UNLIKELY((!is_oracle_mode() && !checked_already(*like_ctx, false, pattern_val, false, escape_val)) || (is_oracle_mode() && !like_ctx->same_as_last))) { if (OB_FAIL(set_instr_info(&ctx.exec_ctx_.get_allocator(), coll_type, pattern_val, escape_val, escape_coll, *like_ctx))) { LOG_WARN("failed to set instr info", K(ret), K(pattern_val), K(text_val)); } else if (like_ctx->is_instr_mode()) {//instr mode ret = calc_with_instr_mode(expr_datum, coll_type, text_val, *like_ctx); } else {//not instr mode ret = calc_with_non_instr_mode(expr_datum, coll_type, escape_coll, text_val, pattern_val, escape_val); } if (OB_SUCC(ret) && !is_oracle_mode()) { record_last_check(*like_ctx, pattern_val, escape_val, &ctx.exec_ctx_.get_allocator()); } } else if (like_ctx->is_instr_mode()) {//instr mode ret = calc_with_instr_mode(expr_datum, coll_type, text_val, *like_ctx); } else { //not instr mode ret = calc_with_non_instr_mode(expr_datum, coll_type, escape_coll, text_val, pattern_val, escape_val); } } } else { //normal path. no optimization here. ret = calc_with_non_instr_mode(expr_datum, coll_type, escape_coll, text_val, pattern_val, escape_val); } } return ret; } int ObExprLike::like_varchar(const ObExpr &expr, ObEvalCtx &ctx, ObDatum &expr_datum) { int ret = OB_SUCCESS; if (OB_FAIL(expr.eval_param_value(ctx))) { LOG_WARN("eval param value failed", K(ret)); } ObDatum &text = expr.locate_param_datum(ctx, 0); ObDatum &pattern = expr.locate_param_datum(ctx, 1); ObDatum &escape = expr.locate_param_datum(ctx, 2); // the third arg escape must be varchar if (OB_FAIL(ret)) { } else if (!ob_is_text_tc(expr.args_[0]->datum_meta_.type_) && !ob_is_text_tc(expr.args_[1]->datum_meta_.type_)) { ret = like_varchar_inner(expr, ctx, expr_datum, text, pattern, escape); } else { // text tc ObString text_val = text.get_string(); ObString pattern_val = pattern.get_string(); ObString escape_str = escape.get_string(); // Notice: should not change original datums // ToDo: @gehao Streaming like interfaces ObDatum text_inrow = text; // copy datum flags; ObDatum pattern_inrow = pattern; ObEvalCtx::TempAllocGuard tmp_alloc_g(ctx); common::ObArenaAllocator &temp_allocator = tmp_alloc_g.get_allocator(); if (OB_FAIL(ObTextStringHelper::read_real_string_data(temp_allocator, text, expr.args_[0]->datum_meta_, expr.args_[0]->obj_meta_.has_lob_header(), text_val))) { LOG_WARN("failed to read text", K(ret), K(text_val)); } else if (OB_FAIL(ObTextStringHelper::read_real_string_data(temp_allocator, pattern, expr.args_[1]->datum_meta_, expr.args_[1]->obj_meta_.has_lob_header(), pattern_val))) { LOG_WARN("failed to read pattern", K(ret), K(pattern_val)); } else { if (!text_inrow.is_null() && !text_inrow.is_nop()) { text_inrow.set_string(text_val); } if (!pattern_inrow.is_null() && !pattern_inrow.is_nop()) { pattern_inrow.set_string(pattern_val); } ret = like_varchar_inner(expr, ctx, expr_datum, text_inrow, pattern_inrow, escape); } } if (OB_FAIL(ret)) { LOG_WARN("failed to eval like varchar", K(ret)); } return ret; } template int64_t ObExprLike::match_with_instr_mode(const ObString &text, const InstrInfo instr_info, void *string_searcher) { int64_t res = 0; #if OB_USE_MULTITARGET_CODE // while `instr_info.instr_cnt_` is 1, try to optimize it with SIMD. if (1 == instr_info.instr_cnt_ && common::is_arch_supported(ObTargetArch::AVX2)) { res = match_with_instr_mode_by_simd(text, string_searcher); } else { res = match_with_instr_mode(text, instr_info); } #else res = match_with_instr_mode(text, instr_info); #endif return res; } template OB_INLINE int64_t ObExprLike::match_with_instr_mode(const ObString &text, const InstrInfo &instr_info) { int64_t res = 0; const char *text_ptr = text.ptr(); uint32_t text_len = text.length(); const char **instr_pos = instr_info.instr_starts_; const uint32_t *instr_len = instr_info.instr_lengths_; bool match = true; int64_t idx = 0; int64_t idx_end = percent_sign_end ? instr_info.instr_cnt_ : instr_info.instr_cnt_ - 1; // if not start with %, memcmp for first instr. if (!percent_sign_start) { if (text_len < instr_len[0]) { match = false; } else { int cmp = MEMCMP(text_ptr, instr_pos[0], instr_len[0]); match = 0 == cmp; text_ptr += instr_len[0]; text_len -= instr_len[0]; idx++; } } // memmem for str surrounded by % for (; idx < idx_end && match; idx++) { char *new_text = static_cast(MEMMEM(text_ptr, text_len, instr_pos[idx], instr_len[idx])); text_len -= new_text != NULL ? new_text - text_ptr + instr_len[idx] : 0; if (OB_UNLIKELY(text_len < 0)) { match = false; LOG_ERROR_RET(OB_ERR_UNEXPECTED, "unexpected result of memmem", K(text), K(ObString(instr_len[idx], instr_pos[idx]))); } else { match = new_text != NULL; text_ptr = new_text + instr_len[idx]; } } // if not end with %, memcmp for last instr if (match && !percent_sign_end) { if (text_len < instr_len[idx]) { match = false; } else { match = 0 == MEMCMP(text.ptr() + text.length() - instr_len[idx], instr_pos[idx], instr_len[idx]); } } res = match ? 1 : 0; return res; } // while `instr_info.instr_cnt_` is 1, optimize to calc substring, start_with, end_with or equal. template OB_INLINE int64_t ObExprLike::match_with_instr_mode_by_simd(const ObString &text, void *string_searcher) { bool res = false; #if OB_USE_MULTITARGET_CODE int ret = OB_SUCCESS; const char *text_ptr = text.ptr(); uint32_t text_len = text.length(); StringSearcher *string_searcher_ptr = reinterpret_cast(string_searcher); if (percent_sign_start && percent_sign_end) { ret = string_searcher_ptr->is_substring(text_ptr, text_ptr + text_len, res); } else if (!percent_sign_start && percent_sign_end) { ret = string_searcher_ptr->start_with(text_ptr, text_ptr + text_len, res); } else if (percent_sign_start && !percent_sign_end) { ret = string_searcher_ptr->end_with(text_ptr, text_ptr + text_len, res); } else { ret = string_searcher_ptr->equal(text_ptr, text_ptr + text_len, res); } if (OB_FAIL(ret)) { LOG_ERROR_RET(OB_ERR_UNEXPECTED, "failed to call string_searcher's func.", K(ret), K(percent_sign_start), K(percent_sign_end)); } #endif return res; } struct ObNonInstrModeMatcher { inline int64_t operator() (const ObCollationType coll_type, const ObString &text_val, const ObString &pattern_val, int32_t escape_wc, int &ret) { int64_t res = 0; if (OB_UNLIKELY(text_val.length() <= 0 && pattern_val.length() <= 0)) { // empty string res = 1; } else { bool b = ObCharset::wildcmp(coll_type, text_val, pattern_val, escape_wc, static_cast('_'), static_cast('%')); res = static_cast(b); } return res; } }; template int ObExprLike::match_text_batch(BATCH_EVAL_FUNC_ARG_DECL, const ObCollationType coll_type, const int32_t escape_wc, const ObString &pattern_val, const InstrInfo instr_info, void *string_searcher) { int ret = OB_SUCCESS; ObBitVector &eval_flags = expr.get_evaluated_flags(ctx); ObDatum *res_datums = expr.locate_batch_datums(ctx); ObDatum *text_datums = expr.args_[0]->locate_batch_datums(ctx); const int64_t step_size = sizeof(uint16_t) * CHAR_BIT; const ObObjType text_type = expr.args_[0]->datum_meta_.type_; // calc match result for each text for (int64_t i = 0; i < size && OB_SUCC(ret);) { const int64_t bit_vec_off = i / (CHAR_BIT * sizeof(uint16_t)); const uint16_t skip_v = skip.reinterpret_data()[bit_vec_off]; uint16_t &eval_v = eval_flags.reinterpret_data()[bit_vec_off]; if (i + step_size < size && (0 == (skip_v | eval_v))) { for (int64_t j = 0; OB_SUCC(ret) && j < step_size; i++, j++) { if (NullCheck && text_datums[i].is_null()) { res_datums[i].set_null(); } else if (!ob_is_text_tc(text_type)) { if (UseInstrMode) { int64_t res = ALL_PERCENT_SIGN == InstrMode ? 1 : match_with_instr_mode (text_datums[i].get_string(), instr_info, string_searcher); res_datums[i].set_int(res); } else { res_datums[i].set_int(ObNonInstrModeMatcher()(coll_type, text_datums[i].get_string(), pattern_val, escape_wc, ret)); } } else { // text tc ObEvalCtx::TempAllocGuard tmp_alloc_g(ctx); common::ObArenaAllocator &temp_allocator = tmp_alloc_g.get_allocator(); ObString text_val = text_datums[i].get_string(); if (OB_FAIL(ObTextStringHelper::read_real_string_data(temp_allocator, text_datums[i], expr.args_[0]->datum_meta_, expr.args_[0]->obj_meta_.has_lob_header(), text_val))) { LOG_WARN("failed to read text", K(ret), K(text_val)); } else if (UseInstrMode) { int64_t res = ALL_PERCENT_SIGN == InstrMode ? 1 : match_with_instr_mode (text_val, instr_info, string_searcher); res_datums[i].set_int(res); } else { res_datums[i].set_int(ObNonInstrModeMatcher()(coll_type, text_val, pattern_val, escape_wc, ret)); } } } if (OB_SUCC(ret)) { eval_v = 0xFFFF; } } else if (i + step_size < size && (0xFFFF == (skip_v | eval_v))) { i += step_size; } else { const int64_t new_size = std::min(size, i + step_size); for (; i < new_size && OB_SUCC(ret); i++) { if (!(skip.at(i) || eval_flags.at(i))) { if (NullCheck && text_datums[i].is_null()) { res_datums[i].set_null(); } else if (!ob_is_text_tc(text_type)) { if (UseInstrMode) { int64_t res = ALL_PERCENT_SIGN == InstrMode ? 1 : match_with_instr_mode (text_datums[i].get_string(), instr_info, string_searcher); res_datums[i].set_int(res); } else { res_datums[i].set_int(ObNonInstrModeMatcher()(coll_type, text_datums[i].get_string(), pattern_val, escape_wc, ret)); } } else { // text tc ObEvalCtx::TempAllocGuard tmp_alloc_g(ctx); common::ObArenaAllocator &temp_allocator = tmp_alloc_g.get_allocator(); ObString text_val = text_datums[i].get_string(); if (OB_FAIL(ObTextStringHelper::read_real_string_data(temp_allocator, text_datums[i], expr.args_[0]->datum_meta_, expr.args_[0]->obj_meta_.has_lob_header(), text_val))) { LOG_WARN("failed to read text", K(ret), K(text_val)); } else { if (UseInstrMode) { int64_t res = ALL_PERCENT_SIGN == InstrMode ? 1 : match_with_instr_mode (text_val, instr_info, string_searcher); res_datums[i].set_int(res); } else { res_datums[i].set_int(ObNonInstrModeMatcher()(coll_type, text_val, pattern_val, escape_wc, ret)); } } } eval_flags.set(i); } } } } return ret; } template int ObExprLike::match_text_vector(VECTOR_EVAL_FUNC_ARG_DECL, const ObCollationType coll_type, const int32_t escape_wc, const ObString &pattern_val, const InstrInfo instr_info, void *string_searcher) { int ret = OB_SUCCESS; ObBitVector &eval_flags = expr.get_evaluated_flags(ctx); const TextVec *text_vec = static_cast(expr.args_[0]->get_vector(ctx)); ResVec *res_vec = static_cast(expr.get_vector(ctx)); const ObObjType text_type = expr.args_[0]->datum_meta_.type_; // calc match result for each text for (int64_t i = bound.start(); i < bound.end() && OB_SUCC(ret); i++) { if (!(skip.at(i) || eval_flags.at(i))) { if (NullCheck && text_vec->is_null(i)) { res_vec->set_null(i); } else if (!ob_is_text_tc(text_type)) { if (UseInstrMode) { int64_t res = ALL_PERCENT_SIGN == InstrMode ? 1 : match_with_instr_mode (text_vec->get_string(i), instr_info, string_searcher); res_vec->set_int(i, res); } else { res_vec->set_int(i, ObNonInstrModeMatcher()(coll_type, text_vec->get_string(i), pattern_val, escape_wc, ret)); } } else { // text tc ObEvalCtx::TempAllocGuard tmp_alloc_g(ctx); common::ObArenaAllocator &temp_allocator = tmp_alloc_g.get_allocator(); ObString text_val = text_vec->get_string(i); if (OB_FAIL(ObTextStringHelper::read_real_string_data(temp_allocator, text_vec, expr.args_[0]->datum_meta_, expr.args_[0]->obj_meta_.has_lob_header(), text_val, i))) { LOG_WARN("failed to read text", K(ret), K(text_val)); } else if (UseInstrMode) { int64_t res = ALL_PERCENT_SIGN == InstrMode ? 1 : match_with_instr_mode (text_val, instr_info, string_searcher); res_vec->set_int(i, res); } else { res_vec->set_int(i, ObNonInstrModeMatcher()(coll_type, text_val, pattern_val, escape_wc, ret)); } } eval_flags.set(i); } } return ret; } int ObExprLike::like_text_vectorized_inner(const ObExpr &expr, ObEvalCtx &ctx, const ObBitVector &skip, const int64_t size, ObExpr &text, ObDatum *pattern_datum, ObDatum *escape_datum) { int ret = OB_SUCCESS; const bool do_optimization = true; uint64_t like_id = static_cast(expr.expr_ctx_id_); const ObCollationType coll_type = expr.args_[0]->datum_meta_.cs_type_; const ObCollationType escape_coll = expr.args_[2]->datum_meta_.cs_type_; if (OB_FAIL(check_pattern_valid(*pattern_datum, *escape_datum, escape_coll, coll_type, &ctx.exec_ctx_, like_id, do_optimization))) { LOG_WARN("check pattern valid failed", K(ret)); } else if (OB_FAIL(text.eval_batch(ctx, skip, size))) { LOG_WARN("eval text batch failed", K(ret)); } else if (OB_UNLIKELY(pattern_datum->is_null())) { ObDatum *res_datums = expr.locate_batch_datums(ctx); ObBitVector &eval_flags = expr.get_evaluated_flags(ctx); for (int64_t i = 0; i < size; i++) { if (!skip.contain(i)) { res_datums[i].set_null(); eval_flags.set(i); } } expr.get_eval_info(ctx).notnull_ = false; } else { ObString pattern_val = pattern_datum->get_string(); ObString escape_val; // check pattern is not null already, so result is null if and only if text is null. bool null_check = !expr.args_[0]->get_eval_info(ctx).notnull_; if (escape_datum->is_null() || escape_datum->get_string().empty()) { bool is_no_backslash_escapes = false; IS_NO_BACKSLASH_ESCAPES(ctx.exec_ctx_.get_my_session()->get_sql_mode(), is_no_backslash_escapes); if (!is_no_backslash_escapes) { escape_val.assign_ptr("\\", 1); } } else { escape_val = escape_datum->get_string(); } ObExprLikeContext *like_ctx = NULL; if (OB_ISNULL(like_ctx = static_cast (ctx.exec_ctx_.get_expr_op_ctx(like_id)))) { ret = OB_ERR_UNEXPECTED; //like context should be created while checking validation. LOG_WARN("like context is null", K(ret), K(like_id)); } else if (OB_UNLIKELY((!is_oracle_mode() && !checked_already(*like_ctx, false, pattern_val, false, escape_val)) || (is_oracle_mode() && !like_ctx->same_as_last))) { if (OB_FAIL(set_instr_info(&ctx.exec_ctx_.get_allocator(), coll_type, pattern_val, escape_val, escape_coll, *like_ctx))) { LOG_WARN("failed to set instr info", K(ret), K(pattern_val)); } else if (!is_oracle_mode()) { record_last_check(*like_ctx, pattern_val, escape_val, &ctx.exec_ctx_.get_allocator()); } } INSTR_MODE instr_mode = like_ctx->get_instr_mode(); const InstrInfo instr_info = like_ctx->instr_info_; void *string_searcher = like_ctx->string_searcher_; int32_t escape_wc = 0; LOG_DEBUG("set instr info inner end", K(coll_type), K(pattern_val), K(instr_mode), K(like_ctx->same_as_last)); if (OB_FAIL(ret)) { } else if (INVALID_INSTR_MODE == instr_mode && OB_FAIL(calc_escape_wc(escape_coll, escape_val, escape_wc))) { LOG_WARN("calc escape wc failed", K(ret)); LOG_USER_ERROR(OB_INVALID_ARGUMENT, "ESCAPE"); } else { #define MATCH_TEXT_BATCH_ARG_LIST expr, ctx, skip, size, coll_type, escape_wc, pattern_val, \ instr_info, string_searcher // it seems to take a lot of work to make eval_info.notnull_ correct and it may be removed. // so null_check variable is not used now, match_text_batch is called always with null check. #define CALL_MATCH_TEXT_BATCH(use_instr_mode, instr_mode) \ ret = match_text_batch(MATCH_TEXT_BATCH_ARG_LIST); switch (instr_mode) { case INVALID_INSTR_MODE: { CALL_MATCH_TEXT_BATCH(false, INVALID_INSTR_MODE) break; } case START_WITH_PERCENT_SIGN: { CALL_MATCH_TEXT_BATCH(true, START_WITH_PERCENT_SIGN) break; } case START_END_WITH_PERCENT_SIGN: { CALL_MATCH_TEXT_BATCH(true, START_END_WITH_PERCENT_SIGN) break; } case MIDDLE_PERCENT_SIGN: { CALL_MATCH_TEXT_BATCH(true, MIDDLE_PERCENT_SIGN); break; } case END_WITH_PERCENT_SIGN: { CALL_MATCH_TEXT_BATCH(true, END_WITH_PERCENT_SIGN) break; } case ALL_PERCENT_SIGN: { CALL_MATCH_TEXT_BATCH(true, ALL_PERCENT_SIGN) break; } default : { ret = OB_ERR_UNEXPECTED; LOG_ERROR("unexpected instr mode", K(ret), K(instr_mode), K(pattern_val)); break; } } if (OB_FAIL(ret)) { LOG_WARN("match text batch failed", K(ret), K(instr_mode), K(null_check)); } else { expr.get_eval_info(ctx).notnull_ = !null_check; } #undef MATCH_TEXT_BATCH_ARG_LIST #undef CALL_MATCH_TEXT_BATCH } } return ret; } template int ObExprLike::like_text_vectorized_inner_vec2(const ObExpr &expr, ObEvalCtx &ctx, const ObBitVector &skip, const EvalBound &bound, ObExpr &text, ObDatum *pattern_inrow) { int ret = OB_SUCCESS; const bool do_optimization = true; uint64_t like_id = static_cast(expr.expr_ctx_id_); const ObCollationType coll_type = expr.args_[0]->datum_meta_.cs_type_; const ObCollationType escape_coll = expr.args_[2]->datum_meta_.cs_type_; ConstUniformFormat *pattern_vector = static_cast(expr.args_[1]->get_vector(ctx)); ConstUniformFormat *escape_vector = static_cast(expr.args_[2]->get_vector(ctx)); if (OB_FAIL(check_pattern_valid(*pattern_inrow, escape_vector->get_datum(0), escape_coll, coll_type, &ctx.exec_ctx_, like_id, do_optimization))) { LOG_WARN("check pattern valid failed", K(ret)); } else if (OB_UNLIKELY(pattern_inrow->is_null())) { ResVec *res_vec = static_cast(expr.get_vector(ctx)); ObBitVector &eval_flags = expr.get_evaluated_flags(ctx); for (int64_t i = bound.start(); i < bound.end(); i++) { if (!skip.at(i)) { res_vec->set_null(i); eval_flags.set(i); } } expr.get_eval_info(ctx).notnull_ = false; } else { ObString pattern_val = pattern_inrow->get_string(); ObString escape_val; // check pattern is not null already, so result is null if and only if text is null. bool null_check = !expr.args_[0]->get_eval_info(ctx).notnull_; if (escape_vector->is_null(0) || escape_vector->get_string(0).empty()) { bool is_no_backslash_escapes = false; IS_NO_BACKSLASH_ESCAPES(ctx.exec_ctx_.get_my_session()->get_sql_mode(), is_no_backslash_escapes); if (!is_no_backslash_escapes) { escape_val.assign_ptr("\\", 1); } } else { escape_val = escape_vector->get_string(0); } ObExprLikeContext *like_ctx = NULL; if (OB_ISNULL(like_ctx = static_cast (ctx.exec_ctx_.get_expr_op_ctx(like_id)))) { ret = OB_ERR_UNEXPECTED; //like context should be created while checking validation. LOG_WARN("like context is null", K(ret), K(like_id)); } else if (OB_UNLIKELY((!is_oracle_mode() && !checked_already(*like_ctx, false, pattern_val, false, escape_val)) || (is_oracle_mode() && !like_ctx->same_as_last))) { if (OB_FAIL(set_instr_info(&ctx.exec_ctx_.get_allocator(), coll_type, pattern_val, escape_val, escape_coll, *like_ctx))) { LOG_WARN("failed to set instr info", K(ret), K(pattern_val)); } else if (!is_oracle_mode()) { record_last_check(*like_ctx, pattern_val, escape_val, &ctx.exec_ctx_.get_allocator()); } } INSTR_MODE instr_mode = like_ctx->get_instr_mode(); const InstrInfo instr_info = like_ctx->instr_info_; void *string_searcher = like_ctx->string_searcher_; int32_t escape_wc = 0; LOG_DEBUG("set instr info inner end", K(coll_type), K(pattern_val), K(instr_mode), K(like_ctx->same_as_last)); if (OB_FAIL(ret)) { } else if (INVALID_INSTR_MODE == instr_mode && OB_FAIL(calc_escape_wc(escape_coll, escape_val, escape_wc))) { LOG_WARN("calc escape wc failed", K(ret)); LOG_USER_ERROR(OB_INVALID_ARGUMENT, "ESCAPE"); } else { #define MATCH_TEXT_VECTOR_ARG_LIST expr, ctx, skip, bound, coll_type, escape_wc, pattern_val, \ instr_info, string_searcher // it seems to take a lot of work to make eval_info.notnull_ correct and it may be removed. // so null_check variable is not used now, match_text_batch is called always with null check. #define CALL_MATCH_TEXT_VECTOR(use_instr_mode, instr_mode) \ ret = match_text_vector(MATCH_TEXT_VECTOR_ARG_LIST); switch (instr_mode) { case INVALID_INSTR_MODE: { CALL_MATCH_TEXT_VECTOR(false, INVALID_INSTR_MODE) break; } case START_WITH_PERCENT_SIGN: { CALL_MATCH_TEXT_VECTOR(true, START_WITH_PERCENT_SIGN) break; } case START_END_WITH_PERCENT_SIGN: { CALL_MATCH_TEXT_VECTOR(true, START_END_WITH_PERCENT_SIGN) break; } case MIDDLE_PERCENT_SIGN: { CALL_MATCH_TEXT_VECTOR(true, MIDDLE_PERCENT_SIGN); break; } case END_WITH_PERCENT_SIGN: { CALL_MATCH_TEXT_VECTOR(true, END_WITH_PERCENT_SIGN) break; } case ALL_PERCENT_SIGN: { CALL_MATCH_TEXT_VECTOR(true, ALL_PERCENT_SIGN) break; } default : { ret = OB_ERR_UNEXPECTED; LOG_ERROR("unexpected instr mode", K(ret), K(instr_mode), K(pattern_val)); break; } } if (OB_FAIL(ret)) { LOG_WARN("match text batch failed", K(ret), K(instr_mode), K(null_check)); } else { expr.get_eval_info(ctx).notnull_ = !null_check; } #undef MATCH_TEXT_VECTOR_ARG_LIST #undef CALL_MATCH_TEXT_VECTOR } } return ret; } // only text is vectorized, check pattern validation and mode first, then try to match each text. int ObExprLike::eval_like_expr_batch_only_text_vectorized(BATCH_EVAL_FUNC_ARG_DECL) { int ret = OB_SUCCESS; ObExpr &text = *expr.args_[0]; ObExpr &pattern = *expr.args_[1]; ObExpr &escape = *expr.args_[2]; ObDatum *pattern_datum = NULL; ObDatum *escape_datum = NULL; if (OB_FAIL(pattern.eval(ctx, pattern_datum))) { LOG_WARN("eval pattern failed", K(ret)); } else if (OB_FAIL(escape.eval(ctx, escape_datum))) { LOG_WARN("eval escape failed", K(ret)); // the third arg escape must be varchar } else if ((!ob_is_text_tc(text.datum_meta_.type_) && !ob_is_text_tc(pattern.datum_meta_.type_))) { ret = like_text_vectorized_inner(expr, ctx, skip, size, text, pattern_datum, escape_datum); } else { ObDatum pattern_inrow; ObEvalCtx::TempAllocGuard tmp_alloc_g(ctx); common::ObArenaAllocator &temp_allocator = tmp_alloc_g.get_allocator(); ObString pattern_val = pattern_datum->get_string(); if (OB_FAIL(ObTextStringHelper::read_real_string_data(temp_allocator, *pattern_datum, expr.args_[1]->datum_meta_, expr.args_[1]->obj_meta_.has_lob_header(), pattern_val))) { LOG_WARN("failed to read pattern", K(ret), K(pattern_val)); } else { pattern_inrow = *pattern_datum; if (!pattern_inrow.is_null() && !pattern_inrow.is_nop()) { pattern_inrow.set_string(pattern_val); } ret = like_text_vectorized_inner(expr, ctx, skip, size, text, &pattern_inrow, escape_datum); } } if (OB_FAIL(ret)) { LOG_WARN("failed to eval_like_expr_batch_only_text_vectorized", K(ret)); } return ret; } template int ObExprLike::vector_like(VECTOR_EVAL_FUNC_ARG_DECL) { int ret = OB_SUCCESS; ObExpr &text = *expr.args_[0]; ObExpr &pattern = *expr.args_[1]; ObExpr &escape = *expr.args_[2]; const ConstUniformFormat *pattern_vector = static_cast(expr.args_[1]->get_vector(ctx)); ObDatum pattern_inrow = pattern_vector->get_datum(0); if ((!ob_is_text_tc(text.datum_meta_.type_) && !ob_is_text_tc(pattern.datum_meta_.type_))) { ret = like_text_vectorized_inner_vec2(expr, ctx, skip, bound, text, &pattern_inrow); } else { ObEvalCtx::TempAllocGuard tmp_alloc_g(ctx); ObString pattern_val = pattern_vector->get_string(0); common::ObArenaAllocator &temp_allocator = tmp_alloc_g.get_allocator(); if (OB_FAIL(ObTextStringHelper::read_real_string_data( temp_allocator, pattern_vector->get_datum(0), expr.args_[1]->datum_meta_, expr.args_[1]->obj_meta_.has_lob_header(), pattern_val))) { LOG_WARN("failed to read pattern", K(ret), K(pattern_val)); } else { if (!pattern_inrow.is_null() && !pattern_inrow.is_nop()) { pattern_inrow.set_string(pattern_val); } ret = like_text_vectorized_inner_vec2(expr, ctx, skip, bound, text, &pattern_inrow); } } return ret; } // only text is vectorized, check pattern validation and mode first, then try to match each text. int ObExprLike::eval_like_expr_vector_only_text_vectorized(VECTOR_EVAL_FUNC_ARG_DECL) { int ret = OB_SUCCESS; ObExpr &pattern = *expr.args_[1]; ObExpr &escape = *expr.args_[2]; if (OB_FAIL(pattern.eval_vector(ctx, skip, bound))) { LOG_WARN("eval pattern failed", K(ret)); } else if (OB_FAIL(escape.eval_vector(ctx, skip, bound))) { LOG_WARN("eval escape failed", K(ret)); // the third arg escape must be varchar } else if (OB_FAIL(expr.args_[0]->eval_vector(ctx, skip, bound))) { LOG_WARN("eval text batch failed", K(ret)); } else { VectorFormat text_format = expr.args_[0]->get_format(ctx); VectorFormat res_format = expr.get_format(ctx); if (VEC_DISCRETE == text_format && VEC_DISCRETE == res_format) { ret = vector_like(VECTOR_EVAL_FUNC_ARG_LIST); } else if (VEC_UNIFORM == text_format && VEC_DISCRETE == res_format) { ret = vector_like(VECTOR_EVAL_FUNC_ARG_LIST); } else if (VEC_CONTINUOUS == text_format && VEC_DISCRETE == res_format) { ret = vector_like(VECTOR_EVAL_FUNC_ARG_LIST); } else if (VEC_DISCRETE == text_format && VEC_UNIFORM == res_format) { ret = vector_like(VECTOR_EVAL_FUNC_ARG_LIST); } else if (VEC_UNIFORM == text_format && VEC_UNIFORM == res_format) { ret = vector_like(VECTOR_EVAL_FUNC_ARG_LIST); } else if (VEC_CONTINUOUS == text_format && VEC_UNIFORM == res_format) { ret = vector_like(VECTOR_EVAL_FUNC_ARG_LIST); } else { ret = vector_like(VECTOR_EVAL_FUNC_ARG_LIST); } } if (OB_FAIL(ret)) { LOG_WARN("failed to eval_like_expr_batch_only_text_vectorized", K(ret)); } return ret; } DEF_SET_LOCAL_SESSION_VARS(ObExprLike, raw_expr) { int ret = OB_SUCCESS; SET_LOCAL_SYSVAR_CAPACITY(1); EXPR_ADD_LOCAL_SYSVAR(share::SYS_VAR_COLLATION_CONNECTION); return ret; } } }