Files
oceanbase/src/sql/engine/expr/ob_expr_regexp_context.cpp
2023-04-13 09:05:30 +00:00

893 lines
38 KiB
C++

/**
* Copyright (c) 2021 OceanBase
* OceanBase CE is licensed under Mulan PubL v2.
* You can use this software according to the terms and conditions of the Mulan PubL v2.
* You may obtain a copy of Mulan PubL v2 at:
* http://license.coscl.org.cn/MulanPubL-2.0
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PubL v2 for more details.
*/
#define USING_LOG_PREFIX LIB
#include <stdlib.h>
#include <locale.h>
#include <cstring>
#include "lib/oblog/ob_log.h"
#include "lib/allocator/ob_malloc.h"
#include "lib/charset/ob_charset.h"
#include "sql/engine/expr/ob_expr_regexp_context.h"
#include "sql/engine/expr/ob_expr_util.h"
#include "sql/resolver/expr/ob_raw_expr_util.h"
#include "sql/session/ob_sql_session_info.h"
namespace oceanbase
{
using namespace common;
namespace sql
{
// Creates an empty context: not initialised, no compile flags and no ICU
// engine attached. The matching entry points in this file return OB_NOT_INIT
// until init() has succeeded.
ObExprRegexContext::ObExprRegexContext()
    : ObExprOperatorCtx(), inited_(false), cflags_(0), regexp_engine_(NULL)
{}
// Tears the context down through destroy(), which closes the ICU engine of an
// initialised context.
ObExprRegexContext::~ObExprRegexContext() { destroy(); }
// Returns the context to its un-initialised state; simply forwards to destroy().
void ObExprRegexContext::reset() { destroy(); }
// Drops the compiled regular expression of an initialised context.
// Does nothing when the context was never (successfully) initialised.
void ObExprRegexContext::destroy()
{
  if (!inited_) {
    return;
  }
  inited_ = false;
  cflags_ = 0;
  if (NULL != regexp_engine_) {
    uregex_close(regexp_engine_);
    regexp_engine_ = NULL;
  }
}
/**
 * Compiles a pattern into an ICU regular expression and stores it in this context.
 *
 * Steps, in order:
 *   1. validate the arguments;
 *   2. bring the pattern into UTF-16 (transcoding it when pattern_cs_type is not
 *      one of the UTF-16 collations);
 *   3. run preprocess_pattern() on the UTF-16 pattern;
 *   4. if the context is reusable, already initialised and both pattern and flags
 *      are unchanged, keep the previous compilation;
 *   5. otherwise save the pattern, compile it with uregex_open() and apply the
 *      session's regexp stack and time limits.
 *
 * @param string_buf      allocator used for every temporary and for the saved pattern
 * @param session_info    session supplying the regexp stack/time limits; must not be NULL
 * @param origin_pattern  pattern bytes, encoded according to pattern_cs_type
 * @param cflags          ICU URegexpFlag bit set
 * @param reusable        allow init() to be called again on an initialised context
 * @param pattern_cs_type collation of origin_pattern
 * @return OB_SUCCESS, OB_INIT_TWICE (initialised and not reusable), OB_INVALID_ARGUMENT,
 *         OB_ALLOCATE_MEMORY_FAILED, OB_ERR_UNEXPECTED, or the code produced by
 *         check_icu_regexp_status() for a pattern ICU rejects.
 */
int ObExprRegexContext::init(ObExprStringBuf &string_buf,
                             ObSQLSessionInfo *session_info,
                             const ObString &origin_pattern,
                             const uint32_t cflags,
                             const bool reusable,
                             const ObCollationType pattern_cs_type)
{
  int ret = OB_SUCCESS;
  ObString pattern;
  ObString origin_pattern_utf16;
  if (OB_UNLIKELY(inited_ && !reusable)) {
    ret = OB_INIT_TWICE;
    LOG_WARN("already inited", K(ret), K(this));
  } else if (origin_pattern.length() < 0 ||
             (origin_pattern.length() > 0 && OB_ISNULL(origin_pattern.ptr()))) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid param pattern", K(ret), K(origin_pattern));
  } else if (CS_TYPE_UTF16_BIN != pattern_cs_type &&
             CS_TYPE_UTF16_GENERAL_CI != pattern_cs_type) {
    // The pattern is not UTF-16 yet (the original note says this path serves
    // nchar/nvarchar patterns), so it has to be transcoded first.
    if (origin_pattern.length() > 0) {
      // Any non-empty pattern is transcoded. A one-byte pattern such as 'a' is a
      // perfectly valid pattern in a single-byte or variable-width charset and
      // must not be mistaken for an empty one.
      if (OB_FAIL(ObExprUtil::convert_string_collation(origin_pattern,
                                                       pattern_cs_type,
                                                       origin_pattern_utf16,
                                                       ObCharset::is_bin_sort(pattern_cs_type) ? CS_TYPE_UTF16_BIN : CS_TYPE_UTF16_GENERAL_CI,
                                                       string_buf))) {
        LOG_WARN("convert charset failed", K(ret));
      }
    } else {
      //because uregex_open returns error if u_pattern_length is 0 or u_pattern is null,
      //use ".{0}" to represent an empty pattern when the valid length of the pattern is 0,
      //for example: regexp_count(convert(t1.c1, 'utf8'),'a')
      ObString const_pattern(".{0}");
      if (OB_FAIL(ObExprUtil::convert_string_collation(const_pattern,
                                                       CS_TYPE_UTF8MB4_BIN,
                                                       origin_pattern_utf16,
                                                       CS_TYPE_UTF16_BIN,
                                                       string_buf))) {
        LOG_WARN("convert charset failed", K(ret));
      }
    }
  } else {
    // Already UTF-16: use the caller's bytes directly.
    origin_pattern_utf16 = origin_pattern;
  }
  if (OB_FAIL(ret)) {
  } else if (OB_UNLIKELY(origin_pattern_utf16.length() % sizeof(UChar) != 0)) {
    // UTF-16 data must consist of whole 16-bit code units.
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("invalid param, source text is null", K(ret), K(origin_pattern_utf16.length()));
  } else if (OB_FAIL(preprocess_pattern(string_buf, origin_pattern_utf16, pattern))) {
    LOG_WARN("failed to prepare process pattern", K(ret), K(origin_pattern_utf16), K(pattern));
  } else if (reusable && inited_ &&
             pattern_ == ObString(0, pattern.length(), pattern.ptr())
             && cflags_ == cflags) {
    // reuse the previous compile result.
  } else {
    if (inited_) { // reusable && pattern changed
      reset();
    }
    pattern_allocator_.prepare(string_buf);
    pattern_wc_allocator_.prepare(string_buf);
    // Keep a private copy of the pattern so that the next init() call can
    // compare against it when deciding whether to reuse the compilation.
    char *pattern_save = static_cast<char *>(pattern_allocator_.alloc(pattern.length()));
    if (NULL == pattern_save) {
      ret = OB_ALLOCATE_MEMORY_FAILED;
      LOG_WARN("allocate memory failed", K(ret));
    } else {
      MEMCPY(pattern_save, pattern.ptr(), pattern.length());
      pattern_.assign_ptr(pattern_save, pattern.length());
      cflags_ = cflags;
    }
    int32_t u_pattern_length = 0;
    UChar *u_pattern = NULL;
    UParseError parse_error;
    UErrorCode u_error_code = U_ZERO_ERROR;
    int64_t regexp_stack_limit = 0;
    int64_t regexp_time_limit = 0;
    if (OB_FAIL(ret)) {
    } else if (OB_FAIL(get_valid_unicode_string(string_buf, pattern, u_pattern, u_pattern_length))) {
      LOG_WARN("failed to get valid unicode string", K(ret));
    } else if (OB_ISNULL(u_pattern) || OB_ISNULL(session_info)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("get unexpcted null", K(ret), K(pattern), K(u_pattern_length), K(session_info));
    } else if (OB_FAIL(session_info->get_regexp_stack_limit(regexp_stack_limit)) ||
               OB_FAIL(session_info->get_regexp_time_limit(regexp_time_limit))) {
      LOG_WARN("failed to get regexp_stack_limit or get_regexp_time_limit", K(ret),
               K(regexp_stack_limit), K(regexp_time_limit));
    } else {
      // The three ICU calls share one status variable, which is inspected once
      // afterwards; the setters are expected to follow ICU's convention of doing
      // nothing when the incoming status already reports a failure.
      regexp_engine_ = uregex_open(u_pattern, u_pattern_length, cflags, &parse_error, &u_error_code);
      uregex_setStackLimit(regexp_engine_, regexp_stack_limit, &u_error_code);
      uregex_setTimeLimit(regexp_engine_, regexp_time_limit, &u_error_code);
      if (OB_FAIL(check_icu_regexp_status(u_error_code, &parse_error))) {
        LOG_WARN("failed to check icu regexp status", K(ret));
        // Do not keep a half-configured engine around.
        if (regexp_engine_ != NULL) {
          uregex_close(regexp_engine_);
          regexp_engine_ = NULL;
        }
      } else {
        inited_ = true;
      }
    }
  }
  return ret;
}
/**
 * Reports whether the compiled pattern matches anywhere in `text` at or after
 * code-unit index `start`.
 *
 * @param string_buf allocator for the converted copy of the text
 * @param text       subject text (its byte length must be a multiple of sizeof(UChar))
 * @param start      index handed to uregex_find()
 * @param result     set to the outcome of uregex_find(); false on entry
 * @return OB_SUCCESS, OB_NOT_INIT, or a conversion / ICU status error
 */
int ObExprRegexContext::match(ObExprStringBuf &string_buf,
                              const ObString &text,
                              const int64_t start,
                              bool &result) const
{
  int ret = OB_SUCCESS;
  result = false;
  if (OB_UNLIKELY(!inited_) || OB_ISNULL(regexp_engine_)) {
    ret = OB_NOT_INIT;
    LOG_WARN("regexp context not inited yet", K(ret), K(inited_), K(regexp_engine_));
  } else {
    UChar *subject = NULL;
    int32_t subject_len = 0;
    if (OB_FAIL(get_valid_unicode_string(string_buf, text, subject, subject_len))) {
      LOG_WARN("failed to get valid unicode string", K(ret));
    } else {
      UErrorCode m_error_code = U_ZERO_ERROR;
      uregex_setText(regexp_engine_, subject, subject_len, &m_error_code);
      result = uregex_find(regexp_engine_, start, &m_error_code);
      if (OB_FAIL(check_icu_regexp_status(m_error_code))) {
        LOG_WARN("failed to check icu regexp status", K(ret), K(u_errorName(m_error_code)));
      } else {
        LOG_TRACE("Succeed to match", K(start), K(text.length()), K(result));
      }
    }
  }
  return ret;
}
/**
 * Locates the `occurrence`-th match of the compiled pattern in `text`, searching
 * from index `start`, and returns a 1-based position for capture group `subexpr`.
 *
 * @param return_option zero selects the group's start position, non-zero the
 *                      position just past its end (both shifted by +1)
 * @param result        0 when the text is empty or nothing matched
 * @return OB_SUCCESS, OB_NOT_INIT, or a conversion / ICU status error. In Oracle
 *         mode U_INDEX_OUTOFBOUNDS_ERROR is swallowed and result stays 0.
 */
int ObExprRegexContext::find(ObExprStringBuf &string_buf,
                             const ObString &text,
                             const int64_t start,
                             const int64_t occurrence,
                             const int64_t return_option,
                             const int64_t subexpr,
                             int64_t &result) const
{
  int ret = OB_SUCCESS;
  UChar *subject = NULL;
  int32_t subject_len = 0;
  result = 0;
  if (OB_UNLIKELY(!inited_) || OB_ISNULL(regexp_engine_)) {
    ret = OB_NOT_INIT;
    LOG_WARN("regexp context not inited yet", K(ret), K(inited_), K(regexp_engine_));
  } else if (OB_FAIL(get_valid_unicode_string(string_buf, text, subject, subject_len))) {
    LOG_WARN("failed to get valid unicode string", K(ret));
  } else if (subject_len != 0) {
    UErrorCode m_error_code = U_ZERO_ERROR;
    uregex_setText(regexp_engine_, subject, subject_len, &m_error_code);
    bool found = uregex_find(regexp_engine_, start, &m_error_code);
    // Advance to the requested occurrence, stopping early once matches run out.
    int64_t seen = 1;
    while (seen < occurrence && found) {
      found = uregex_findNext(regexp_engine_, &m_error_code);
      ++seen;
    }
    if (lib::is_oracle_mode() && U_INDEX_OUTOFBOUNDS_ERROR == m_error_code) {
      //compatible oracle, not throw error
    } else if (OB_FAIL(check_icu_regexp_status(m_error_code))) {
      LOG_WARN("failed to check icu regexp status", K(ret), K(u_errorName(m_error_code)));
    } else if (!found) {
      result = 0;
      LOG_TRACE("succeed to regexp instr", K(result), K(start), K(occurrence), K(return_option),
                K(subexpr), K(text), K(text.length()));
    } else {
      // Positions are reported 1-based, hence the +1.
      int64_t start_pos = uregex_start(regexp_engine_, subexpr, &m_error_code) + 1;
      int64_t end_pos = uregex_end(regexp_engine_, subexpr, &m_error_code) + 1;
      if (lib::is_oracle_mode() && U_INDEX_OUTOFBOUNDS_ERROR == m_error_code) {
        //compatible oracle, not throw error
      } else if (OB_FAIL(check_icu_regexp_status(m_error_code))) {
        LOG_WARN("failed to check icu regexp status", K(ret), K(u_errorName(m_error_code)));
      } else {
        if (return_option) {
          result = end_pos;
        } else {
          result = start_pos;
        }
        LOG_TRACE("succeed to regexp instr", K(result), K(start), K(occurrence), K(return_option),
                  K(subexpr), K(text), K(text.length()), K(end_pos), K(start_pos));
      }
    }
  }
  return ret;
}
/**
 * Counts the matches of the compiled pattern in `text`, starting the search at
 * index `start`.
 *
 * @param result number of matches; 0 for empty text, and reset to 0 in Oracle mode
 *               when ICU reports U_INDEX_OUTOFBOUNDS_ERROR
 * @return OB_SUCCESS, OB_NOT_INIT, or a conversion / ICU status error
 */
int ObExprRegexContext::count(ObExprStringBuf &string_buf,
                              const ObString &text,
                              const int32_t start,
                              int64_t &result) const
{
  int ret = OB_SUCCESS;
  UChar *subject = NULL;
  int32_t subject_len = 0;
  result = 0;
  if (OB_UNLIKELY(!inited_) || OB_ISNULL(regexp_engine_)) {
    ret = OB_NOT_INIT;
    LOG_WARN("regexp context not inited yet", K(ret), K(inited_), K(regexp_engine_));
  } else if (OB_FAIL(get_valid_unicode_string(string_buf, text, subject, subject_len))) {
    LOG_WARN("failed to get valid unicode string", K(ret));
  } else if (subject_len != 0) {
    UErrorCode m_error_code = U_ZERO_ERROR;
    uregex_setText(regexp_engine_, subject, subject_len, &m_error_code);
    bool found = uregex_find(regexp_engine_, start, &m_error_code);
    if (found) {
      ++result;
    }
    // Keep asking for further matches until ICU reports there are none left.
    for (;;) {
      if (!uregex_findNext(regexp_engine_, &m_error_code)) {
        break;
      }
      ++result;
    }
    if (lib::is_oracle_mode() && U_INDEX_OUTOFBOUNDS_ERROR == m_error_code) {
      //compatible oracle, not throw error
      result = 0;
      LOG_TRACE("succeed get regexp count", K(found), K(result), K(text), K(start));
    } else if (OB_FAIL(check_icu_regexp_status(m_error_code))) {
      LOG_WARN("failed to check icu regexp status", K(ret), K(u_errorName(m_error_code)));
    } else {
      LOG_TRACE("succeed get regexp count", K(found), K(result), K(text), K(start));
    }
  }
  return ret;
}
/**
 * Extracts capture group `subexpr` of the `occurrence`-th match found in `text`
 * when searching from index `start`.
 *
 * The returned string does not own memory: it points into the caller's `text`
 * buffer (offsets are translated from UTF-16 code units to bytes).
 *
 * @param result empty when nothing matched or the group has zero length
 * @return OB_SUCCESS, OB_NOT_INIT, OB_ERR_UNEXPECTED when the reported match end
 *         lies beyond `text`, or a conversion / ICU status error. In Oracle mode
 *         U_INDEX_OUTOFBOUNDS_ERROR is swallowed.
 */
int ObExprRegexContext::substr(ObExprStringBuf &string_buf,
                               const ObString &text,
                               const int64_t start,
                               const int64_t occurrence,
                               const int64_t subexpr,
                               ObString &result) const
{
  int ret = OB_SUCCESS;
  UChar *subject = NULL;
  int32_t subject_len = 0;
  result.reset();
  if (OB_UNLIKELY(!inited_) || OB_ISNULL(regexp_engine_)) {
    ret = OB_NOT_INIT;
    LOG_WARN("regexp context not inited yet", K(ret), K(inited_), K(regexp_engine_));
  } else if (OB_FAIL(get_valid_unicode_string(string_buf, text, subject, subject_len))) {
    LOG_WARN("failed to get valid unicode string", K(ret));
  } else {
    UErrorCode m_error_code = U_ZERO_ERROR;
    int64_t start_pos = 0;
    int64_t end_pos = 0;
    uregex_setText(regexp_engine_, subject, subject_len, &m_error_code);
    bool found = uregex_find(regexp_engine_, start, &m_error_code);
    // Step forward to the requested occurrence.
    int64_t seen = 1;
    while (seen < occurrence && found) {
      found = uregex_findNext(regexp_engine_, &m_error_code);
      ++seen;
    }
    if (lib::is_oracle_mode() && U_INDEX_OUTOFBOUNDS_ERROR == m_error_code) {
      //compatible oracle, not throw error
    } else if (OB_FAIL(check_icu_regexp_status(m_error_code))) {
      LOG_WARN("failed to check icu regexp status", K(ret), K(u_errorName(m_error_code)));
    } else if (found) {
      start_pos = uregex_start(regexp_engine_, subexpr, &m_error_code);
      end_pos = uregex_end(regexp_engine_, subexpr, &m_error_code);
      const int64_t sublength = end_pos - start_pos;
      if (lib::is_oracle_mode() && U_INDEX_OUTOFBOUNDS_ERROR == m_error_code) {
        //compatible oracle, not throw error
      } else if (OB_FAIL(check_icu_regexp_status(m_error_code))) {
        LOG_WARN("failed to check icu regexp status", K(ret), K(u_errorName(m_error_code)));
      } else if (sublength <= 0) {
        // zero-length group: leave result empty
      } else if (OB_UNLIKELY(sizeof(UChar) * end_pos > text.length())) {
        ret = OB_ERR_UNEXPECTED;
        LOG_WARN("get unexpected error", K(ret), K(sizeof(UChar) * end_pos), K(text.length()));
      } else {
        // Convert code-unit offsets into byte offsets of the original buffer.
        result.assign_ptr(text.ptr() + sizeof(UChar) * start_pos, sublength * sizeof(UChar));
      }
    }
    LOG_TRACE("succeed to regexp instr", K(result), K(start), K(occurrence), K(start_pos), K(found),
              K(end_pos), K(subexpr), K(text), K(text.length()));
  }
  return ret;
}
/**
 * Replaces matches of the compiled pattern in `text_string` with `replace_string`.
 *
 * The search starts at index `start`. When `occurrence` is 0 every match from
 * there on is replaced; otherwise only the `occurrence`-th match is replaced.
 * If the text is empty or nothing matches, `result` aliases `text_string`.
 * Otherwise `result` points at a buffer taken from `string_buf`.
 *
 * The output is assembled in three stages: append_head() copies the untouched
 * prefix, append_replace_str() emits each replacement, and append_tail() copies
 * what follows the last match. Each 16-bit unit of the assembled buffer is then
 * passed through ntohs() before it is handed back.
 *
 * @return OB_SUCCESS, OB_NOT_INIT, OB_ALLOCATE_MEMORY_FAILED, or a conversion /
 *         ICU status error. A failure reported by ICU while scanning for further
 *         matches (for instance exceeding the configured time or stack limit) is
 *         returned as an error instead of silently yielding a partially replaced
 *         string. In Oracle mode U_INDEX_OUTOFBOUNDS_ERROR from the initial
 *         search is swallowed and the text is returned unchanged.
 */
int ObExprRegexContext::replace(ObExprStringBuf &string_buf,
                                const ObString &text_string,
                                const ObString &replace_string,
                                const int64_t start,
                                const int64_t occurrence,
                                ObString &result) const
{
  int ret = OB_SUCCESS;
  UChar *u_text = NULL;
  int32_t u_text_length = 0;
  result.reset();
  if (OB_UNLIKELY(!inited_) || OB_ISNULL(regexp_engine_)) {
    ret = OB_NOT_INIT;
    LOG_WARN("regexp context not inited yet", K(ret), K(inited_), K(regexp_engine_));
  } else if (OB_FAIL(get_valid_unicode_string(string_buf, text_string, u_text, u_text_length))) {
    LOG_WARN("failed to get valid unicode string", K(ret));
  } else if (0 == u_text_length) {
    result = text_string;
  } else {
    UChar *replace_buff = NULL;
    int32_t buff_size = 0;
    int32_t buff_pos = 0;
    UErrorCode m_error_code = U_ZERO_ERROR;
    UChar *u_replace = NULL;
    int32_t u_replace_length = 0;
    uregex_setText(regexp_engine_, u_text, u_text_length, &m_error_code);
    bool found = uregex_find(regexp_engine_, start, &m_error_code);
    // Remember where the match preceding the target one ended: everything up to
    // that point belongs to the untouched head of the output.
    int64_t end_of_previous_match = 0;
    for (int i = 1; i < occurrence && found; ++i) {
      end_of_previous_match = uregex_end(regexp_engine_, 0, &m_error_code);
      found = uregex_findNext(regexp_engine_, &m_error_code);
    }
    if (lib::is_oracle_mode() && U_INDEX_OUTOFBOUNDS_ERROR == m_error_code) {
      result = text_string;
    } else if (OB_FAIL(check_icu_regexp_status(m_error_code))) {
      LOG_WARN("failed to check icu regexp status", K(ret), K(u_errorName(m_error_code)));
    } else if (!found) {
      result = text_string;
    } else if (OB_ISNULL(replace_buff = static_cast<UChar *>(string_buf.alloc(text_string.length())))) {
      ret = OB_ALLOCATE_MEMORY_FAILED;
      LOG_WARN("alloc memory failed.", K(replace_buff), K(text_string.length()), K(ret));
    } else if (OB_FAIL(get_valid_replace_string(string_buf, replace_string, u_replace, u_replace_length))) {
      LOG_WARN("failed to get valid replace string", K(ret));
    } else {
      // Initial capacity equals the input size; the append_* helpers grow the
      // buffer on demand.
      buff_size = text_string.length() / sizeof(UChar);
      if (OB_FAIL(append_head(string_buf,
                              start > end_of_previous_match ? start : end_of_previous_match,
                              replace_buff,
                              buff_size,
                              buff_pos))) {
        LOG_WARN("failed to append head", K(ret));
      } else {
        // Replace the current match, and keep going only in "replace all" mode.
        do {
          if (OB_FAIL(append_replace_str(string_buf, u_replace, u_replace_length,
                                         replace_buff, buff_size, buff_pos))) {
            LOG_WARN("failed to append replace str", K(ret));
          }
        } while (OB_SUCC(ret) && occurrence == 0 && uregex_findNext(regexp_engine_, &m_error_code));
        if (OB_FAIL(ret)) {
          // already reported above
        } else if (OB_FAIL(check_icu_regexp_status(m_error_code))) {
          // uregex_findNext() also returns false when it aborts with an error;
          // without this check such an abort would look like "no more matches".
          LOG_WARN("failed to check icu regexp status", K(ret), K(u_errorName(m_error_code)));
        } else if (OB_FAIL(append_tail(string_buf, replace_buff, buff_size, buff_pos))) {
          LOG_WARN("failed to append tail", K(ret));
        } else {
          // Undo the htons() applied to the input when it was prepared for ICU.
          for (int64_t i = 0; i < buff_pos; ++i) {
            replace_buff[i] = ntohs(static_cast<uint16_t>(replace_buff[i]));
          }
          result.assign_ptr(reinterpret_cast<char *>(replace_buff), buff_pos * sizeof(UChar));
        }
      }
      LOG_TRACE("succeed to regexp replace", K(result), K(start), K(occurrence), K(found),
                K(end_of_previous_match), K(buff_pos), K(text_string),
                K(text_string.length()), K(replace_string),
                K(replace_string.length()));
    }
  }
  return ret;
}
/**
 * Copies the first `current_pos` code units of the engine's current text into
 * the output buffer at `buff_pos`, growing the buffer when it is too small.
 *
 * @param current_pos  number of leading code units to copy; values <= 0 are a no-op
 * @param replace_buff output buffer; replaced by a larger one from string_buf on growth
 * @param buff_size    capacity of replace_buff in code units; updated on growth
 * @param buff_pos     write position; advanced by current_pos on success
 * @return OB_SUCCESS, OB_ERR_UNEXPECTED when current_pos exceeds the text length,
 *         OB_ALLOCATE_MEMORY_FAILED, or an ICU status error
 */
int ObExprRegexContext::append_head(ObExprStringBuf &string_buf,
                                    const int32_t current_pos,
                                    UChar *&replace_buff,
                                    int32_t &buff_size,
                                    int32_t &buff_pos) const
{
  int ret = OB_SUCCESS;
  if (current_pos > 0) {
    int32_t text_length = 0;
    UErrorCode m_error_code = U_ZERO_ERROR;
    const UChar *text = uregex_getText(regexp_engine_, &text_length, &m_error_code);
    if (U_ZERO_ERROR == m_error_code) {
      if (OB_UNLIKELY(current_pos > text_length)) {
        ret = OB_ERR_UNEXPECTED;
        LOG_WARN("get unexpected error", K(ret), K(current_pos), K(text_length));
      } else {
        if (buff_size - buff_pos < current_pos) {
          // Not enough room: move to a buffer twice the required size.
          int32_t required_buffer_size = (buff_pos + current_pos) * 2;
          UChar *tmp_buff = static_cast<UChar *>(string_buf.alloc(required_buffer_size * sizeof(UChar)));
          if (OB_ISNULL(tmp_buff)) {
            ret = OB_ALLOCATE_MEMORY_FAILED;
            LOG_WARN("alloc memory failed.", K(tmp_buff), K(required_buffer_size), K(ret));
          } else {
            MEMCPY(tmp_buff, replace_buff, buff_pos * sizeof(UChar));
            string_buf.free(replace_buff);
            replace_buff = tmp_buff;
            buff_size = required_buffer_size;
          }
        }
        if (OB_SUCC(ret)) {
          MEMCPY(replace_buff + buff_pos, text, current_pos * sizeof(UChar));
        }
      }
      LOG_TRACE("succeed to append head", K(current_pos), K(text_length),
                K(buff_pos), K(buff_size));
    }
    if (OB_FAIL(ret)) {
    } else if (OB_FAIL(check_icu_regexp_status(m_error_code))) {
      LOG_WARN("failed to check icu regexp status", K(u_errorName(m_error_code)), K(ret));
    } else {
      buff_pos += current_pos;
      LOG_TRACE("succeed to append head", K(buff_pos), K(current_pos), K(buff_size));
    }
  }
  return ret;
}
/**
 * Appends the replacement for the engine's current match to the output buffer
 * by calling uregex_appendReplacement().
 *
 * When ICU reports U_BUFFER_OVERFLOW_ERROR the buffer is re-allocated with twice
 * the size ICU asked for, the already written part is copied over, and the call
 * is repeated once.
 *
 * @param u_replace        replacement template handed to ICU
 * @param u_replace_length length of u_replace in code units
 * @param replace_buff     output buffer; may be replaced by a larger one
 * @param buff_size        capacity in code units; updated on growth
 * @param buff_pos         write position; advanced by the size ICU returned
 * @return OB_SUCCESS, OB_ALLOCATE_MEMORY_FAILED, or an ICU status error
 */
int ObExprRegexContext::append_replace_str(ObExprStringBuf &string_buf,
                                           const UChar *u_replace,
                                           const int32_t u_replace_length,
                                           UChar *&replace_buff,
                                           int32_t &buff_size,
                                           int32_t &buff_pos) const
{
  int ret = OB_SUCCESS;
  UErrorCode m_error_code = U_ZERO_ERROR;
  // ICU advances these two through the pointers it is given, so work on copies.
  UChar *write_ptr = replace_buff + buff_pos;
  int32_t room = buff_size - buff_pos;
  int32_t replace_size = uregex_appendReplacement(regexp_engine_,
                                                  u_replace,
                                                  u_replace_length,
                                                  &write_ptr,
                                                  &room,
                                                  &m_error_code);
  if (U_BUFFER_OVERFLOW_ERROR == m_error_code) {
    m_error_code = U_ZERO_ERROR;
    int32_t required_buffer_size = (buff_pos + replace_size) * 2;
    UChar *tmp_buff = static_cast<UChar *>(string_buf.alloc(required_buffer_size * sizeof(UChar)));
    if (OB_ISNULL(tmp_buff)) {
      ret = OB_ALLOCATE_MEMORY_FAILED;
      LOG_WARN("alloc memory failed.", K(tmp_buff), K(required_buffer_size), K(ret));
    } else {
      MEMCPY(tmp_buff, replace_buff, buff_pos * sizeof(UChar));
      string_buf.free(replace_buff);
      replace_buff = tmp_buff;
      buff_size = required_buffer_size;
      // Retry from the same logical position inside the new buffer.
      room = buff_size - buff_pos;
      write_ptr = replace_buff + buff_pos;
      replace_size = uregex_appendReplacement(regexp_engine_,
                                              u_replace,
                                              u_replace_length,
                                              &write_ptr,
                                              &room,
                                              &m_error_code);
    }
  }
  if (OB_FAIL(ret)) {
  } else if (OB_FAIL(check_icu_regexp_status(m_error_code))) {
    LOG_WARN("failed to check icu regexp status", K(ret), K(u_errorName(m_error_code)));
  } else {
    buff_pos += replace_size;
    LOG_TRACE("succeed to append append replace", K(buff_pos), K(replace_size), K(buff_size));
  }
  return ret;
}
/**
 * Appends the text following the last match to the output buffer by calling
 * uregex_appendTail().
 *
 * On U_BUFFER_OVERFLOW_ERROR the buffer is re-allocated to exactly
 * buff_pos + the size ICU asked for, and the call is repeated once.
 *
 * @param replace_buff output buffer; may be replaced by a larger one
 * @param buff_size    capacity in code units; updated on growth
 * @param buff_pos     write position; advanced by the size ICU returned
 * @return OB_SUCCESS, OB_ALLOCATE_MEMORY_FAILED, or an ICU status error
 */
int ObExprRegexContext::append_tail(ObExprStringBuf &string_buf,
                                    UChar *&replace_buff,
                                    int32_t &buff_size,
                                    int32_t &buff_pos) const
{
  int ret = OB_SUCCESS;
  UErrorCode m_error_code = U_ZERO_ERROR;
  // ICU advances these two through the pointers it is given, so work on copies.
  UChar *write_ptr = replace_buff + buff_pos;
  int32_t room = buff_size - buff_pos;
  int32_t tail_size = uregex_appendTail(regexp_engine_, &write_ptr, &room, &m_error_code);
  if (U_BUFFER_OVERFLOW_ERROR == m_error_code) {
    m_error_code = U_ZERO_ERROR;
    int32_t required_buffer_size = buff_pos + tail_size;
    UChar *tmp_buff = static_cast<UChar *>(string_buf.alloc(required_buffer_size * sizeof(UChar)));
    if (OB_ISNULL(tmp_buff)) {
      ret = OB_ALLOCATE_MEMORY_FAILED;
      LOG_WARN("alloc memory failed.", K(tmp_buff), K(required_buffer_size), K(ret));
    } else {
      MEMCPY(tmp_buff, replace_buff, buff_pos * sizeof(UChar));
      string_buf.free(replace_buff);
      replace_buff = tmp_buff;
      buff_size = required_buffer_size;
      write_ptr = replace_buff + buff_pos;
      room = buff_size - buff_pos;
      tail_size = uregex_appendTail(regexp_engine_, &write_ptr, &room, &m_error_code);
    }
  }
  if (OB_FAIL(ret)) {
  } else if (OB_FAIL(check_icu_regexp_status(m_error_code))) {
    LOG_WARN("failed to check icu regexp status", K(u_errorName(m_error_code)), K(ret));
  } else {
    buff_pos += tail_size;
    LOG_TRACE("succeed to append tail", K(buff_pos), K(tail_size), K(buff_size));
  }
  return ret;
}
/**
 * Translates an ICU status code into an OceanBase error code.
 *
 * Anything for which U_SUCCESS() holds maps to OB_SUCCESS. Three specific regex
 * errors get dedicated codes; every other failure becomes OB_ERR_REGEXP_ERROR
 * together with a user-visible message. For U_REGEX_RULE_SYNTAX that message
 * includes the line and offset from `parse_error` when one is supplied.
 *
 * @param u_error_code status produced by an ICU call
 * @param parse_error  optional parse location filled in by uregex_open()
 * @return the mapped error code, or the failure of building the user message
 */
int ObExprRegexContext::check_icu_regexp_status(UErrorCode u_error_code,
                                                const UParseError *parse_error/*=null*/) const
{
  int ret = OB_SUCCESS;
  //maybe we can break down all error types in the future, you can see UErrorCode in utypes.h file.
  if (U_SUCCESS(u_error_code)) {
    //do nothing
  } else if (U_REGEX_MISMATCHED_PAREN == u_error_code) {
    ret = OB_ERR_REGEXP_EPAREN;
    LOG_WARN("unmatched parentheses in regular expression", K(ret));
  } else if (U_REGEX_BAD_ESCAPE_SEQUENCE == u_error_code) {
    ret = OB_ERR_REGEXP_EESCAPE;
    LOG_WARN("invalid escape \\ sequence in regular expression", K(ret));
  } else if (U_REGEX_MISSING_CLOSE_BRACKET == u_error_code) {
    ret = OB_ERR_REGEXP_EBRACK;
    LOG_WARN("nmatched bracket in regular expression", K(ret));
  } else if (U_REGEX_RULE_SYNTAX == u_error_code && NULL != parse_error) {
    // A parse location is available: include it in the user-facing message.
    ObSqlString errmsg;
    if (OB_FAIL(errmsg.append_fmt("%s, Syntax error in regular expression on line %d, character %d.",
                                  u_errorName(u_error_code),
                                  parse_error->line,
                                  parse_error->offset))) {
      LOG_WARN("failed to append fmt", K(ret));
    } else {
      ret = OB_ERR_REGEXP_ERROR;
      LOG_WARN("Syntax error in regular expression", K(ret), K(u_errorName(u_error_code)));
      LOG_USER_ERROR(OB_ERR_REGEXP_ERROR, errmsg.ptr());
    }
  } else {
    // Every remaining failure, including a syntax error without location info.
    ret = OB_ERR_REGEXP_ERROR;
    LOG_WARN("other error in icu regexp", K(ret), K(u_errorName(u_error_code)));
    LOG_USER_ERROR(OB_ERR_REGEXP_ERROR, u_errorName(u_error_code));
  }
  return ret;
}
/**
 * Rewrites a UTF-16 pattern for Oracle compatibility before it is compiled.
 *
 * Oracle accepts the bracket expression "[^][:]", which this function rewrites
 * to "[^:]":
 *     regexp_substr('xxxx','[^][:]') <==> regexp_substr('xxxx','[^:]')
 *
 * The rewrite is applied only when "[^][:]" occurs exactly once in the pattern.
 * With zero or with two or more occurrences, and always in MySQL mode, the
 * pattern is passed through unchanged (`pattern` then aliases `origin_pattern`).
 *
 * @param string_buf     allocator for the rewritten pattern
 * @param origin_pattern pattern as UTF-16 bytes (same byte order as produced by
 *                       converting to CS_TYPE_UTF16_BIN)
 * @param pattern        resulting pattern
 * @return OB_SUCCESS, OB_ALLOCATE_MEMORY_FAILED, OB_ERR_UNEXPECTED, or a charset
 *         conversion error
 */
int ObExprRegexContext::preprocess_pattern(ObExprStringBuf &string_buf,
                                           const ObString &origin_pattern,
                                           ObString &pattern)
{
  int ret = OB_SUCCESS;
  if (lib::is_mysql_mode()) {
    pattern = origin_pattern;
  } else if (origin_pattern.length() / sizeof(UChar) >= strlen("[^][:]")) {
    // ">=" so that a pattern consisting of exactly "[^][:]" is rewritten too.
    ObString const_str1(strlen("[^][:]"), "[^][:]");
    ObString u_const_str1;
    ObArenaAllocator alloc("ObExprRegexp");
    if (OB_FAIL(ObExprUtil::convert_string_collation(const_str1,
                                                     CS_TYPE_UTF8MB4_BIN,
                                                     u_const_str1,
                                                     CS_TYPE_UTF16_BIN,
                                                     alloc))) {
      LOG_WARN("convert charset failed", K(ret));
    } else {
      bool is_continued = true;
      bool need_transform = false;
      const char *origin_buf = origin_pattern.ptr();
      const int32_t origin_buf_len = origin_pattern.length();
      // The buffer holds 16-bit code units, so candidate positions are visited
      // one code unit at a time; comparing at odd byte offsets could "match"
      // across the boundary between two unrelated characters.
      const int32_t unit_size = static_cast<int32_t>(sizeof(UChar));
      int32_t begin_idx = -1;
      for (int32_t i = 0; is_continued && i + u_const_str1.length() <= origin_buf_len; i += unit_size) {
        ObString tmp_str(u_const_str1.length(), origin_buf + i);
        if (0 == tmp_str.compare(u_const_str1)) {
          if (!need_transform) {
            // First occurrence: remember it and continue right after it.
            need_transform = true;
            begin_idx = i;
            i = i + u_const_str1.length() - unit_size;
          } else {
            // Second occurrence: give up on rewriting altogether.
            need_transform = false;
            is_continued = false;
          }
        }
      }
      if (need_transform) {
        ObString const_str2(strlen("[^:]"), "[^:]");
        ObString u_const_str2;
        char *buf = NULL;
        if (OB_UNLIKELY(begin_idx < 0 || begin_idx > origin_buf_len - u_const_str1.length())) {
          ret = OB_ERR_UNEXPECTED;
          LOG_WARN("get unexpected error", K(ret), K(begin_idx), K(origin_buf_len), K(u_const_str1.length()));
        } else if (OB_FAIL(ObExprUtil::convert_string_collation(const_str2,
                                                                CS_TYPE_UTF8MB4_BIN,
                                                                u_const_str2,
                                                                CS_TYPE_UTF16_BIN,
                                                                alloc))) {
          LOG_WARN("convert charset failed", K(ret));
        } else if (OB_ISNULL(buf = static_cast<char *>(string_buf.alloc(
                               origin_buf_len - u_const_str1.length() + u_const_str2.length())))) {
          ret = OB_ALLOCATE_MEMORY_FAILED;
          LOG_WARN("failed to alloc memory", K(origin_pattern), K(ret));
        } else {
          // Assemble: <text before> + "[^:]" + <text after>.
          int32_t buf_len = 0;
          MEMCPY(buf, origin_buf, begin_idx);
          buf_len += begin_idx;
          MEMCPY(buf + buf_len, u_const_str2.ptr(), u_const_str2.length());
          buf_len += u_const_str2.length();
          const int32_t tail_len = origin_buf_len - begin_idx - u_const_str1.length();
          if (tail_len > 0) {
            MEMCPY(buf + buf_len, origin_buf + begin_idx + u_const_str1.length(), tail_len);
            buf_len += tail_len;
          }
          pattern.assign_ptr(buf, buf_len);
          // Log the length-carrying ObString; `buf` itself is not NUL-terminated.
          LOG_TRACE("succeed to preprocess pattern", K(pattern), K(buf_len));
        }
      } else {
        pattern = origin_pattern;
        LOG_TRACE("succeed to preprocess pattern", K(origin_pattern), K(pattern));
      }
    }
  } else {
    // Too short to contain "[^][:]" at all.
    pattern = origin_pattern;
    LOG_TRACE("succeed to preprocess pattern", K(origin_pattern), K(pattern));
  }
  if (OB_SUCC(ret)) {
    LOG_TRACE("succeed to preprocess pattern", K(origin_pattern), K(pattern));
  }
  return ret;
}
/**
 * Builds the ICU flag set from a match-parameter string such as "im".
 *
 * The starting value is UREGEX_CASE_INSENSITIVE unless `is_case_sensitive`, plus
 * UREGEX_UNIX_LINES in Oracle mode. Each character then adjusts it:
 *   'c' clears case-insensitivity, 'i' sets it,
 *   'm' sets UREGEX_MULTILINE, 'n' sets UREGEX_DOTALL,
 *   'u' sets UREGEX_UNIX_LINES (accepted in MySQL mode only),
 *   'x' sets UREGEX_COMMENTS   (accepted in Oracle mode only).
 * Any other character, or 'u'/'x' in the wrong mode, stops processing with
 * OB_INVALID_ARGUMENT.
 *
 * @param match_param       option characters, processed left to right
 * @param is_case_sensitive default case behaviour before options are applied
 * @param flags             resulting flag set
 */
int ObExprRegexContext::get_regexp_flags(const ObString &match_param,
                                         const bool is_case_sensitive,
                                         uint32_t& flags)
{
  int ret = OB_SUCCESS;
  const char *options = match_param.ptr();
  const int option_cnt = match_param.length();
  flags = 0;
  if (!is_case_sensitive) {
    flags = UREGEX_CASE_INSENSITIVE;
  }
  if (lib::is_oracle_mode()) {//compatible oracle
    flags |= UREGEX_UNIX_LINES;
  }
  for (int idx = 0; OB_SUCC(ret) && idx < option_cnt; ++idx) {
    char c = options[idx];
    bool accepted = true;
    switch (c) {
      case 'c':
        flags &= ~UREGEX_CASE_INSENSITIVE;
        break;
      case 'i':
        flags |= UREGEX_CASE_INSENSITIVE;
        break;
      case 'm':
        flags |= UREGEX_MULTILINE;
        break;
      case 'n':
        flags |= UREGEX_DOTALL;
        break;
      case 'u':
        if (lib::is_mysql_mode()) {
          flags |= UREGEX_UNIX_LINES;
        } else {
          accepted = false;
        }
        break;
      case 'x':
        if (lib::is_oracle_mode()) {//compatible oracle
          flags |= UREGEX_COMMENTS;
        } else {
          accepted = false;
        }
        break;
      default:
        accepted = false;
        break;
    }
    if (!accepted) {
      // Shared rejection path for unknown options and mode-restricted ones.
      ret = OB_INVALID_ARGUMENT;
      LOG_WARN("invalid match param", K(match_param), K(c));
      LOG_USER_ERROR(OB_INVALID_ARGUMENT, "use match param in regexp expression");
    }
  }
  return ret;
}
/**
 * Produces a writable UChar copy of `origin_str` for use with the ICU C API.
 *
 * The bytes are copied into memory from `string_buf` and every 16-bit unit is
 * passed through htons() (presumably because the stored UTF-16 is big-endian
 * while ICU expects host order; confirm against the charset layer).
 *
 * An empty input still yields a valid, zero-filled one-unit buffer with
 * u_str_len == 0, so callers always receive a non-null pointer on success.
 * Nothing is copied from an empty input, so memcpy never sees a null source.
 *
 * @param string_buf allocator for the copy
 * @param origin_str source bytes; the length must be a multiple of sizeof(UChar)
 * @param u_str      receives the converted buffer (untouched on failure)
 * @param u_str_len  receives the number of code units (untouched on failure)
 * @return OB_SUCCESS, OB_ERR_UNEXPECTED for an odd byte length, or
 *         OB_ALLOCATE_MEMORY_FAILED
 */
int ObExprRegexContext::get_valid_unicode_string(ObExprStringBuf &string_buf,
                                                 const ObString &origin_str,
                                                 UChar *&u_str,
                                                 int32_t &u_str_len) const
{
  int ret = OB_SUCCESS;
  // Number of source bytes that may actually be read.
  const int32_t src_len = origin_str.empty() ? 0 : origin_str.length();
  // Always allocate at least one code unit so the result is never NULL.
  const int32_t buf_len = (0 == src_len) ? static_cast<int32_t>(sizeof(UChar)) : src_len;
  void *tmp_buf = NULL;
  if (OB_UNLIKELY(buf_len % sizeof(UChar) != 0)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("invalid param, source text is null", K(ret), K(origin_str), K(origin_str.length()));
  } else if (OB_ISNULL(tmp_buf = string_buf.alloc(buf_len))) {
    ret = OB_ALLOCATE_MEMORY_FAILED;
    LOG_WARN("allocate memory failed", K(ret), K(tmp_buf));
  } else {
    MEMSET(tmp_buf, 0, buf_len);
    if (src_len > 0) {
      MEMCPY(tmp_buf, origin_str.ptr(), src_len);
    }
    u_str = static_cast<UChar *>(tmp_buf);
    u_str_len = src_len / sizeof(UChar);
    for (int64_t i = 0; i < u_str_len; ++i) {
      u_str[i] = htons(static_cast<uint16_t>(u_str[i]));
    }
  }
  return ret;
}
// Converts the replacement text into the UChar template handed to
// uregex_appendReplacement().
//
// MySQL mode: the text is only passed through get_valid_unicode_string().
// Oracle mode: the text is additionally rewritten, because Oracle writes group
// references as '\N' while the ICU template syntax uses '$N':
//   - '\N' (N = 1..9, preceded by an odd number of backslashes) becomes '$N';
//     it is dropped entirely when N exceeds the pattern's group count, and it
//     becomes '\\N' when the pattern has no groups at all;
//   - a '$' that is unescaped, or that follows an even number of backslashes,
//     gets a backslash inserted in front of it.
//
// alloc          - allocator for the output buffer
// origin_replace - replacement text as UTF-16 bytes
// u_replace      - receives the template buffer (NULL when nothing was allocated)
// u_replace_len  - receives the template length in code units
// Returns OB_SUCCESS, OB_ERR_UNEXPECTED, OB_ALLOCATE_MEMORY_FAILED, or an ICU
// status error from querying the group count.
int ObExprRegexContext::get_valid_replace_string(ObIAllocator &alloc,
const ObString &origin_replace,
UChar *&u_replace,
int32_t &u_replace_len) const
{
int ret = OB_SUCCESS;
u_replace_len = 0;
u_replace = NULL;
// Twice the input size leaves room for the characters the rewrite inserts.
int32_t buf_len = origin_replace.empty() ? sizeof(UChar) : origin_replace.length() * 2;
if (lib::is_mysql_mode()) {
if (OB_FAIL(get_valid_unicode_string(alloc, origin_replace, u_replace, u_replace_len))) {
LOG_WARN("failed to get valid unicode string", K(ret));
} else {/*do nothing*/}
} else if (OB_UNLIKELY(buf_len % sizeof(UChar) != 0)) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("invalid param, source text is null", K(ret), K(origin_replace),
K(origin_replace.length()));
} else if (OB_ISNULL(u_replace = static_cast<UChar *>(alloc.alloc(buf_len)))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
LOG_WARN("allocate memory failed", K(ret), K(u_replace));
} else if (origin_replace.empty()) {
// Empty replacement: hand back a zeroed buffer with length 0.
MEMSET(u_replace, 0, buf_len);
u_replace_len = 0;
LOG_TRACE("succeed to get valid replace string", K(u_replace_len));
} else {
//oracle mode replace string '\1' <==> '$1' in mysql mode, we need extra convert.
UErrorCode m_error_code = U_ZERO_ERROR;
int32_t group_count = uregex_groupCount(regexp_engine_, &m_error_code);
MEMSET(u_replace, 0, buf_len);
if (OB_FAIL(check_icu_regexp_status(m_error_code))) {
LOG_WARN("failed to check icu regexp status", K(ret), K(u_errorName(m_error_code)));
} else {
const UChar *tmp_buf = static_cast<const UChar *>((void*)origin_replace.ptr());
int32_t tmp_len = origin_replace.length() / sizeof(UChar);
int32_t max_u_replace_len = buf_len / sizeof(UChar);
// Length of the backslash run currently being processed.
int32_t backslash_cnt = 0;
for (int64_t i = 0; i < tmp_len; ++i) {
// Copy one unit (through htons) and then inspect what was just written.
u_replace[u_replace_len++] = htons(static_cast<uint16_t>(tmp_buf[i]));
if (static_cast<uint16_t>(u_replace[u_replace_len - 1]) == 0x5c) {//'\'
bool is_continue = true;
++backslash_cnt;
// Consume the rest of the backslash run plus the first unit after it.
while (i < tmp_len - 1 && is_continue) {
u_replace[u_replace_len++] = htons(static_cast<uint16_t>(tmp_buf[++i]));
if (static_cast<uint16_t>(u_replace[u_replace_len - 1]) == 0x5c) {
++backslash_cnt;
} else {
is_continue = false;
}
}
if (i < tmp_len && u_replace_len < max_u_replace_len) {
// Odd run followed by a digit 1..9: an Oracle group reference.
if (backslash_cnt % 2 == 1 &&
static_cast<uint16_t>(u_replace[u_replace_len - 1]) >= 0x31 &&
static_cast<uint16_t>(u_replace[u_replace_len - 1]) <= 0x39) {//'\1'=>'$1'
if (group_count > 0) {
if (static_cast<uint16_t>(u_replace[u_replace_len - 1]) - 0x30 > group_count) {
//if the specify group num bigger than the total count, just skip, compatible Oracle.
u_replace_len = u_replace_len - 2;
} else {
// Overwrite the last backslash of the run with '$'.
u_replace[u_replace_len - 2] = 0x24;
}
} else if (u_replace_len < max_u_replace_len) {
// Pattern has no groups: turn '\N' into '\\N' by inserting a backslash
// before the digit.
uint16_t tmp_val = static_cast<uint16_t>(u_replace[u_replace_len - 1]);
u_replace[u_replace_len - 1] = 0x5c;
u_replace[u_replace_len++] = tmp_val;
}
} else if (backslash_cnt % 2 == 0 &&
static_cast<uint16_t>(u_replace[u_replace_len - 1]) == 0x24 &&
u_replace_len < max_u_replace_len) {//'\\$' =>'\\\$'
u_replace[u_replace_len - 1] = 0x5c;
u_replace[u_replace_len++] = 0x24;
}
}
// The run has been handled; start counting afresh.
backslash_cnt = 0;
} else if (static_cast<uint16_t>(u_replace[u_replace_len - 1]) == 0x24 &&
u_replace_len < max_u_replace_len) {//'$' ==>'\$'
u_replace[u_replace_len - 1] = 0x5c;
u_replace[u_replace_len++] = 0x24;
} else {//reset
backslash_cnt = 0;
}
}
LOG_TRACE("succeed to get valid replace string", K(tmp_len), K(u_replace_len), K(group_count));
}
}
return ret;
}
/**
 * Sets `need_utf8` according to the result type of `expr` with casts stripped
 * (via ObRawExprUtils::get_real_expr_without_cast): true when that type is
 * nchar, nvarchar2 or blob, false otherwise.
 *
 * @return OB_SUCCESS, the error from stripping the casts, or OB_ERR_UNEXPECTED
 *         when no underlying expression is found
 */
int ObExprRegexContext::check_need_utf8(ObRawExpr *expr, bool &need_utf8)
{
  int ret = OB_SUCCESS;
  const ObRawExpr * real_expr = NULL;
  need_utf8 = false;
  if (OB_FAIL(ObRawExprUtils::get_real_expr_without_cast(expr, real_expr))) {
    LOG_WARN("fail to get real expr without cast", K(ret));
  } else if (OB_ISNULL(real_expr)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("real expr is invalid", K(ret), K(real_expr));
  } else if (real_expr->get_result_type().is_nchar()) {
    need_utf8 = true;
  } else if (real_expr->get_result_type().is_nvarchar2()) {
    need_utf8 = true;
  } else if (real_expr->get_result_type().is_blob()) {
    need_utf8 = true;
  }
  return ret;
}
int ObExprRegexContext::check_binary_compatible(const ObExprResType *types, int64_t num) {
int ret = OB_SUCCESS;
if (OB_ISNULL(types)) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("unexpected null", K(ret));
} else {
int64_t binary_param_idx = -1;
int64_t nobinary_param_idx = -1;
for (int64_t i = 0; i < num; ++i) {
if (ObExprRegexContext::is_binary_string(types[i])) {
binary_param_idx = i;
} else if (!ObExprRegexContext::is_binary_compatible(types[i])) {
nobinary_param_idx = i;
}
}
if (-1 != binary_param_idx && -1 != nobinary_param_idx) {
const char *coll_name1 = ObCharset::collation_name(types[binary_param_idx].get_collation_type());
const char *coll_name2 = ObCharset::collation_name(types[nobinary_param_idx].get_collation_type());
ObString collation1 = ObString::make_string(coll_name1);
ObString collation2 = ObString::make_string(coll_name2);
ret = OB_ERR_MYSQL_CHARACTER_SET_MISMATCH;
LOG_USER_ERROR(OB_ERR_MYSQL_CHARACTER_SET_MISMATCH, collation1.length(), collation1.ptr(), collation2.length(), collation2.ptr());
LOG_WARN("If one of the params is binary string, all of the params should be implicitly castable to binary charset.", K(ret), K(*types));
}
}
return ret;
}
}
}