Files
oceanbase/src/sql/engine/expr/ob_expr_soundex.cpp
2023-04-27 11:11:24 +00:00

385 lines
15 KiB
C++

/**
* Copyright (c) 2021 OceanBase
* OceanBase CE is licensed under Mulan PubL v2.
* You can use this software according to the terms and conditions of the Mulan PubL v2.
* You may obtain a copy of Mulan PubL v2 at:
* http://license.coscl.org.cn/MulanPubL-2.0
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PubL v2 for more details.
*/
#define USING_LOG_PREFIX SQL_ENG
#include "sql/engine/expr/ob_expr_soundex.h"
#include "sql/engine/ob_exec_context.h"
#include "sql/engine/expr/ob_expr_util.h"
#include "sql/engine/expr/ob_expr_lob_utils.h"
namespace oceanbase
{
namespace sql
{
using namespace common;
/**
 * Construct the SOUNDEX expression operator.
 *
 * Forwards the allocator together with the expression type T_FUN_SYS_SOUNDEX,
 * the function name N_SOUNDEX and the flags VALID_FOR_GENERATED_COL /
 * NOT_ROW_DIMENSION to the ObFuncExprOperator base.
 * NOTE(review): the literal 1 is presumably the parameter count (type deduction
 * is done by calc_result_type1) -- confirm against ObFuncExprOperator.
 */
ObExprSoundex::ObExprSoundex(ObIAllocator &alloc)
    : ObFuncExprOperator(alloc,
                         T_FUN_SYS_SOUNDEX,
                         N_SOUNDEX,
                         1,
                         VALID_FOR_GENERATED_COL,
                         NOT_ROW_DIMENSION)
{
}
/**
 * Deduce the result type of SOUNDEX(type1) and the calc type of its argument.
 *
 * Argument calc type (same rule in both modes):
 *   - non-string input is calculated as ObVarcharType / CS_TYPE_UTF8MB4_GENERAL_CI;
 *   - string input keeps its type, and its collation is switched to
 *     CS_TYPE_UTF8MB4_GENERAL_CI only when ObCharset::is_cs_nonascii() reports
 *     true for the original collation.
 *
 * Result type:
 *   - Oracle mode: length 4; ObNVarchar2Type with the session national
 *     collation for nstring input, otherwise ObVarcharType with the session
 *     collation.
 *   - MySQL mode: see the per-branch comments below.
 *
 * @param type      [out] result type; only written when deduction succeeds.
 * @param type1     [in/out] argument type; calc type/collation are set on success.
 * @param type_ctx  supplies the session and the default collation.
 * @return OB_SUCCESS, OB_ERR_UNEXPECTED when the session is NULL, or
 *         OB_INVALID_ARGUMENT when any deduced value was left unset.
 */
int ObExprSoundex::calc_result_type1(
    ObExprResType &type,
    ObExprResType &type1,
    ObExprTypeCtx &type_ctx) const
{
  int ret = OB_SUCCESS;
  const int64_t oracle_result_length = 4;
  const ObLengthSemantics res_len_semantics = LS_CHAR;
  const bool input_is_string = type1.is_string_type();
  ObObjType param_calc_type = type1.get_type();
  ObCollationType param_calc_cs_type = type1.get_collation_type();
  ObObjType res_type = ObMaxType;
  ObCollationType res_cs_type = CS_TYPE_INVALID;
  int64_t res_length = OB_INVALID_SIZE;
  const ObSQLSessionInfo *session = type_ctx.get_session();

  if (OB_ISNULL(session)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("session is null", K(ret));
  } else {
    // How the argument is evaluated does not depend on the compat mode.
    if (!input_is_string) {
      param_calc_type = ObVarcharType;
      param_calc_cs_type = CS_TYPE_UTF8MB4_GENERAL_CI;
    } else if (ObCharset::is_cs_nonascii(type1.get_collation_type())) {
      param_calc_cs_type = CS_TYPE_UTF8MB4_GENERAL_CI;
    }

    if (is_oracle_mode()) {
      ObSessionNLSParams nls_param = session->get_session_nls_params();
      const bool is_national = input_is_string && type1.is_nstring();
      res_type = is_national ? ObNVarchar2Type : ObVarcharType;
      res_cs_type = is_national ? nls_param.nls_nation_collation_ : nls_param.nls_collation_;
      res_length = oracle_result_length;
    } else if (!input_is_string) {
      // MySQL mode, non-string input: varchar in the context collation.
      res_type = ObVarcharType;
      res_length = OB_MAX_BINARY_LENGTH;
      res_cs_type = type_ctx.get_coll_type();
    } else {
      // MySQL mode, string input: the result keeps the input collation.
      res_cs_type = type1.get_collation_type();
      if (type1.is_character_type()) {
        res_type = ObVarcharType;
        // the result is never shorter than MIN_RESULT_LENGTH.
        res_length = MAX(MIN_RESULT_LENGTH, type1.get_length());
      } else if (ObTinyTextType == type1.get_type()) {
        res_type = ObVarcharType;
        res_length = OB_MAX_BINARY_LENGTH;
      } else {
        res_type = ObLongTextType;
        res_length = OB_MAX_LONGTEXT_LENGTH;
      }
    }
  }

  if (OB_FAIL(ret)) {
    // already logged above
  } else {
    const bool all_deduced = ObMaxType != param_calc_type
                             && CS_TYPE_INVALID != param_calc_cs_type
                             && ObMaxType != res_type
                             && CS_TYPE_INVALID != res_cs_type
                             && res_length > 0;
    if (OB_UNLIKELY(!all_deduced)) {
      ret = OB_INVALID_ARGUMENT;
      LOG_WARN("some value not set", K(ret), K(type1), K(param_calc_type), K(param_calc_cs_type),
               K(res_type), K(res_cs_type), K(res_length));
    } else {
      type1.set_calc_type(param_calc_type);
      type1.set_calc_collation_type(param_calc_cs_type);
      type.set_type(res_type);
      type.set_collation_type(res_cs_type);
      type.set_collation_level(type1.get_collation_level());
      type.set_length(res_length);
      type.set_length_semantics(res_len_semantics);
    }
  }
  return ret;
}
/*
 * Map one character to its soundex code. ASCII letters are treated
 * case-insensitively:
 *   B, F, P, V             => 1
 *   C, G, J, K, Q, S, X, Z => 2
 *   D, T                   => 3
 *   L                      => 4
 *   M, N                   => 5
 *   R                      => 6
 *   A, E, I, O, U, H, W, Y => 0
 * Every character outside 'a'..'z' and 'A'..'Z' yields -1.
 */
int8_t ObExprSoundex::get_character_code(const int32_t wchar)
{
  // Indexed by the letter's distance from 'a' (or 'A').
  static const int8_t code_of_letter[26] = {
    /* a */ 0, /* b */ 1, /* c */ 2, /* d */ 3, /* e */ 0, /* f */ 1, /* g */ 2,
    /* h */ 0, /* i */ 0, /* j */ 2, /* k */ 2, /* l */ 4, /* m */ 5, /* n */ 5,
    /* o */ 0, /* p */ 1, /* q */ 2, /* r */ 6, /* s */ 2, /* t */ 3,
    /* u */ 0, /* v */ 1, /* w */ 0, /* x */ 2, /* y */ 0, /* z */ 2
  };
  int32_t letter_idx = -1;
  if (static_cast<int32_t>('a') <= wchar && wchar <= static_cast<int32_t>('z')) {
    letter_idx = wchar - static_cast<int32_t>('a');
  } else if (static_cast<int32_t>('A') <= wchar && wchar <= static_cast<int32_t>('Z')) {
    letter_idx = wchar - static_cast<int32_t>('A');
  }
  int8_t res = -1;
  if (letter_idx >= 0) {
    res = code_of_letter[letter_idx];
  }
  return res;
}
int ObExprSoundex::convert_str_to_soundex(const ObString &input,
const ObCollationType input_cs_type,
const bool use_original_algo,
const bool fix_min_len,
char *buf, const int64_t len, int64_t &pos,
bool &is_first, int8_t &last_soundex_code)
{
int ret = OB_SUCCESS;
ObStringScanner scanner(input, input_cs_type);
ObString tmp_str;
int32_t wchar = 0;
int8_t soundex_code = 0;
int8_t pre_code = last_soundex_code;
if (OB_UNLIKELY(len < MIN_RESULT_LENGTH)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("buf is not enough", K(ret), K(len));
}
while(OB_SUCC(ret) && !(fix_min_len && pos >= MIN_RESULT_LENGTH)) {
if (OB_FAIL(scanner.next_character(tmp_str, wchar))) {
if (OB_UNLIKELY(OB_ITER_END != ret)) {
LOG_WARN("get next character failed", K(ret), K(input));
}
} else if (FALSE_IT(soundex_code = get_character_code(wchar))) {
} else if (0 == pos && is_first) {
if (soundex_code >= 0) {
// only expect alphabetic character as beginning of result.
if (wchar <= static_cast<int32_t>('z') && wchar >= static_cast<int32_t>('a')) {
buf[pos++] = wchar - 'a' + 'A';
} else if (wchar <= static_cast<int32_t>('Z') && wchar >= static_cast<int32_t>('A')) {
buf[pos++] = wchar;
} else {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("unexpected character", K(ret), K(wchar), K(soundex_code));
}
pre_code = soundex_code;
is_first = false;
} else {
// ignore nonalphabetic character
}
} else {
if (soundex_code > 0) {
// middle alphabetic character, ignore if same as pre_code.
if (pre_code != soundex_code) {
if (OB_UNLIKELY(pos >= len)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("buf is not enough", K(ret), K(len), K(pos), K(input));
} else {
buf[pos++] = soundex_code + '0';
pre_code = soundex_code;
}
}
} else {
// middle nonalphabetic character
// just ignore if use original algorithm, otherwise reset pre_code.
pre_code = use_original_algo ? pre_code : 0;
}
}
}
if (OB_ITER_END == ret) {
ret = OB_SUCCESS;
}
if (OB_SUCC(ret)) {
// pad zero if result shorter than MIN_RESULT_LENGTH and not empry
if (pos > 0) {
while(pos < MIN_RESULT_LENGTH) {
buf[pos++] = '0';
}
last_soundex_code = pre_code;
}
}
return ret;
}
int ObExprSoundex::calc(const ObString &input, const ObCollationType intput_cs_type,
const ObCollationType res_cs_type,
ObIAllocator &tmp_alloc, ObIAllocator &res_alloc,
ObString &out)
{
int ret = OB_SUCCESS;
const bool need_charset_convert = ObCharset::is_cs_nonascii(res_cs_type);
const bool is_oracle_mode = lib::is_oracle_mode();
const int64_t buf_len = is_oracle_mode ? MIN_RESULT_LENGTH : MAX(MIN_RESULT_LENGTH, input.length());
char *buf = NULL;
int64_t pos = 0;
if (need_charset_convert) {
buf = static_cast<char *>(tmp_alloc.alloc(buf_len));
} else {
buf = static_cast<char *>(res_alloc.alloc(buf_len));
}
bool is_first = true;
int8_t last_soundex_code = 0;
if (OB_ISNULL(buf)) {
ret = OB_ALLOCATE_MEMORY_FAILED;
LOG_WARN("allocate memory failed", K(ret), K(buf_len));
} else if (OB_FAIL(convert_str_to_soundex(input, intput_cs_type, !is_oracle_mode,
is_oracle_mode, buf, buf_len, pos,
is_first, last_soundex_code))) {
LOG_WARN("calc soundex failed", K(ret));
} else if (need_charset_convert) {
if (OB_FAIL(ObExprUtil::convert_string_collation(ObString(pos, buf),
CS_TYPE_UTF8MB4_GENERAL_CI,
out,
res_cs_type,
res_alloc))) {
LOG_WARN("convert string collation failed", K(ret));
}
} else {
out.assign_ptr(buf, pos);
}
return ret;
}
int ObExprSoundex::calc_text(const ObDatum &input_datum,
const ObObjType input_type,
const ObObjType res_type,
const ObCollationType input_cs_type,
const ObCollationType res_cs_type,
const bool input_has_lob_header,
ObIAllocator &tmp_alloc, ObIAllocator &res_alloc,
ObString &out,
bool has_lob_header)
{
int ret = OB_SUCCESS;
const bool need_charset_convert = ObCharset::is_cs_nonascii(res_cs_type);
const bool is_oracle_mode = lib::is_oracle_mode();
char *buf = NULL;
int64_t pos = 0;
ObTextStringIter input_iter(input_type, input_cs_type, input_datum.get_string(), input_has_lob_header);
ObTextStringResult out_result(res_type, has_lob_header, &res_alloc);
int64_t buf_size = 0;
int64_t data_len = 0;
if (OB_FAIL(input_iter.init(0, NULL, &tmp_alloc))) {
LOG_WARN("init input_iter failed ", K(ret), K(input_iter));
} else if (OB_FAIL(input_iter.get_byte_len(data_len))) {
LOG_WARN("get input iter data len failed ", K(ret), K(input_iter));
} else if (FALSE_IT(buf_size = is_oracle_mode ? MIN_RESULT_LENGTH : MAX(MIN_RESULT_LENGTH, data_len))) {
} else if (OB_FAIL(out_result.init(buf_size))) {
LOG_WARN("init lob result failed", K(ret), K(out_result), K(buf_size));
} else if (OB_FAIL(out_result.get_reserved_buffer(buf, buf_size))) {
LOG_WARN("get empty buffer failed", K(ret), K(buf_size));
} else if (OB_ISNULL(buf)) {
ret = OB_ALLOCATE_MEMORY_FAILED;
LOG_WARN("allocate memory failed", K(ret), K(buf_size));
} else {
bool is_first = true;
int8_t last_soundex_code = 0;
ObTextStringIterState state;
ObString input_data;
ObString block_out;
char *block_buf = NULL;
while (OB_SUCC(ret)
&& buf_size > 0
&& (state = input_iter.get_next_block(input_data)) == TEXTSTRING_ITER_NEXT) {
ObDataBuffer buf_alloc(buf, buf_size);
int64_t block_buf_len = is_oracle_mode
? MIN_RESULT_LENGTH : MAX(MIN_RESULT_LENGTH, input_data.length());
if (need_charset_convert) {
block_buf = static_cast<char *>(tmp_alloc.alloc(block_buf_len));
} else {
block_buf = static_cast<char *>(buf_alloc.alloc(block_buf_len));
}
if (OB_FAIL(convert_str_to_soundex(input_data, input_cs_type, !is_oracle_mode,
is_oracle_mode, block_buf, block_buf_len, pos,
is_first, last_soundex_code))) {
LOG_WARN("calc soundex failed", K(ret));
} else if (need_charset_convert) {
if (OB_FAIL(ObExprUtil::convert_string_collation(ObString(pos, block_buf),
CS_TYPE_UTF8MB4_GENERAL_CI,
block_out,
res_cs_type,
buf_alloc))) {
LOG_WARN("convert string collation failed", K(ret));
} else if (OB_FAIL(out_result.lseek(block_out.length(), 0))) {
LOG_WARN("result lseek failed", K(ret));
}
} else { // nocharset convert
if (OB_FAIL(out_result.lseek(pos, 0))) {
LOG_WARN("result lseek failed", K(ret));
}
}
if (OB_SUCC(ret)) {
if (need_charset_convert) {
buf = buf + block_out.length();
buf_size = buf_size - block_out.length();
} else {
buf = buf + pos;
buf_size = buf_size - pos;
}
}
}
out_result.get_result_buffer(out);
}
return ret;
}
/**
 * Code-generation hook: validates the runtime expression and installs the
 * evaluation function.
 *
 * The first two parameters are unused. The checks require exactly one
 * argument, a non-NULL args_ array with a non-NULL first entry, and a string
 * result type.
 * NOTE(review): CK is a project macro; presumably it sets `ret` to an error
 * when a condition is false -- confirm against its definition.
 *
 * @param rt_expr  runtime expression whose eval_func_ is set to eval_soundex.
 * @return `ret`, which starts as OB_SUCCESS and is only touched by the CK checks.
 */
int ObExprSoundex::cg_expr(ObExprCGCtx &, const ObRawExpr &, ObExpr &rt_expr) const
{
int ret = OB_SUCCESS;
CK(1 == rt_expr.arg_cnt_);
CK(NULL != rt_expr.args_ && NULL != rt_expr.args_[0]);
CK(ob_is_string_type(rt_expr.datum_meta_.type_));
// eval_func_ is assigned unconditionally, whatever the checks above left in ret.
rt_expr.eval_func_ = eval_soundex;
return ret;
}
int ObExprSoundex::eval_soundex(const ObExpr &expr, ObEvalCtx &ctx, ObDatum &expr_datum)
{
int ret = OB_SUCCESS;
ObDatum *param = NULL;
ObExprStrResAlloc expr_res_alloc(expr, ctx);
ObEvalCtx::TempAllocGuard tmp_alloc_guard(ctx);
if (OB_FAIL(expr.args_[0]->eval(ctx, param))) {
LOG_WARN("evaluate parameters failed", K(ret));
} else if (param->is_null()) {
expr_datum.set_null();
} else {
const ObCollationType res_cs_type = expr.datum_meta_.cs_type_;
const ObCollationType input_cs_type = expr.args_[0]->datum_meta_.cs_type_;
ObString out;
const ObObjType input_type = expr.args_[0]->datum_meta_.type_;
const ObObjType res_type = expr.datum_meta_.type_;
if (!ob_is_text_tc(input_type) && !ob_is_text_tc(res_type)) {
ret = calc(param->get_string(), input_cs_type, res_cs_type,
tmp_alloc_guard.get_allocator(),
expr_res_alloc, out);
} else { // text tc
const bool input_has_lob_header = expr.args_[0]->obj_meta_.has_lob_header();
bool has_lob_header = expr.obj_meta_.has_lob_header();
ret = calc_text(*param, input_type, res_type, input_cs_type, res_cs_type,
input_has_lob_header,
tmp_alloc_guard.get_allocator(),
expr_res_alloc, out, has_lob_header);
}
if (OB_FAIL(ret)) {
LOG_WARN("calc soundex failed", K(ret));
} else if (out.empty() && is_oracle_mode()) {
expr_datum.set_null();
} else {
expr_datum.set_string(out);
}
}
return ret;
}
} // sql
} // oceanbase