From 8bd4f54ea14f7513fef6507aa43863f6cb24a9f8 Mon Sep 17 00:00:00 2001 From: SevenJ-swj Date: Fri, 2 Jun 2023 11:17:36 +0000 Subject: [PATCH] fix some charset bugs --- deps/oblib/src/lib/charset/ob_ctype_bin_os.cc | 2 +- .../src/lib/charset/ob_ctype_gb18030_os.cc | 2 +- deps/oblib/src/lib/charset/ob_ctype_mb_os.cc | 2 +- .../src/lib/charset/ob_ctype_simple_os.cc | 2 +- .../oblib/src/lib/charset/ob_ctype_utf8_os.cc | 2 +- .../virtual_table/ob_show_create_table.cpp | 8 +- src/share/schema/ob_schema_printer.cpp | 14 +- src/share/schema/ob_schema_printer.h | 6 +- src/sql/engine/expr/ob_expr_like.cpp | 3 - src/sql/engine/expr/ob_expr_like.h | 4 - src/sql/engine/expr/ob_expr_replace.cpp | 126 ++++++++++-------- src/sql/engine/expr/ob_expr_replace.h | 1 + src/sql/ob_sql_utils.cpp | 6 +- 13 files changed, 98 insertions(+), 80 deletions(-) diff --git a/deps/oblib/src/lib/charset/ob_ctype_bin_os.cc b/deps/oblib/src/lib/charset/ob_ctype_bin_os.cc index f4aec97810..e381d1cfd5 100644 --- a/deps/oblib/src/lib/charset/ob_ctype_bin_os.cc +++ b/deps/oblib/src/lib/charset/ob_ctype_bin_os.cc @@ -248,7 +248,7 @@ int ob_wildcmp_bin_impl(const ObCharsetInfo *cs, int result= -1; while (wild_str != wild_end) { - while (*wild_str != w_many && *wild_str != w_one) { + while ((*wild_str == escape_char) || (*wild_str != w_many && *wild_str != w_one)) { if (*wild_str == escape_char && wild_str+1 != wild_end) { wild_str++; } diff --git a/deps/oblib/src/lib/charset/ob_ctype_gb18030_os.cc b/deps/oblib/src/lib/charset/ob_ctype_gb18030_os.cc index 59825db4de..cd301b7165 100644 --- a/deps/oblib/src/lib/charset/ob_ctype_gb18030_os.cc +++ b/deps/oblib/src/lib/charset/ob_ctype_gb18030_os.cc @@ -19654,7 +19654,7 @@ static int ob_wildcmp_gb18030_impl(const ObCharsetInfo *cs, const char *str, if ((w_len = get_code_and_length(cs, wild_str, wild_end, &w_gb)) == 0) return 1; - if (w_gb == w_many) { + if (w_gb != escape_char && w_gb == w_many) { result = 1; break; } diff --git a/deps/oblib/src/lib/charset/ob_ctype_mb_os.cc b/deps/oblib/src/lib/charset/ob_ctype_mb_os.cc index cba5518a80..dbb55e8b3e 100644 --- a/deps/oblib/src/lib/charset/ob_ctype_mb_os.cc +++ b/deps/oblib/src/lib/charset/ob_ctype_mb_os.cc @@ -151,7 +151,7 @@ int ob_wildcmp_mb_impl(const ObCharsetInfo *cs, { int result= -1; while (wild_str != wild_end) { - while (*wild_str != w_many && *wild_str != w_one) { + while ((*wild_str == escape_char) || (*wild_str != w_many && *wild_str != w_one)) { int l; if (*wild_str == escape_char && wild_str+1 != wild_end) { wild_str++; diff --git a/deps/oblib/src/lib/charset/ob_ctype_simple_os.cc b/deps/oblib/src/lib/charset/ob_ctype_simple_os.cc index 292649d6aa..62f5eef29b 100644 --- a/deps/oblib/src/lib/charset/ob_ctype_simple_os.cc +++ b/deps/oblib/src/lib/charset/ob_ctype_simple_os.cc @@ -949,7 +949,7 @@ static int ob_wildcmp_8bit_impl(const ObCharsetInfo* cs, const char* str_ptr, co int cmp_result = -1; while (wild_str != wild_end) { - while (*wild_str != w_many_char && *wild_str != w_one_char) { + while ((*wild_str == escape_char) || (*wild_str != w_many_char && *wild_str != w_one_char)) { if (*wild_str == escape_char && wild_str + 1 != wild_end) { wild_str++; } diff --git a/deps/oblib/src/lib/charset/ob_ctype_utf8_os.cc b/deps/oblib/src/lib/charset/ob_ctype_utf8_os.cc index 6338ffca48..be26158abe 100644 --- a/deps/oblib/src/lib/charset/ob_ctype_utf8_os.cc +++ b/deps/oblib/src/lib/charset/ob_ctype_utf8_os.cc @@ -2024,7 +2024,7 @@ static int ob_wildcmp_unicode_impl_help(const ObCharsetInfo *cs, ret = 1; *has_returned = 1; break; - } else if (w_wc == (ob_wc_t) w_many) { + } else if (w_wc != (ob_wc_t) escape_char && w_wc == (ob_wc_t) w_many) { result = 1; break; } diff --git a/src/observer/virtual_table/ob_show_create_table.cpp b/src/observer/virtual_table/ob_show_create_table.cpp index 6d3d4b167a..d26e407598 100644 --- a/src/observer/virtual_table/ob_show_create_table.cpp +++ b/src/observer/virtual_table/ob_show_create_table.cpp @@ -230,7 +230,10 @@ int ObShowCreateTable::fill_row_cells_inner(const uint64_t show_table_id, } else { const ObLengthSemantics default_length_semantics = session_->get_local_nls_length_semantics(); // get auto_increment from auto_increment service, not from table option - if (OB_FAIL(schema_printer.print_table_definition(effective_tenant_id_, + ObCharsetType charset_type = CHARSET_INVALID; + if (OB_FAIL(session_->get_character_set_results(charset_type))) { + LOG_WARN("get character set results failed", K(ret)); + } else if (OB_FAIL(schema_printer.print_table_definition(effective_tenant_id_, show_table_id, table_def_buf, table_def_buf_size, @@ -238,7 +241,8 @@ int ObShowCreateTable::fill_row_cells_inner(const uint64_t show_table_id, TZ_INFO(session_), default_length_semantics, false, - session_->get_sql_mode()))) { + session_->get_sql_mode(), + charset_type))) { SERVER_LOG(WARN, "Generate table definition failed", KR(ret), K(effective_tenant_id_), K(show_table_id)); } diff --git a/src/share/schema/ob_schema_printer.cpp b/src/share/schema/ob_schema_printer.cpp index cfe18751d3..d2d7e55b88 100644 --- a/src/share/schema/ob_schema_printer.cpp +++ b/src/share/schema/ob_schema_printer.cpp @@ -62,7 +62,8 @@ int ObSchemaPrinter::print_table_definition(const uint64_t tenant_id, const ObTimeZoneInfo *tz_info, const common::ObLengthSemantics default_length_semantics, bool agent_mode, - ObSQLMode sql_mode) const + ObSQLMode sql_mode, + ObCharsetType charset_type) const { //TODO(yaoying.yyy: refactor this function):consider index_position in @@ -118,7 +119,7 @@ int ObSchemaPrinter::print_table_definition(const uint64_t tenant_id, } if (OB_SUCC(ret)) { - if (OB_FAIL(print_table_definition_columns(*table_schema, buf, buf_len, pos, tz_info, default_length_semantics, agent_mode, sql_mode))) { + if (OB_FAIL(print_table_definition_columns(*table_schema, buf, buf_len, pos, tz_info, default_length_semantics, agent_mode, sql_mode, charset_type))) { SHARE_SCHEMA_LOG(WARN, "fail to print columns", K(ret), K(*table_schema)); } else if (OB_FAIL(print_table_definition_rowkeys(*table_schema, buf, buf_len, pos))) { SHARE_SCHEMA_LOG(WARN, "fail to print rowkeys", K(ret), K(*table_schema)); @@ -154,7 +155,8 @@ int ObSchemaPrinter::print_table_definition_columns(const ObTableSchema &table_s const ObTimeZoneInfo *tz_info, const common::ObLengthSemantics default_length_semantics, bool is_agent_mode, - ObSQLMode sql_mode) const + ObSQLMode sql_mode, + ObCharsetType charset_type) const { int ret = OB_SUCCESS; bool is_first_col = true; @@ -329,7 +331,11 @@ int ObSchemaPrinter::print_table_definition_columns(const ObTableSchema &table_s SHARE_SCHEMA_LOG(WARN, "fail to print sql literal", KPC(col), K(buf), K(buf_len), K(pos), K(ret)); } } else if (ob_is_string_tc(default_value.get_type())) { - if (OB_FAIL(databuff_printf(buf, buf_len, pos, "'%s'", to_cstring(ObHexEscapeSqlStr(default_value.get_string()))))) { + ObCollationType collation_type = ObCharset::get_default_collation(charset_type); + ObString out_str = default_value.get_string(); + if (OB_FAIL(ObCharset::charset_convert(allocator, default_value.get_string(), default_value.get_collation_type(), collation_type, out_str))) { + SHARE_SCHEMA_LOG(WARN, "fail to convert charset", K(ret)); + } else if (OB_FAIL(databuff_printf(buf, buf_len, pos, "'%s'", to_cstring(ObHexEscapeSqlStr(out_str))))) { SHARE_SCHEMA_LOG(WARN, "fail to print default value of string tc", K(ret)); } } else if (OB_FAIL(default_value.print_varchar_literal(buf, buf_len, pos, tz_info))) { diff --git a/src/share/schema/ob_schema_printer.h b/src/share/schema/ob_schema_printer.h index ac3c7a1fa6..90feee5c7f 100644 --- a/src/share/schema/ob_schema_printer.h +++ b/src/share/schema/ob_schema_printer.h @@ -69,7 +69,8 @@ public: const common::ObTimeZoneInfo *tz_info, const common::ObLengthSemantics default_length_semantics, bool agent_mode, - ObSQLMode sql_mode = SMO_DEFAULT) const; + ObSQLMode sql_mode = SMO_DEFAULT, + ObCharsetType charset_type = ObCharsetType::CHARSET_UTF8MB4) const; int print_table_index_stroing( const share::schema::ObTableSchema *index_schema, const share::schema::ObTableSchema *table_schema, @@ -136,7 +137,8 @@ public: const common::ObTimeZoneInfo *tz_info, const common::ObLengthSemantics default_length_semantics, bool is_agent_mode = false, - ObSQLMode sql_mode = SMO_DEFAULT) const; + ObSQLMode sql_mode = SMO_DEFAULT, + ObCharsetType charset_type = ObCharsetType::CHARSET_UTF8MB4) const; int print_generated_column_definition(const ObColumnSchemaV2 &gen_col, char *buf, int64_t buf_len, diff --git a/src/sql/engine/expr/ob_expr_like.cpp b/src/sql/engine/expr/ob_expr_like.cpp index e578e5166b..fea40ce7b1 100644 --- a/src/sql/engine/expr/ob_expr_like.cpp +++ b/src/sql/engine/expr/ob_expr_like.cpp @@ -829,9 +829,6 @@ struct ObNonInstrModeMatcher if (OB_UNLIKELY(text_val.length() <= 0 && pattern_val.length() <= 0)) { // empty string res = 1; - } else if (OB_UNLIKELY(CS_TYPE_UTF8MB4_BIN != coll_type && escape_wc == static_cast('%'))) { - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "escape %"); } else { bool b = ObCharset::wildcmp(coll_type, text_val, pattern_val, escape_wc, static_cast('_'), static_cast('%')); diff --git a/src/sql/engine/expr/ob_expr_like.h b/src/sql/engine/expr/ob_expr_like.h index 62d4eb49c1..e1ac4e273c 100644 --- a/src/sql/engine/expr/ob_expr_like.h +++ b/src/sql/engine/expr/ob_expr_like.h @@ -229,10 +229,6 @@ int ObExprLike::calc_with_non_instr_mode(T &result, } else if (text_val.length() <= 0 && pattern_val.length() <= 0) { // empty string result.set_int(1); - } else if (OB_UNLIKELY(CS_TYPE_UTF8MB4_BIN != coll_type && escape_wc == static_cast('%'))) { - // when cs_type is not utf8mb4_bin and escape = %, there is a bug of wildcmp - ret = OB_NOT_SUPPORTED; - LOG_USER_ERROR(OB_NOT_SUPPORTED, "escape %"); } else { bool b = ObCharset::wildcmp(coll_type, text_val, pattern_val, escape_wc, static_cast('_'), static_cast('%')); diff --git a/src/sql/engine/expr/ob_expr_replace.cpp b/src/sql/engine/expr/ob_expr_replace.cpp index 9b9323a05b..6c84a928dd 100644 --- a/src/sql/engine/expr/ob_expr_replace.cpp +++ b/src/sql/engine/expr/ob_expr_replace.cpp @@ -121,6 +121,7 @@ int ObExprReplace::calc_result_typeN(ObExprResType &type, } int ObExprReplace::replace(ObString &ret_str, + const ObCollationType cs_type, const ObString &text, const ObString &from, const ObString &to, @@ -139,74 +140,82 @@ int ObExprReplace::replace(ObString &ret_str, OB_UNLIKELY(from == to)) { ret_str = text; } else { - int64_t length_text = text.length(); - int64_t length_from = from.length(); - int64_t length_to = to.length(); - int64_t tot_length = 0;//total length for the result. - //locations is used to track the locations of 'from' in 'text' ObSEArray locations(common::ObModIds::OB_SQL_EXPR_REPLACE, common::OB_MALLOC_NORMAL_BLOCK_SIZE); - int64_t start_pos = 1;// the last parameter of locate starts from 1 NOT 0. - uint32_t index = 0; - int64_t count = 0; - while (OB_SUCC(ret)) { //while(1) will be better in terms of performance - index = ObCharset::locate(CS_TYPE_BINARY, text.ptr(), length_text, - from.ptr(), length_from, - start_pos); - if (0 != index && OB_SUCC(locations.push_back(index))) { - start_pos = index + length_from; + const char *buf_start = text.ptr(); + const char *buf_end = text.ptr() + text.length(); + const ObCharsetInfo *cs = NULL; + int error = 0; + if (OB_UNLIKELY(OB_ISNULL(cs = ObCharset::get_charset(cs_type)) || + OB_ISNULL(cs->cset))) { + ret = OB_ERR_UNEXPECTED; + LOG_ERROR("unexpected error. invalid argument(s)", K(cs_type)); + } + int32_t char_len = 0; + int32_t next_char_len = 0; + while (OB_SUCC(ret) && OB_LIKELY(error == 0) && buf_start + char_len < buf_end) { + char_len += static_cast(cs->cset->well_formed_len(cs, buf_start + char_len, buf_end, 1, &error)); + if (OB_UNLIKELY(0 != error)) { + bool is_null = false; + //mysql strict mode will return null, otherwise will return something + //so we should get session to acquire if is_strict mode here. + //we now set is_strict=false. + if (OB_FAIL(ObSQLUtils::check_well_formed_str(text, cs_type, ret_str, is_null, false, false))) { + LOG_WARN("check well formed str failed", K(ret)); + } + } else if (next_char_len == 0 && FALSE_IT(next_char_len = char_len)) { + } else if (char_len < from.length()) { + //do nothing + } else if (char_len > from.length()) { + buf_start += next_char_len; + char_len = 0; + next_char_len = 0; + } else if (0 == MEMCMP(buf_start, from.ptr(), char_len)) { + ret = locations.push_back(buf_start - text.ptr()); + buf_start += char_len; + char_len = 0; + next_char_len = 0; } else { - break; + buf_start += next_char_len; + char_len = 0; + next_char_len = 0; } } - if (OB_FAIL(ret)) { - LOG_WARN("push back failed", K(ret)); - } else if (0 == (count = locations.count())) { - //no 'from' at all. + int64_t tot_length = 0; + if (OB_UNLIKELY(error != 0)) { + } else if (OB_FAIL(ret)) { + ret_str.reset(); + } else if (locations.count() == 0) { ret_str = text; - } else if (OB_UNLIKELY((OB_MAX_VARCHAR_LENGTH - length_text) / count < (length_to - length_from))) { + } else if (OB_UNLIKELY((OB_MAX_VARCHAR_LENGTH - text.length()) / locations.count() < (to.length() - from.length()))) { ret = OB_ERR_VARCHAR_TOO_LONG; LOG_ERROR("Result of replace() was larger than OB_MAX_VARCHAR_LENGTH.", - K(length_text), K(length_from), K(length_to), K(OB_MAX_VARCHAR_LENGTH), K(ret)); + K(text.length()), K(to.length()), K(from.length()), K(OB_MAX_VARCHAR_LENGTH), K(ret)); + ret_str.reset(); + } else if (OB_UNLIKELY((tot_length = text.length() + (to.length() - from.length()) * locations.count()) <= 0)) { + // tot_length equals to 0 indicates that length_to is zero and "to" is empty string ret_str.reset(); } else { - // Avoid realloc - if (OB_UNLIKELY((tot_length = length_text + (length_to - length_from) * count) <= 0)) { - // tot_length equals to 0 indicates that length_to is zero and "to" is empty string - ret_str.reset(); - } else { - char *buf = static_cast(string_buf.alloc(tot_length)); - if (OB_ISNULL(buf)) { - ret = OB_ALLOCATE_MEMORY_FAILED; - LOG_ERROR("alloc memory failed.", K(tot_length), K(ret)); - } else { - // Core function - int64_t pos = 0; - const char * const text_ptr_start = text.ptr(); - const char * const text_ptr_end = text.ptr() + length_text; - const char *text_ptr_lower = text.ptr(); - const char *text_ptr_upper = text.ptr(); - const char *to_ptr = to.ptr(); - char *tmp_buf = buf; - for (int64_t i = 0; i < count; ++i) { - pos = locations.at(i); - text_ptr_upper = text_ptr_start + pos - 1; - MEMCPY(tmp_buf, text_ptr_lower, text_ptr_upper - text_ptr_lower); - tmp_buf += text_ptr_upper - text_ptr_lower; - text_ptr_lower = text_ptr_upper + length_from; - - MEMCPY(tmp_buf, to_ptr, length_to); - tmp_buf += length_to; - } - if (text_ptr_lower < text_ptr_end) { - //deal with the tail parts of text - //such as text="xxxxxxxABCxxxxxABC1234" and from="ABC" - //we should also copy the "1234" to destination - MEMCPY(tmp_buf, text_ptr_lower, text_ptr_end - text_ptr_lower); - } - ret_str.assign_ptr(buf, static_cast(tot_length)); - } + char *buf = static_cast(string_buf.alloc(tot_length)); + int pos = 0; + int text_pos = 0; + if (OB_ISNULL(buf)) { + ret = OB_ALLOCATE_MEMORY_FAILED; + LOG_ERROR("alloc memory failed.", K(tot_length), K(ret)); } + + for (int i = 0; OB_SUCC(ret) && i < locations.count(); i++) { + MEMCPY(buf + pos, text.ptr() + text_pos, locations.at(i) - text_pos); + pos += locations.at(i) - text_pos; + text_pos = locations.at(i); + MEMCPY(buf + pos, to.ptr(), to.length()); + pos += to.length(); + text_pos += from.length(); + } + if (OB_SUCC(ret) && text_pos < text.length()) { + MEMCPY(buf + pos, text.ptr() + text_pos, text.length() - text_pos); + } + ret_str.assign_ptr(buf, static_cast(tot_length)); } } return ret; @@ -241,6 +250,7 @@ int ObExprReplace::eval_replace(const ObExpr &expr, ObEvalCtx &ctx, ObDatum &exp expr_datum.set_datum(*text); } else if (!is_lob_res) { // non text tc inputs if (OB_FAIL(replace(res, + expr.args_[0]->datum_meta_.cs_type_, text->get_string(), !from->is_null() ? from->get_string() : ObString(), (NULL != to && !to->is_null()) ? to->get_string() : ObString(), @@ -275,7 +285,7 @@ int ObExprReplace::eval_replace(const ObExpr &expr, ObEvalCtx &ctx, ObDatum &exp LOG_WARN("failed to get string data", K(ret), K(expr.args_[2]->datum_meta_)); } if (OB_SUCC(ret)) { - if (OB_FAIL(replace(res, text_data, from_data, to_data, temp_allocator))) { + if (OB_FAIL(replace(res, expr.args_[0]->datum_meta_.cs_type_, text_data, from_data, to_data, temp_allocator))) { LOG_WARN("do replace for lob resutl failed", K(ret), K(expr.datum_meta_.type_)); } else if (OB_FAIL(ObTextStringHelper::string_to_templob_result(expr, ctx, expr_datum, res))) { LOG_WARN("set lob result failed", K(ret)); diff --git a/src/sql/engine/expr/ob_expr_replace.h b/src/sql/engine/expr/ob_expr_replace.h index 09c2c17f9a..ee0e2a0b2e 100644 --- a/src/sql/engine/expr/ob_expr_replace.h +++ b/src/sql/engine/expr/ob_expr_replace.h @@ -39,6 +39,7 @@ public: // helper func static int replace(common::ObString &result, + const ObCollationType cs_type, const common::ObString &text, const common::ObString &from, const common::ObString &to, diff --git a/src/sql/ob_sql_utils.cpp b/src/sql/ob_sql_utils.cpp index dd89cc1437..5baac65f49 100644 --- a/src/sql/ob_sql_utils.cpp +++ b/src/sql/ob_sql_utils.cpp @@ -1745,7 +1745,7 @@ int ObSQLUtils::check_well_formed_str(const ObString &src_str, } else { dst_str.assign_ptr(src_str.ptr(), static_cast(well_formed_length)); } - if (OB_SUCC(ret)) { + if (OB_SUCC(ret) && lib::is_mysql_mode()) { LOG_USER_WARN(OB_ERR_INVALID_CHARACTER_STRING, static_cast(charset_name_len), charset_name, static_cast(hex_len), hex_buf); @@ -3994,9 +3994,11 @@ int64_t ObSqlFatalErrExtraInfoGuard::to_string(char *buf, const int64_t buf_len) if (schema_obj.get_schema_type() == TABLE_SCHEMA) { ObSchemaGetterGuard schema_guard; ObSchemaPrinter schema_printer(schema_guard); + ObCharsetType charset_type = CHARSET_INVALID; + OZ (exec_ctx_->get_my_session()->get_character_set_results(charset_type)); OZ (GCTX.schema_service_->get_tenant_schema_guard(tenant_id_, schema_guard, schema_obj.version_)); OZ (databuff_printf(buf, buf_len, pos, (i != 0) ? ",\n\"" : "\n\"")); - OZ (schema_printer.print_table_definition(tenant_id_, schema_obj.get_object_id(), buf, buf_len, pos, NULL, LS_DEFAULT, false)); + OZ (schema_printer.print_table_definition(tenant_id_, schema_obj.get_object_id(), buf, buf_len, pos, NULL, LS_DEFAULT, false, charset_type)); OZ (databuff_printf(buf, buf_len, pos, "\"")); } }