bugfix: ignore conversion failure and create an empty file when there is no output data

This commit is contained in:
dontknow9179
2024-05-08 07:59:34 +00:00
committed by ob-robot
parent 4544105cb4
commit cb329ebdc6
6 changed files with 168 additions and 116 deletions

View File

@ -3066,6 +3066,20 @@ bool ObCharset::is_cs_unicode(ObCollationType collation_type)
return is_cs_unicode;
}
// Chooses the code point used to stand in for characters that cannot be
// converted into the charset of `collation_type`.
//
// @param collation_type         collation whose charset is being targeted
// @param replaced_char_unicode  [out] receives OB_CS_REPLACEMENT_CHARACTER when
//                               is_cs_unicode() holds, or '?' when the charset is
//                               not reported as non-ASCII; left untouched on error
// @return OB_SUCCESS when a replacement was chosen; OB_ERR_UNEXPECTED when the
//         collation is neither unicode nor ASCII-compatible
int ObCharset::get_replace_character(ObCollationType collation_type, int32_t &replaced_char_unicode)
{
  int ret = OB_SUCCESS;
  const bool unicode_charset = is_cs_unicode(collation_type);
  if (unicode_charset) {
    replaced_char_unicode = OB_CS_REPLACEMENT_CHARACTER;
  } else if (is_cs_nonascii(collation_type)) {
    // Non-unicode and non-ASCII: no replacement character is defined here.
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("get unexpected collation type", K(ret));
  } else {
    replaced_char_unicode = '?';
  }
  return ret;
}
bool ObCharset::is_cjk_charset(ObCollationType collation_type)
{
ObCharsetType cs_type = ObCharset::charset_type_by_coll(collation_type);

View File

@ -541,6 +541,7 @@ public:
static bool is_cs_nonascii(ObCollationType collation_type);
static bool is_cs_unicode(ObCollationType collation_type);
// Picks the replacement code point for characters that fail conversion into
// the charset of `collation_type`. On success `replaced_char_unicode` is set to
// OB_CS_REPLACEMENT_CHARACTER when is_cs_unicode(collation_type) is true, or to
// '?' when is_cs_nonascii(collation_type) is false. For any other collation the
// output is not written and OB_ERR_UNEXPECTED is returned.
static int get_replace_character(ObCollationType collation_type, int32_t &replaced_char_unicode);
static bool is_cjk_charset(ObCollationType collation_type);
static bool is_valid_connection_collation(ObCollationType collation_type);
static const char* get_oracle_charset_name_by_charset_type(ObCharsetType charset_type);

View File

@ -61,6 +61,8 @@ inline int ob_charset_char_len<CHARSET_UTF8MB4>(const unsigned char *s, const un
mb_len = 3;
} else if (c < 0xf8) {
mb_len = 4;
} else {
mb_len = 1; /* Illegal mb head */
}
if (s + mb_len > e) {
mb_len = OB_CS_TOOSMALL;
@ -195,6 +197,8 @@ inline int ob_charset_char_len<CHARSET_GB18030>(const unsigned char *s, const un
if (OB_LIKELY(s + 3 < e)) {
mb_len = 4;
}
} else {
mb_len = 1; /* Illegal low_c */
}
}
}
@ -614,6 +618,7 @@ public:
static int foreach_char_prototype(const ObString &str,
HANDLE_FUNC &func,
bool ignore_convert_failed = false,
bool stop_when_truncated = false,
int64_t *truncated_len = NULL)
{
int ret = OB_SUCCESS;
@ -621,20 +626,23 @@ public:
const char* end = str.ptr() + str.length();
int64_t step = 0;
ob_wc_t unicode = -1;
int32_t replace_wc = 0;
for (; OB_SUCC(ret) && begin < end; begin += step) {
if (DO_DECODE) {
step = ob_charset_decode_unicode<CS_TYPE>(pointer_cast<const unsigned char*>(begin), pointer_cast<const unsigned char*>(end), unicode);
} else {
step = ob_charset_char_len<CS_TYPE>(pointer_cast<const unsigned char*>(begin), pointer_cast<const unsigned char*>(end));
}
if (OB_UNLIKELY(step <= OB_CS_TOOSMALL)) {
ret = OB_ERR_DATA_TRUNCATED;
if (OB_NOT_NULL(truncated_len)) {
*truncated_len = end - begin;
}
} else if (OB_UNLIKELY(step <= 0)) {
if (ignore_convert_failed) {
if (OB_UNLIKELY(step <= 0)) {
if (ignore_convert_failed && !(stop_when_truncated && step <= OB_CS_TOOSMALL)) {
ret = OB_SUCCESS;
step = 1;
unicode = -1;
} else if (step <= OB_CS_TOOSMALL) {
ret = OB_ERR_DATA_TRUNCATED;
if (OB_NOT_NULL(truncated_len)) {
*truncated_len = end - begin;
}
} else {
ret = OB_ERR_INCORRECT_STRING_VALUE;
}
@ -662,44 +670,45 @@ public:
HANDLE_FUNC &func,
bool convert_unicode = true,
bool ignore_convert_failed = false,
bool stop_when_truncated = false,
int64_t *truncated_len = NULL)
{
int ret = OB_SUCCESS;
switch (cs_type) {
case CHARSET_UTF8MB4:
ret = convert_unicode ?
foreach_char_prototype<CHARSET_UTF8MB4, HANDLE_FUNC, true>(str, func, ignore_convert_failed, truncated_len)
: foreach_char_prototype<CHARSET_UTF8MB4, HANDLE_FUNC, false>(str, func, ignore_convert_failed, truncated_len);
foreach_char_prototype<CHARSET_UTF8MB4, HANDLE_FUNC, true>(str, func, ignore_convert_failed, stop_when_truncated, truncated_len)
: foreach_char_prototype<CHARSET_UTF8MB4, HANDLE_FUNC, false>(str, func, ignore_convert_failed, stop_when_truncated, truncated_len);
break;
case CHARSET_GBK:
ret = convert_unicode ?
foreach_char_prototype<CHARSET_GBK, HANDLE_FUNC, true>(str, func, ignore_convert_failed, truncated_len)
: foreach_char_prototype<CHARSET_GBK, HANDLE_FUNC, false>(str, func, ignore_convert_failed, truncated_len);
foreach_char_prototype<CHARSET_GBK, HANDLE_FUNC, true>(str, func, ignore_convert_failed, stop_when_truncated, truncated_len)
: foreach_char_prototype<CHARSET_GBK, HANDLE_FUNC, false>(str, func, ignore_convert_failed, stop_when_truncated, truncated_len);
break;
case CHARSET_GB18030:
ret = convert_unicode ?
foreach_char_prototype<CHARSET_GB18030, HANDLE_FUNC, true>(str, func, ignore_convert_failed, truncated_len)
: foreach_char_prototype<CHARSET_GB18030, HANDLE_FUNC, false>(str, func, ignore_convert_failed, truncated_len);
foreach_char_prototype<CHARSET_GB18030, HANDLE_FUNC, true>(str, func, ignore_convert_failed, stop_when_truncated, truncated_len)
: foreach_char_prototype<CHARSET_GB18030, HANDLE_FUNC, false>(str, func, ignore_convert_failed, stop_when_truncated, truncated_len);
break;
case CHARSET_GB18030_2022:
ret = convert_unicode ?
foreach_char_prototype<CHARSET_GB18030_2022, HANDLE_FUNC, true>(str, func, ignore_convert_failed, truncated_len)
: foreach_char_prototype<CHARSET_GB18030, HANDLE_FUNC, false>(str, func, ignore_convert_failed, truncated_len);
foreach_char_prototype<CHARSET_GB18030_2022, HANDLE_FUNC, true>(str, func, ignore_convert_failed, stop_when_truncated, truncated_len)
: foreach_char_prototype<CHARSET_GB18030, HANDLE_FUNC, false>(str, func, ignore_convert_failed, stop_when_truncated, truncated_len);
break;
case CHARSET_UTF16:
ret = convert_unicode ?
foreach_char_prototype<CHARSET_UTF16, HANDLE_FUNC, true>(str, func, ignore_convert_failed, truncated_len)
: foreach_char_prototype<CHARSET_UTF16, HANDLE_FUNC, false>(str, func, ignore_convert_failed, truncated_len);
foreach_char_prototype<CHARSET_UTF16, HANDLE_FUNC, true>(str, func, ignore_convert_failed, stop_when_truncated, truncated_len)
: foreach_char_prototype<CHARSET_UTF16, HANDLE_FUNC, false>(str, func, ignore_convert_failed, stop_when_truncated, truncated_len);
break;
case CHARSET_LATIN1:
ret = convert_unicode ?
foreach_char_prototype<CHARSET_LATIN1, HANDLE_FUNC, true>(str, func, ignore_convert_failed, truncated_len)
: foreach_char_prototype<CHARSET_LATIN1, HANDLE_FUNC, false>(str, func, ignore_convert_failed, truncated_len);
foreach_char_prototype<CHARSET_LATIN1, HANDLE_FUNC, true>(str, func, ignore_convert_failed, stop_when_truncated, truncated_len)
: foreach_char_prototype<CHARSET_LATIN1, HANDLE_FUNC, false>(str, func, ignore_convert_failed, stop_when_truncated, truncated_len);
break;
case CHARSET_BINARY:
ret = convert_unicode ?
foreach_char_prototype<CHARSET_BINARY, HANDLE_FUNC, true>(str, func, ignore_convert_failed, truncated_len)
: foreach_char_prototype<CHARSET_BINARY, HANDLE_FUNC, false>(str, func, ignore_convert_failed, truncated_len);
foreach_char_prototype<CHARSET_BINARY, HANDLE_FUNC, true>(str, func, ignore_convert_failed, stop_when_truncated, truncated_len)
: foreach_char_prototype<CHARSET_BINARY, HANDLE_FUNC, false>(str, func, ignore_convert_failed, stop_when_truncated, truncated_len);
break;
default:
ret = OB_ERR_UNEXPECTED;
@ -744,30 +753,31 @@ public:
ObCharsetType in_cs_type = ObCharset::charset_type_by_coll(src_coll_type);
ObCharsetType out_cs_type = ObCharset::charset_type_by_coll(out_coll_type);
int64_t truncated_len = 0;
bool stop_when_truncated = false;
switch (out_cs_type) {
case CHARSET_UTF8MB4: {
Encoder<CHARSET_UTF8MB4> encoder(buf, buf_len, pos, replaced_char);
ret = foreach_char(str, in_cs_type, encoder, true, !report_error, &truncated_len);
ret = foreach_char(str, in_cs_type, encoder, true, !report_error, stop_when_truncated, &truncated_len);
break;
}
case CHARSET_GBK: {
Encoder<CHARSET_GBK> encoder(buf, buf_len, pos, replaced_char);
ret = foreach_char(str, in_cs_type, encoder, true, !report_error, &truncated_len);
ret = foreach_char(str, in_cs_type, encoder, true, !report_error, stop_when_truncated, &truncated_len);
break;
}
case CHARSET_GB18030: {
Encoder<CHARSET_GB18030> encoder(buf, buf_len, pos, replaced_char);
ret = foreach_char(str, in_cs_type, encoder, true, !report_error, &truncated_len);
ret = foreach_char(str, in_cs_type, encoder, true, !report_error, stop_when_truncated, &truncated_len);
break;
}
case CHARSET_GB18030_2022: {
Encoder<CHARSET_GB18030_2022> encoder(buf, buf_len, pos, replaced_char);
ret = foreach_char(str, in_cs_type, encoder, true, !report_error, &truncated_len);
ret = foreach_char(str, in_cs_type, encoder, true, !report_error, stop_when_truncated, &truncated_len);
break;
}
case CHARSET_UTF16: {
Encoder<CHARSET_UTF16> encoder(buf, buf_len, pos, replaced_char);
ret = foreach_char(str, in_cs_type, encoder, true, !report_error, &truncated_len);
ret = foreach_char(str, in_cs_type, encoder, true, !report_error, stop_when_truncated, &truncated_len);
break;
}
default: {