3185 lines
115 KiB
C++
3185 lines
115 KiB
C++
/**
 * Copyright (c) 2021 OceanBase
 * OceanBase CE is licensed under Mulan PubL v2.
 * You can use this software according to the terms and conditions of the Mulan PubL v2.
 * You may obtain a copy of Mulan PubL v2 at:
 *          http://license.coscl.org.cn/MulanPubL-2.0
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PubL v2 for more details.
 */
|
|
|
|
#define USING_LOG_PREFIX LIB_CHARSET
|
|
#include "lib/charset/ob_charset.h"
|
|
#include "lib/utility/serialization.h"
|
|
#include "lib/ob_define.h"
|
|
#include "lib/worker.h"
|
|
#include "common/ob_common_utility.h"
|
|
|
|
namespace oceanbase
|
|
{
|
|
namespace common
|
|
{
|
|
|
|
// BEGIN displayed length {{{1
|
|
// ref: https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
|
|
|
|
// One closed range [first, last] of code points; the lookup tables below are
// sorted arrays of these ranges.
struct interval {
  int first;  // lowest code point covered by the range (inclusive)
  int last;   // highest code point covered by the range (inclusive)
};
|
|
|
|
/* auxiliary function for binary search in interval table */
|
|
static int bisearch(ob_wc_t ucs, const struct interval *table, int max) {
|
|
int min = 0;
|
|
int mid;
|
|
|
|
if (ucs < table[0].first || ucs > table[max].last)
|
|
return 0;
|
|
while (max >= min) {
|
|
mid = (min + max) / 2;
|
|
if (ucs > table[mid].last)
|
|
min = mid + 1;
|
|
else if (ucs < table[mid].first)
|
|
max = mid - 1;
|
|
else
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
/* The following two functions define the column width of an ISO 10646
 * character as follows:
 *
 *    - The null character (U+0000) has a column width of 0.
 *
 *    - Other C0/C1 control characters and DEL will lead to a return
 *      value of -1.
 *
 *    - Non-spacing and enclosing combining characters (general
 *      category code Mn or Me in the Unicode database) have a
 *      column width of 0.
 *
 *    - SOFT HYPHEN (U+00AD) has a column width of 1.
 *
 *    - Other format characters (general category code Cf in the Unicode
 *      database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
 *
 *    - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
 *      have a column width of 0.
 *
 *    - Spacing characters in the East Asian Wide (W) or East Asian
 *      Full-width (F) category as defined in Unicode Technical
 *      Report #11 have a column width of 2.
 *
 *    - All remaining characters (including all printable
 *      ISO 8859-1 and WGL4 characters, Unicode control characters,
 *      etc.) have a column width of 1.
 *
 * This implementation assumes that wchar_t characters are encoded
 * in ISO 10646.
 */
|
|
|
|
int mk_wcwidth(ob_wc_t ucs)
{
  /* Returns the number of display columns for one code point:
   *   0  for U+0000 and for every code point listed in `combining`,
   *  -1  for the remaining C0/C1 control characters and DEL,
   *   2  for the wide / full-width ranges tested at the end,
   *   1  for everything else. */

  /* sorted list of non-overlapping intervals of non-spacing characters */
  /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
  /* The two entries 0x302A-0x302F and 0x3099-0x309A are commented out in this
   * copy, so those code points are NOT reported as zero-width here. */
  static const struct interval combining[] = {
    { 0x0300, 0x036F }, { 0x0483, 0x0486 }, { 0x0488, 0x0489 },
    { 0x0591, 0x05BD }, { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 },
    { 0x05C4, 0x05C5 }, { 0x05C7, 0x05C7 }, { 0x0600, 0x0603 },
    { 0x0610, 0x0615 }, { 0x064B, 0x065E }, { 0x0670, 0x0670 },
    { 0x06D6, 0x06E4 }, { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED },
    { 0x070F, 0x070F }, { 0x0711, 0x0711 }, { 0x0730, 0x074A },
    { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 }, { 0x0901, 0x0902 },
    { 0x093C, 0x093C }, { 0x0941, 0x0948 }, { 0x094D, 0x094D },
    { 0x0951, 0x0954 }, { 0x0962, 0x0963 }, { 0x0981, 0x0981 },
    { 0x09BC, 0x09BC }, { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD },
    { 0x09E2, 0x09E3 }, { 0x0A01, 0x0A02 }, { 0x0A3C, 0x0A3C },
    { 0x0A41, 0x0A42 }, { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D },
    { 0x0A70, 0x0A71 }, { 0x0A81, 0x0A82 }, { 0x0ABC, 0x0ABC },
    { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 }, { 0x0ACD, 0x0ACD },
    { 0x0AE2, 0x0AE3 }, { 0x0B01, 0x0B01 }, { 0x0B3C, 0x0B3C },
    { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B43 }, { 0x0B4D, 0x0B4D },
    { 0x0B56, 0x0B56 }, { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 },
    { 0x0BCD, 0x0BCD }, { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 },
    { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 }, { 0x0CBC, 0x0CBC },
    { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 }, { 0x0CCC, 0x0CCD },
    { 0x0CE2, 0x0CE3 }, { 0x0D41, 0x0D43 }, { 0x0D4D, 0x0D4D },
    { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 }, { 0x0DD6, 0x0DD6 },
    { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A }, { 0x0E47, 0x0E4E },
    { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 }, { 0x0EBB, 0x0EBC },
    { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 }, { 0x0F35, 0x0F35 },
    { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 }, { 0x0F71, 0x0F7E },
    { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 }, { 0x0F90, 0x0F97 },
    { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 }, { 0x102D, 0x1030 },
    { 0x1032, 0x1032 }, { 0x1036, 0x1037 }, { 0x1039, 0x1039 },
    { 0x1058, 0x1059 }, { 0x1160, 0x11FF }, { 0x135F, 0x135F },
    { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 0x1752, 0x1753 },
    { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD },
    { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD },
    { 0x180B, 0x180D }, { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 },
    { 0x1927, 0x1928 }, { 0x1932, 0x1932 }, { 0x1939, 0x193B },
    { 0x1A17, 0x1A18 }, { 0x1B00, 0x1B03 }, { 0x1B34, 0x1B34 },
    { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C }, { 0x1B42, 0x1B42 },
    { 0x1B6B, 0x1B73 }, { 0x1DC0, 0x1DCA }, { 0x1DFE, 0x1DFF },
    { 0x200B, 0x200F }, { 0x202A, 0x202E }, { 0x2060, 0x2063 },
    { 0x206A, 0x206F }, { 0x20D0, 0x20EF }, /*{ 0x302A, 0x302F },*/
    /*{ 0x3099, 0x309A },*/ { 0xA806, 0xA806 }, { 0xA80B, 0xA80B },
    { 0xA825, 0xA826 }, { 0xFB1E, 0xFB1E }, { 0xFE00, 0xFE0F },
    { 0xFE20, 0xFE23 }, { 0xFEFF, 0xFEFF }, { 0xFFF9, 0xFFFB },
    { 0x10A01, 0x10A03 }, { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F },
    { 0x10A38, 0x10A3A }, { 0x10A3F, 0x10A3F }, { 0x1D167, 0x1D169 },
    { 0x1D173, 0x1D182 }, { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD },
    { 0x1D242, 0x1D244 }, { 0xE0001, 0xE0001 }, { 0xE0020, 0xE007F },
    { 0xE0100, 0xE01EF }
  };

  /* NUL occupies no column */
  if (ucs == 0) {
    return 0;
  }

  /* remaining 8-bit control characters (C0, DEL, C1) have no defined width */
  if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0)) {
    return -1;
  }

  /* zero-width when listed in the table of non-spacing characters */
  if (0 != bisearch(ucs, combining,
                    sizeof(combining) / sizeof(struct interval) - 1)) {
    return 0;
  }

  /* from here on ucs is neither a combining nor a C0/C1 control character;
   * decide between a single and a double column */
  const bool is_wide =
    (ucs >= 0x1100 &&
     (ucs <= 0x115f ||                    /* Hangul Jamo init. consonants */
      ucs == 0x2329 || ucs == 0x232a ||
      (ucs >= 0x2e80 && ucs <= 0xa4cf &&
       ucs != 0x303f) ||                  /* CJK ... Yi */
      (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
      (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility Ideographs */
      (ucs >= 0xfe10 && ucs <= 0xfe19) || /* Vertical forms */
      (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
      (ucs >= 0xff00 && ucs <= 0xff60) || /* Fullwidth Forms */
      (ucs >= 0xffe0 && ucs <= 0xffe6) ||
      (ucs >= 0x20000 && ucs <= 0x2fffd) ||
      (ucs >= 0x30000 && ucs <= 0x3fffd)));

  return is_wide ? 2 : 1;
}
|
|
|
|
int mk_wcswidth(const wchar_t *pwcs, size_t n)
|
|
{
|
|
int w, width = 0;
|
|
|
|
for (;*pwcs && n-- > 0; pwcs++)
|
|
if ((w = mk_wcwidth(*pwcs)) < 0)
|
|
return -1;
|
|
else
|
|
width += w;
|
|
|
|
return width;
|
|
}
|
|
|
|
|
|
/*
 * The following functions are the same as mk_wcwidth() and
 * mk_wcswidth(), except that spacing characters in the East Asian
 * Ambiguous (A) category as defined in Unicode Technical Report #11
 * have a column width of 2. This variant might be useful for users of
 * CJK legacy encodings who want to migrate to UCS without changing
 * the traditional terminal character-width behaviour. It is not
 * otherwise recommended for general use.
 */
|
|
int mk_wcwidth_cjk(wchar_t ucs)
|
|
{
|
|
/* sorted list of non-overlapping intervals of East Asian Ambiguous
|
|
* characters, generated by "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c" */
|
|
static const struct interval ambiguous[] = {
|
|
{ 0x00A1, 0x00A1 }, { 0x00A4, 0x00A4 }, { 0x00A7, 0x00A8 },
|
|
{ 0x00AA, 0x00AA }, { 0x00AE, 0x00AE }, { 0x00B0, 0x00B4 },
|
|
{ 0x00B6, 0x00BA }, { 0x00BC, 0x00BF }, { 0x00C6, 0x00C6 },
|
|
{ 0x00D0, 0x00D0 }, { 0x00D7, 0x00D8 }, { 0x00DE, 0x00E1 },
|
|
{ 0x00E6, 0x00E6 }, { 0x00E8, 0x00EA }, { 0x00EC, 0x00ED },
|
|
{ 0x00F0, 0x00F0 }, { 0x00F2, 0x00F3 }, { 0x00F7, 0x00FA },
|
|
{ 0x00FC, 0x00FC }, { 0x00FE, 0x00FE }, { 0x0101, 0x0101 },
|
|
{ 0x0111, 0x0111 }, { 0x0113, 0x0113 }, { 0x011B, 0x011B },
|
|
{ 0x0126, 0x0127 }, { 0x012B, 0x012B }, { 0x0131, 0x0133 },
|
|
{ 0x0138, 0x0138 }, { 0x013F, 0x0142 }, { 0x0144, 0x0144 },
|
|
{ 0x0148, 0x014B }, { 0x014D, 0x014D }, { 0x0152, 0x0153 },
|
|
{ 0x0166, 0x0167 }, { 0x016B, 0x016B }, { 0x01CE, 0x01CE },
|
|
{ 0x01D0, 0x01D0 }, { 0x01D2, 0x01D2 }, { 0x01D4, 0x01D4 },
|
|
{ 0x01D6, 0x01D6 }, { 0x01D8, 0x01D8 }, { 0x01DA, 0x01DA },
|
|
{ 0x01DC, 0x01DC }, { 0x0251, 0x0251 }, { 0x0261, 0x0261 },
|
|
{ 0x02C4, 0x02C4 }, { 0x02C7, 0x02C7 }, { 0x02C9, 0x02CB },
|
|
{ 0x02CD, 0x02CD }, { 0x02D0, 0x02D0 }, { 0x02D8, 0x02DB },
|
|
{ 0x02DD, 0x02DD }, { 0x02DF, 0x02DF }, { 0x0391, 0x03A1 },
|
|
{ 0x03A3, 0x03A9 }, { 0x03B1, 0x03C1 }, { 0x03C3, 0x03C9 },
|
|
{ 0x0401, 0x0401 }, { 0x0410, 0x044F }, { 0x0451, 0x0451 },
|
|
{ 0x2010, 0x2010 }, { 0x2013, 0x2016 }, { 0x2018, 0x2019 },
|
|
{ 0x201C, 0x201D }, { 0x2020, 0x2022 }, { 0x2024, 0x2027 },
|
|
{ 0x2030, 0x2030 }, { 0x2032, 0x2033 }, { 0x2035, 0x2035 },
|
|
{ 0x203B, 0x203B }, { 0x203E, 0x203E }, { 0x2074, 0x2074 },
|
|
{ 0x207F, 0x207F }, { 0x2081, 0x2084 }, { 0x20AC, 0x20AC },
|
|
{ 0x2103, 0x2103 }, { 0x2105, 0x2105 }, { 0x2109, 0x2109 },
|
|
{ 0x2113, 0x2113 }, { 0x2116, 0x2116 }, { 0x2121, 0x2122 },
|
|
{ 0x2126, 0x2126 }, { 0x212B, 0x212B }, { 0x2153, 0x2154 },
|
|
{ 0x215B, 0x215E }, { 0x2160, 0x216B }, { 0x2170, 0x2179 },
|
|
{ 0x2190, 0x2199 }, { 0x21B8, 0x21B9 }, { 0x21D2, 0x21D2 },
|
|
{ 0x21D4, 0x21D4 }, { 0x21E7, 0x21E7 }, { 0x2200, 0x2200 },
|
|
{ 0x2202, 0x2203 }, { 0x2207, 0x2208 }, { 0x220B, 0x220B },
|
|
{ 0x220F, 0x220F }, { 0x2211, 0x2211 }, { 0x2215, 0x2215 },
|
|
{ 0x221A, 0x221A }, { 0x221D, 0x2220 }, { 0x2223, 0x2223 },
|
|
{ 0x2225, 0x2225 }, { 0x2227, 0x222C }, { 0x222E, 0x222E },
|
|
{ 0x2234, 0x2237 }, { 0x223C, 0x223D }, { 0x2248, 0x2248 },
|
|
{ 0x224C, 0x224C }, { 0x2252, 0x2252 }, { 0x2260, 0x2261 },
|
|
{ 0x2264, 0x2267 }, { 0x226A, 0x226B }, { 0x226E, 0x226F },
|
|
{ 0x2282, 0x2283 }, { 0x2286, 0x2287 }, { 0x2295, 0x2295 },
|
|
{ 0x2299, 0x2299 }, { 0x22A5, 0x22A5 }, { 0x22BF, 0x22BF },
|
|
{ 0x2312, 0x2312 }, { 0x2460, 0x24E9 }, { 0x24EB, 0x254B },
|
|
{ 0x2550, 0x2573 }, { 0x2580, 0x258F }, { 0x2592, 0x2595 },
|
|
{ 0x25A0, 0x25A1 }, { 0x25A3, 0x25A9 }, { 0x25B2, 0x25B3 },
|
|
{ 0x25B6, 0x25B7 }, { 0x25BC, 0x25BD }, { 0x25C0, 0x25C1 },
|
|
{ 0x25C6, 0x25C8 }, { 0x25CB, 0x25CB }, { 0x25CE, 0x25D1 },
|
|
{ 0x25E2, 0x25E5 }, { 0x25EF, 0x25EF }, { 0x2605, 0x2606 },
|
|
{ 0x2609, 0x2609 }, { 0x260E, 0x260F }, { 0x2614, 0x2615 },
|
|
{ 0x261C, 0x261C }, { 0x261E, 0x261E }, { 0x2640, 0x2640 },
|
|
{ 0x2642, 0x2642 }, { 0x2660, 0x2661 }, { 0x2663, 0x2665 },
|
|
{ 0x2667, 0x266A }, { 0x266C, 0x266D }, { 0x266F, 0x266F },
|
|
{ 0x273D, 0x273D }, { 0x2776, 0x277F }, { 0xE000, 0xF8FF },
|
|
{ 0xFFFD, 0xFFFD }, { 0xF0000, 0xFFFFD }, { 0x100000, 0x10FFFD }
|
|
};
|
|
|
|
/* binary search in table of non-spacing characters */
|
|
if (bisearch(ucs, ambiguous,
|
|
sizeof(ambiguous) / sizeof(struct interval) - 1))
|
|
return 2;
|
|
|
|
return mk_wcwidth(ucs);
|
|
}
|
|
|
|
|
|
int mk_wcswidth_cjk(const wchar_t *pwcs, size_t n)
|
|
{
|
|
int w, width = 0;
|
|
|
|
for (;*pwcs && n-- > 0; pwcs++)
|
|
if ((w = mk_wcwidth_cjk(*pwcs)) < 0)
|
|
return -1;
|
|
else
|
|
width += w;
|
|
|
|
return width;
|
|
}
|
|
|
|
// END displayed length }}}
|
|
|
|
// Table of the character sets this build exposes, one row per charset.
// NOTE(review): columns look like {charset, description, default collation,
// max bytes per character} -- confirm against the ObCharsetWrapper declaration.
const ObCharsetWrapper ObCharset::charset_wrap_arr_[ObCharset::VALID_CHARSET_TYPES] =
{
  {CHARSET_BINARY,       "Binary pseudo charset", CS_TYPE_BINARY,                 1},
  {CHARSET_UTF8MB4,      "UTF-8 Unicode",         CS_TYPE_UTF8MB4_GENERAL_CI,     4},
  {CHARSET_GBK,          "GBK charset",           CS_TYPE_GBK_CHINESE_CI,         2},
  {CHARSET_UTF16,        "UTF-16 Unicode",        CS_TYPE_UTF16_GENERAL_CI,       2},
  {CHARSET_GB18030,      "GB18030 charset",       CS_TYPE_GB18030_CHINESE_CI,     4},
  {CHARSET_LATIN1,       "cp1252 West European",  CS_TYPE_LATIN1_SWEDISH_CI,      1},
  {CHARSET_GB18030_2022, "GB18030-2022 charset",  CS_TYPE_GB18030_2022_PINYIN_CI, 4},
};
|
|
|
|
// Table of the collations this build exposes, one row per collation.
// The first and third columns hold the same collation constant in every row;
// two rows are CS_TYPE_INVALID placeholders.
// NOTE(review): the meaning of the two bool columns and the trailing integer
// is not visible here -- confirm against the ObCollationWrapper declaration.
const ObCollationWrapper ObCharset::collation_wrap_arr_[ObCharset::VALID_COLLATION_TYPES] =
{
  {CS_TYPE_UTF8MB4_GENERAL_CI,      CHARSET_UTF8MB4,      CS_TYPE_UTF8MB4_GENERAL_CI,      true,  true,  1},
  {CS_TYPE_UTF8MB4_BIN,             CHARSET_UTF8MB4,      CS_TYPE_UTF8MB4_BIN,             false, true,  1},
  {CS_TYPE_BINARY,                  CHARSET_BINARY,       CS_TYPE_BINARY,                  true,  true,  1},
  {CS_TYPE_GBK_CHINESE_CI,          CHARSET_GBK,          CS_TYPE_GBK_CHINESE_CI,          true,  true,  1},
  {CS_TYPE_GBK_BIN,                 CHARSET_GBK,          CS_TYPE_GBK_BIN,                 false, true,  1},
  {CS_TYPE_UTF16_GENERAL_CI,        CHARSET_UTF16,        CS_TYPE_UTF16_GENERAL_CI,        true,  true,  1},
  {CS_TYPE_UTF16_BIN,               CHARSET_UTF16,        CS_TYPE_UTF16_BIN,               false, true,  1},
  {CS_TYPE_INVALID,                 CHARSET_INVALID,      CS_TYPE_INVALID,                 false, false, 1},
  {CS_TYPE_INVALID,                 CHARSET_INVALID,      CS_TYPE_INVALID,                 false, false, 1},
  {CS_TYPE_GB18030_CHINESE_CI,      CHARSET_GB18030,      CS_TYPE_GB18030_CHINESE_CI,      true,  true,  1},
  {CS_TYPE_GB18030_BIN,             CHARSET_GB18030,      CS_TYPE_GB18030_BIN,             false, true,  1},
  {CS_TYPE_LATIN1_SWEDISH_CI,       CHARSET_LATIN1,       CS_TYPE_LATIN1_SWEDISH_CI,       true,  true,  1},
  {CS_TYPE_LATIN1_BIN,              CHARSET_LATIN1,       CS_TYPE_LATIN1_BIN,              false, true,  1},
  {CS_TYPE_GB18030_2022_BIN,        CHARSET_GB18030_2022, CS_TYPE_GB18030_2022_BIN,        false, true,  1},
  {CS_TYPE_GB18030_2022_PINYIN_CI,  CHARSET_GB18030_2022, CS_TYPE_GB18030_2022_PINYIN_CI,  true,  true,  1},
  {CS_TYPE_GB18030_2022_PINYIN_CS,  CHARSET_GB18030_2022, CS_TYPE_GB18030_2022_PINYIN_CS,  false, true,  1},
  {CS_TYPE_GB18030_2022_RADICAL_CI, CHARSET_GB18030_2022, CS_TYPE_GB18030_2022_RADICAL_CI, false, true,  1},
  {CS_TYPE_GB18030_2022_RADICAL_CS, CHARSET_GB18030_2022, CS_TYPE_GB18030_2022_RADICAL_CS, false, true,  1},
  {CS_TYPE_GB18030_2022_STROKE_CI,  CHARSET_GB18030_2022, CS_TYPE_GB18030_2022_STROKE_CI,  false, true,  1},
  {CS_TYPE_GB18030_2022_STROKE_CS,  CHARSET_GB18030_2022, CS_TYPE_GB18030_2022_STROKE_CS,  false, true,  1},
};
|
|
|
|
// Lookup table from collation id to its charset descriptor. The functions in
// this file index it directly with an ObCollationType value; ids without a
// descriptor hold NULL. The trailing comment on each row gives the index
// (or index range) that the row occupies.
ObCharsetInfo *ObCharset::charset_arr[CS_TYPE_MAX] = {
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 0 ~ 7
  &ob_charset_latin1,                                             // 8
  NULL, NULL, NULL, NULL, NULL, NULL, NULL,                       // 9 ~ 15
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 16 ~ 23
  NULL, NULL, NULL, NULL,                                         // 24 ~ 27
  &ob_charset_gbk_chinese_ci,                                     // 28
  NULL, NULL, NULL,                                               // 29 ~ 31
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 32 ~ 39
  NULL, NULL, NULL, NULL, NULL,                                   // 40 ~ 44
  &ob_charset_utf8mb4_general_ci,                                 // 45
  &ob_charset_utf8mb4_bin,                                        // 46
  &ob_charset_latin1_bin,                                         // 47
  NULL, NULL, NULL, NULL, NULL, NULL,                             // 48 ~ 53
  &ob_charset_utf16_general_ci,                                   // 54
  &ob_charset_utf16_bin,                                          // 55
  NULL, NULL, NULL, NULL, NULL, NULL, NULL,                       // 56 ~ 62
  &ob_charset_bin,                                                // 63
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 64 ~ 71
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 72 ~ 79
  NULL, NULL, NULL, NULL, NULL, NULL, NULL,                       // 80 ~ 86
  &ob_charset_gbk_bin,                                            // 87
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 88 ~ 95
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 96 ~ 103
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 104 ~ 111
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 112 ~ 119
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 120 ~ 127
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 128 ~ 135
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 136 ~ 143
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 144 ~ 151
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 152 ~ 159
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 160 ~ 167
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 168 ~ 175
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 176 ~ 183
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 184 ~ 191
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 192 ~ 199
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 200 ~ 207
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 208 ~ 215
  &ob_charset_gb18030_2022_bin,                                   // 216
  &ob_charset_gb18030_2022_pinyin_ci,                             // 217
  &ob_charset_gb18030_2022_pinyin_cs,                             // 218
  &ob_charset_gb18030_2022_radical_ci,                            // 219
  &ob_charset_gb18030_2022_radical_cs,                            // 220
  &ob_charset_gb18030_2022_stroke_ci,                             // 221
  &ob_charset_gb18030_2022_stroke_cs,                             // 222
  NULL, NULL,                                                     // 223 ~ 224
  NULL, NULL, NULL, NULL, NULL, NULL, NULL,                       // 225 ~ 231
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 232 ~ 239
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 240 ~ 247
  &ob_charset_gb18030_chinese_ci,                                 // 248
  &ob_charset_gb18030_bin,                                        // 249
  NULL,                                                           // 250
  &ob_charset_gb18030_chinese_cs,                                 // 251
  NULL, NULL, NULL, NULL,                                         // 252 ~ 255
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 256 ~ 263
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,                 // 264 ~ 271
  NULL                                                            // 272
};
|
|
|
|
double ObCharset::strntodv2(const char *str,
|
|
size_t str_len,
|
|
char **endptr,
|
|
int *err)
|
|
{
|
|
double result = 0.0;
|
|
if (lib::is_oracle_mode()) {
|
|
ObString str_orig(str_len, str);
|
|
ObString str_trim = str_orig.trim();
|
|
if ((str_trim.case_compare("NAN") == 0)
|
|
|| (str_trim.case_compare("-NAN") == 0)
|
|
|| (str_trim.case_compare("+NAN") == 0)) {
|
|
result = NAN;
|
|
*endptr = str_trim.ptr() + str_trim.length();
|
|
} else if ((str_trim.case_compare("+INFINITY") == 0)
|
|
|| (str_trim.case_compare("INFINITY") == 0)
|
|
|| (str_trim.case_compare("INF") == 0)) {
|
|
result = INFINITY;
|
|
*endptr = str_trim.ptr() + str_trim.length();
|
|
} else if ((str_trim.case_compare("-INFINITY") == 0)
|
|
|| (str_trim.case_compare("-INF") == 0)) {
|
|
result = -INFINITY;
|
|
*endptr = str_trim.ptr() + str_trim.length();
|
|
} else {
|
|
result = strntod(str, str_len, endptr, err);
|
|
}
|
|
} else {
|
|
result = strntod(str, str_len, endptr, err);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
double ObCharset::strntod(const char *str,
|
|
size_t str_len,
|
|
char **endptr,
|
|
int *err)
|
|
{
|
|
ObCharsetInfo *cs = &ob_charset_bin;
|
|
double result = 0.0;
|
|
if (is_argument_valid(cs, str, str_len)) {
|
|
result = cs->cset->strntod(cs, const_cast<char *>(str), str_len, endptr, err);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
int64_t ObCharset::strntoll(const char *str,
|
|
size_t str_len,
|
|
int base,
|
|
char **end_ptr,
|
|
int *err)
|
|
{
|
|
ObCharsetInfo *cs = &ob_charset_bin;
|
|
*end_ptr = const_cast<char*>(str);
|
|
int64_t result = 0;
|
|
if (is_argument_valid(cs, str, str_len)) {
|
|
result = cs->cset->strntoll(cs, str, str_len, base, end_ptr, err);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
uint64_t ObCharset::strntoull(const char *str,
|
|
size_t str_len,
|
|
int base,
|
|
char **end_ptr,
|
|
int *err)
|
|
{
|
|
ObCharsetInfo *cs = &ob_charset_bin;
|
|
*end_ptr = const_cast<char*>(str);
|
|
uint64_t result = 0;
|
|
if (is_argument_valid(cs, str, str_len)) {
|
|
result = cs->cset->strntoull(cs,
|
|
str,
|
|
str_len,
|
|
base,
|
|
end_ptr,
|
|
err);
|
|
}
|
|
return result;
|
|
}
|
|
int64_t ObCharset::strntoll(const char *str,
|
|
size_t str_len,
|
|
int base,
|
|
int *err)
|
|
{
|
|
ObCharsetInfo *cs = &ob_charset_bin;
|
|
char *end_ptr = NULL;
|
|
int64_t result = 0;
|
|
if (is_argument_valid(cs, str, str_len)) {
|
|
result = cs->cset->strntoll(cs, str, str_len, base, &end_ptr, err);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
uint64_t ObCharset::strntoull(const char *str,
|
|
size_t str_len,
|
|
int base,
|
|
int *err)
|
|
{
|
|
ObCharsetInfo *cs = &ob_charset_bin;
|
|
char *end_ptr = NULL;
|
|
uint64_t result = 0;
|
|
if (is_argument_valid(cs, str, str_len)) {
|
|
result = cs->cset->strntoull(cs,
|
|
str,
|
|
str_len,
|
|
base,
|
|
&end_ptr,
|
|
err);
|
|
}
|
|
return result;
|
|
}
|
|
uint64_t ObCharset::strntoullrnd(const char *str,
|
|
size_t str_len,
|
|
int unsigned_fl,
|
|
char **endptr,
|
|
int *err)
|
|
{
|
|
ObCharsetInfo *cs = &ob_charset_bin;
|
|
uint64_t result = 0;
|
|
if (is_argument_valid(cs, str, str_len)) {
|
|
result = cs->cset->strntoull10rnd(cs,
|
|
str,
|
|
str_len,
|
|
unsigned_fl,
|
|
endptr,
|
|
err);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
|
|
//=============================================================
|
|
char* ObCharset::lltostr(int64_t val, char *dst, int radix, int upcase)
|
|
{
|
|
int ret = OB_SUCCESS;
|
|
static const int64_t MAX_BUFFER_SIZE = 65;//ok for int64min
|
|
static char DIG_VEC_UPPER[] =
|
|
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
|
|
static char DIG_VEC_LOWER[] =
|
|
"0123456789abcdefghijklmnopqrstuvwxyz";
|
|
//we do not take '\0' into consideration. '\0' terminated string is not expected
|
|
//use dst(start) and pret(end) to locate string, please.
|
|
char buffer[MAX_BUFFER_SIZE];
|
|
char *p = NULL;
|
|
long int new_val = 0;
|
|
char *dig_vec= upcase ? DIG_VEC_UPPER : DIG_VEC_LOWER;
|
|
uint64_t uval= (uint64_t) val;
|
|
char *pret = NULL;
|
|
if (radix < 0) {
|
|
if (radix < -36 || radix > -2) {
|
|
ret = OB_INVALID_ARGUMENT;
|
|
LOG_WARN("invalid radix", K(ret), K(radix));
|
|
} else {
|
|
if (val < 0) {
|
|
*dst++ = '-';
|
|
uval = (uint64_t)0 - uval;
|
|
}
|
|
radix = -radix;
|
|
}
|
|
} else if (radix > 36 || radix < 2) {
|
|
ret = OB_INVALID_ARGUMENT;
|
|
LOG_WARN("invalid radix", K(ret), K(radix));
|
|
}
|
|
|
|
/*
|
|
The slightly contorted code which follows is due to the fact that
|
|
few machines directly support unsigned long / and %. Certainly
|
|
the VAX C compiler generates a subroutine call. In the interests
|
|
of efficiency (hollow laugh) I let this happen for the first digit
|
|
only; after that "val" will be in range so that signed integer
|
|
division will do. Sorry 'bout that. CHECK THE CODE PRODUCED BY
|
|
YOUR C COMPILER. The first % and / should be unsigned, the second
|
|
% and / signed, but C compilers tend to be extraordinarily
|
|
sensitive to minor details of style. This works on a VAX, that's
|
|
all I claim for it.
|
|
*/
|
|
if (OB_SUCC(ret)) {
|
|
p = &buffer[sizeof(buffer)-1];
|
|
*p = '\0';
|
|
new_val= uval / (uint64_t) radix;
|
|
*--p = dig_vec[(uchar) (uval- (uint64_t) new_val*(uint64_t) radix)];
|
|
val = new_val;
|
|
ldiv_t res;
|
|
while (val != 0)
|
|
{
|
|
res=ldiv(val,radix);
|
|
*--p = dig_vec[res.rem];
|
|
val= res.quot;
|
|
}
|
|
while ((*dst++ = *p++) != 0) ;
|
|
pret = dst - 1;
|
|
}
|
|
return pret;
|
|
}
|
|
|
|
size_t ObCharset::scan_str(const char *str,
|
|
const char *end,
|
|
int sq)
|
|
{
|
|
ObCharsetInfo *cs = &ob_charset_bin;
|
|
size_t result = 0;
|
|
if (OB_ISNULL(str) || OB_ISNULL(end) || OB_ISNULL(cs)) {
|
|
BACKTRACE_RET(ERROR, OB_INVALID_ARGUMENT, true, "invalid argument. str = %p, end = %p, cs = %p", str, end, cs);
|
|
} else {
|
|
result = cs->cset->scan(cs, str, end, sq);
|
|
}
|
|
return result;
|
|
}
|
|
// Searches str1 for str2 with the collation's instr() handler.
// Returns the handler's mb_len for the first match plus one (a 1-based
// position), or 0 when there is no match or the arguments are rejected.
uint32_t ObCharset::instr(ObCollationType collation_type,
                          const char *str1,
                          int64_t str1_len,
                          const char *str2,
                          int64_t str2_len)
{
  uint32_t position = 0;
  if (is_argument_valid(collation_type, str1, str1_len, str2, str2_len)) {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    ob_match_t matches[2];
    const uint match_limit = 1;  // only the first match is requested
    const uint hits = cs->coll->instr(cs, str1, str1_len, str2, str2_len, matches, match_limit);
    if (0 != hits) {
      position = matches[0].mb_len + 1;
    }
  }
  return position;
}
|
|
|
|
// Byte-oriented counterpart of instr(): searches str1 for str2 with the
// collation's instr() handler and returns (end - beg) of the first match
// record, or -1 when there is no match or the arguments are rejected.
int64_t ObCharset::instrb(ObCollationType collation_type,
                          const char *str1,
                          int64_t str1_len,
                          const char *str2,
                          int64_t str2_len)
{
  int64_t offset = -1;
  if (is_argument_valid(collation_type, str1, str1_len, str2, str2_len)) {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    ob_match_t matches[2];
    const uint match_limit = 1;  // only the first match is requested
    const uint hits = cs->coll->instr(cs, str1, str1_len, str2, str2_len, matches, match_limit);
    if (0 != hits) {
      offset = matches[0].end - matches[0].beg;
    }
  }
  return offset;
}
|
|
|
|
// Searches str1 for str2, starting at the 1-based character position pos.
// Returns a 1-based position on success and 0 when nothing is found, when pos
// is out of range, when charpos() fails, or when the arguments are rejected.
uint32_t ObCharset::locate(ObCollationType collation_type,
                           const char *str1,
                           int64_t str1_len,
                           const char *str2,
                           int64_t str2_len,
                           int64_t pos)
{
  uint32_t found_at = 0;
  if (is_argument_valid(collation_type, str1, str1_len, str2, str2_len)) {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    const int64_t char_offset = pos - 1;  // 0-based start, in characters
    if (OB_UNLIKELY(char_offset < 0 || char_offset > str1_len)) {
      // start position outside the string: not found
    } else {
      int ret = OB_SUCCESS;
      // translate the character offset into a byte offset inside str1
      const int64_t byte_offset =
        static_cast<int64_t>(charpos(collation_type, str1, str1_len, char_offset, &ret));
      if (OB_FAIL(ret) || byte_offset + str2_len > str1_len) {
        // conversion failed, or the needle cannot fit in what is left
      } else if (0 == str2_len) {
        // NOTE(review): for an empty needle the result is built from the byte
        // offset, not the character offset -- confirm this is intended.
        found_at = static_cast<uint32_t>(byte_offset) + 1;
      } else {
        ob_match_t match;
        const uint32_t match_limit = 1;
        const uint32_t hits = cs->coll->instr(cs, str1 + byte_offset, str1_len - byte_offset,
                                              str2, str2_len, &match, match_limit);
        if (0 != hits) {
          found_at = match.mb_len + static_cast<uint32_t>(char_offset) + 1;
        }
      }
    }
  }
  return found_at;
}
|
|
|
|
// Compares two byte strings with the collation's strnncoll() handler (prefix
// mode off) and returns its result; returns 0 when the arguments are rejected.
int ObCharset::strcmp(ObCollationType collation_type,
                      const char *str1,
                      int64_t str1_len,
                      const char *str2,
                      int64_t str2_len)
{
  if (!is_argument_valid(collation_type, str1, str1_len, str2, str2_len)) {
    return 0;
  }
  ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
  const uchar *lhs = reinterpret_cast<const uchar *>(str1);
  const uchar *rhs = reinterpret_cast<const uchar *>(str2);
  const bool t_is_prefix = false;
  return cs->coll->strnncoll(cs, lhs, str1_len, rhs, str2_len, t_is_prefix);
}
|
|
|
|
// Compares two byte strings with the collation's strnncollsp() handler,
// passing cmp_endspace through, and returns its result; returns 0 when the
// arguments are rejected.
int ObCharset::strcmpsp(ObCollationType collation_type,
                        const char *str1,
                        int64_t str1_len,
                        const char *str2,
                        int64_t str2_len,
                        bool cmp_endspace)
{
  if (!is_argument_valid(collation_type, str1, str1_len, str2, str2_len)) {
    return 0;
  }
  ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
  const uchar *lhs = reinterpret_cast<const uchar *>(str1);
  const uchar *rhs = reinterpret_cast<const uchar *>(str2);
  return cs->coll->strnncollsp(cs, lhs, str1_len, rhs, str2_len, cmp_endspace);
}
|
|
|
|
size_t ObCharset::casedn(const ObCollationType collation_type, char *src, size_t src_len,
|
|
char *dest, size_t dest_len)
|
|
{
|
|
size_t size = 0;
|
|
if (is_argument_valid(collation_type, src, src_len, dest, dest_len)) {
|
|
ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
|
|
size = cs->cset->casedn(cs, src, src_len, dest, dest_len);
|
|
}
|
|
return size;
|
|
}
|
|
|
|
size_t ObCharset::caseup(const ObCollationType collation_type, char *src, size_t src_len,
|
|
char *dest, size_t dest_len)
|
|
{
|
|
size_t size = 0;
|
|
if (is_argument_valid(collation_type, src, src_len, dest, dest_len)) {
|
|
ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
|
|
size = cs->cset->caseup(cs, src, src_len, dest, dest_len);
|
|
}
|
|
return size;
|
|
}
|
|
|
|
/**
|
|
* @brief allocate new buf and do caseup
|
|
*/
|
|
int ObCharset::caseup(const ObCollationType collation_type,
|
|
const ObString &src,
|
|
ObString &dst,
|
|
ObIAllocator &allocator)
|
|
{
|
|
int ret = OB_SUCCESS;
|
|
|
|
if (OB_UNLIKELY(!is_valid_collation(collation_type))) {
|
|
ret = OB_INVALID_ARGUMENT;
|
|
LOG_WARN("invalid argument", K(ret), K(collation_type));
|
|
} else if (src.empty()) {
|
|
dst.reset();
|
|
} else {
|
|
ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
|
|
size_t buf_len = src.length() * cs->caseup_multiply;
|
|
char *buf = NULL;
|
|
if (OB_ISNULL(buf = static_cast<char *>(allocator.alloc(buf_len)))) {
|
|
ret = OB_ALLOCATE_MEMORY_FAILED;
|
|
LOG_WARN("fail to allocate memory", K(ret));
|
|
|
|
} else if (charset_type_by_coll(collation_type) == CHARSET_GB18030 ||
|
|
charset_type_by_coll(collation_type) == CHARSET_GB18030_2022) {
|
|
size_t dst_len = caseup(collation_type, (char*)src.ptr(), src.length(), buf, buf_len);
|
|
dst.assign_ptr(buf, static_cast<int32_t>(dst_len));
|
|
} else {
|
|
if (OB_FAIL(ob_write_string(allocator, src, dst))) {
|
|
LOG_WARN("fail to write string", K(ret));
|
|
} else {
|
|
ObCollationType col_type = (charset_type_by_coll(collation_type) == CHARSET_BINARY) ?
|
|
ObCollationType::CS_TYPE_UTF8MB4_BIN : collation_type;
|
|
size_t dst_len = caseup(col_type, dst.ptr(), dst.length(), dst.ptr(), dst.length());
|
|
dst.set_length(static_cast<int32_t>(dst_len));
|
|
}
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
/**
|
|
* @brief allocate new buf and do case down
|
|
*/
|
|
int ObCharset::casedn(const ObCollationType collation_type,
|
|
const ObString &src,
|
|
ObString &dst,
|
|
ObIAllocator &allocator)
|
|
{
|
|
int ret = OB_SUCCESS;
|
|
|
|
if (OB_UNLIKELY(!is_valid_collation(collation_type))) {
|
|
ret = OB_INVALID_ARGUMENT;
|
|
LOG_WARN("invalid argument", K(ret), K(collation_type));
|
|
} else if (src.empty()) {
|
|
dst.reset();
|
|
} else {
|
|
ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
|
|
size_t buf_len = src.length() * cs->caseup_multiply;
|
|
char *buf = NULL;
|
|
if (OB_ISNULL(buf = static_cast<char *>(allocator.alloc(buf_len)))) {
|
|
ret = OB_ALLOCATE_MEMORY_FAILED;
|
|
LOG_WARN("fail to allocate memory", K(ret));
|
|
|
|
} else if (charset_type_by_coll(collation_type) == CHARSET_GB18030 ||
|
|
charset_type_by_coll(collation_type) == CHARSET_GB18030_2022) {
|
|
size_t dst_len = casedn(collation_type, (char*)src.ptr(), src.length(), buf, buf_len);
|
|
dst.assign_ptr(buf, static_cast<int32_t>(dst_len));
|
|
} else {
|
|
if (OB_FAIL(ob_write_string(allocator, src, dst))) {
|
|
LOG_WARN("fail to write string", K(ret));
|
|
} else {
|
|
ObCollationType col_type = (charset_type_by_coll(collation_type) == CHARSET_BINARY) ?
|
|
ObCollationType::CS_TYPE_UTF8MB4_BIN : collation_type;
|
|
size_t dst_len = casedn(col_type, dst.ptr(), dst.length(), dst.ptr(), dst.length());
|
|
dst.set_length(static_cast<int32_t>(dst_len));
|
|
}
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
#define OB_MAX_WEIGHT OB_MAX_VARCHAR_LENGTH
|
|
// Builds the sort key of str into key using the collation's strnxfrm()
// handler and returns the handler's result (0 when the arguments are
// rejected). is_valid_unicode is only written when the handler was called.
size_t ObCharset::sortkey(ObCollationType collation_type,
                          const char *str,
                          int64_t str_len,
                          char *key,
                          int64_t key_len,
                          bool &is_valid_unicode)
{
  size_t key_bytes = 0;
  bool unicode_ok = 0;
  if (is_argument_valid(collation_type, str, str_len, key, key_len)) {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    uchar *key_out = reinterpret_cast<uchar *>(key);
    const uchar *str_in = reinterpret_cast<const uchar *>(str);

    // compare_collation_free already ignores trailing spaces on its own, so
    // the space-filtering logic that used to live in sortkey is no longer
    // needed.
    //
    // Purpose of the is_valid_unicode output, by example. Compare:
    //
    //   first string : 0x2c 0x80
    //   second string: 0x2c 0x80 0x20
    //
    // Without sort keys, 0x80 and everything after it is treated as invalid
    // unicode and compared byte-wise, so the second string is larger.
    //
    // With sort keys, the transformation stops at the invalid 0x80, so the
    // two keys compare equal.
    //
    // Fix: strings containing invalid unicode are compared the native way,
    // without converting to sort keys; this flag tells the caller which case
    // it is in.
    key_bytes = cs->coll->strnxfrm(cs,
                                   key_out,
                                   key_len,
                                   OB_MAX_WEIGHT,
                                   str_in,
                                   str_len,
                                   0,
                                   &unicode_ok);
    is_valid_unicode = unicode_ok;
  }
  return key_bytes;
}
|
|
|
|
// Variable-length sort key: like sortkey(), but uses the collation's optional
// strnxfrm_varlen() handler and forwards is_space_cmp to it.
// Returns the handler's result; (size_t)-1 when the collation has no such
// handler; 0 when the arguments are rejected. is_valid_unicode is only
// written when the handler was called.
size_t ObCharset::sortkey_var_len(ObCollationType collation_type,
                                  const char *str,
                                  int64_t str_len,
                                  char *key,
                                  int64_t key_len,
                                  bool is_space_cmp,
                                  bool &is_valid_unicode)
{
  size_t key_bytes = 0;
  bool unicode_ok = 0;
  if (is_argument_valid(collation_type, str, str_len, key, key_len)) {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);

    // Strings containing invalid unicode are compared the native way, without
    // converting to sort keys.
    if (NULL != cs->coll->strnxfrm_varlen) {
      uchar *key_out = reinterpret_cast<uchar *>(key);
      const uchar *str_in = reinterpret_cast<const uchar *>(str);
      key_bytes = cs->coll->strnxfrm_varlen(cs,
                                            key_out,
                                            key_len,
                                            OB_MAX_WEIGHT,
                                            str_in,
                                            str_len,
                                            is_space_cmp,
                                            &unicode_ok);
      is_valid_unicode = unicode_ok;
    } else {
      // this collation does not support variable-length sort keys
      key_bytes = static_cast<size_t>(-1);
    }
  }
  return key_bytes;
}
|
|
|
|
// Hashes `str` under the given collation through the handler's hash_sort().
//
// @param seed            initial hash value; also the value returned when the
//                        arguments are rejected or the handler is missing
// @param calc_end_space  forwarded unchanged to hash_sort()
// @param hash_algo       forwarded unchanged to hash_sort()
// @return the accumulator after hash_sort() has updated it
uint64_t ObCharset::hash(ObCollationType collation_type,
                         const char *str,
                         int64_t str_len,
                         uint64_t seed,
                         const bool calc_end_space,
                         hash_algo hash_algo)
{
  uint64_t ret = seed;
  if (is_argument_valid(collation_type, str, str_len, NULL, 0)) {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_NOT_NULL(cs->coll)) {
      // hash_sort() takes two integers: one that is both input and result (ret),
      // and one that is input only and acts as a random seed. The constant below
      // is borrowed from murmurhash64A().
      uint64_t hash_sort_seed = 0xc6a4a7935bd1e995;
      cs->coll->hash_sort(cs, reinterpret_cast<const uchar *>(str), str_len,
                          &ret, &hash_sort_seed, calc_end_space, hash_algo);
    } else {
      LOG_ERROR("unexpected error. invalid argument(s)", K(cs), K(cs->coll));
    }
  }
  return ret;
}
|
|
|
|
/* only called by unit test for now, is_oracle_mode will always return false in unit test
|
|
* if you want to use this hash fun in other places, please contact @maoli */
|
|
uint64_t ObCharset::hash(ObCollationType collation_type,
|
|
const char *str,
|
|
int64_t str_len,
|
|
uint64_t seed,
|
|
hash_algo hash_algo) {
|
|
return hash(collation_type, str, str_len, seed, lib::is_oracle_mode(), hash_algo);
|
|
}
|
|
|
|
// Computes the [min_str, max_str] range for a LIKE pattern by delegating to the
// collation handler's like_range(), using '_' and '%' as wildcards.
//
// @param like_str               the LIKE pattern
// @param escape                 escape character forwarded to the handler
// @param min_str, min_str_len   [in,out] lower-bound buffer and its length
// @param max_str, max_str_len   [in,out] upper-bound buffer and its length
// @return OB_SUCCESS; OB_ERR_UNEXPECTED for an invalid collation, a null
//         pointer or a missing handler; OB_EMPTY_RANGE when the handler
//         returns non-zero
int ObCharset::like_range(ObCollationType collation_type,
                          const ObString &like_str,
                          char escape,
                          char *min_str,
                          size_t *min_str_len,
                          char *max_str,
                          size_t *max_str_len)
{
  int ret = OB_SUCCESS;
  const bool type_out_of_range = (collation_type <= CS_TYPE_INVALID || collation_type >= CS_TYPE_MAX);
  if (OB_UNLIKELY(type_out_of_range) ||
      OB_ISNULL(min_str) ||
      OB_ISNULL(min_str_len) ||
      OB_ISNULL(max_str) ||
      OB_ISNULL(max_str_len) ||
      OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_ERR_UNEXPECTED;
    LOG_ERROR("unexpected error. invalid argument(s)",
              K(ret),
              K(collation_type),
              KP(max_str), K(max_str_len),
              KP(min_str), K(min_str_len));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    static char w_one = '_';
    static char w_many = '%';
    // History of a disabled adjustment:
    // like_range pads min_str, e.g. 'abc%' yields ('abc\min\min..', 'abc\max\max..').
    // The storage layer sometimes compares bytes and would then decide that 'abc'
    // is outside the range, so an earlier version truncated *min_str_len to the
    // length of the prefix before the first wildcard (3 for 'abc%'), giving
    // ('abc', 'abc\max\max\max..').
    // That caused another problem: 'a\0' fell outside the range, because with
    // MySQL's utf8 behaviour 'a\0' < 'a'. The range must not be changed this way;
    // the actual correction is left to the storage layer.

    // Both output buffers are limited to the smaller of the two capacities.
    size_t res_size = *max_str_len;
    if (*min_str_len < res_size) {
      res_size = *min_str_len;
    }

    if (OB_ISNULL(cs->coll)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_ERROR("unexpected error. invalid argument(s)", K(cs), K(cs->coll));
    } else if (0 != cs->coll->like_range(cs,
                                         like_str.ptr(),
                                         like_str.length(),
                                         escape,
                                         w_one,
                                         w_many,
                                         res_size,
                                         min_str,
                                         max_str,
                                         min_str_len,
                                         max_str_len)) {
      ret = OB_EMPTY_RANGE;
    }
  }

  return ret;
}
|
|
|
|
// Returns the result of the charset handler's numchars() over
// [str, str + str_len).
// Returns 0 and logs an error when the collation type is invalid or the
// charset has no handler.
size_t ObCharset::strlen_char(const ObCollationType collation_type,
                              const char *str,
                              int64_t str_len)
{
  size_t ret = 0;
  const bool type_out_of_range = (collation_type <= CS_TYPE_INVALID || collation_type >= CS_TYPE_MAX);
  if (OB_UNLIKELY(type_out_of_range) || OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    LOG_ERROR("unexpected error. invalid argument(s)", K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_NOT_NULL(cs->cset)) {
      const char *str_end = str + str_len;
      ret = cs->cset->numchars(cs, str, str_end);
    } else {
      LOG_ERROR("unexpected error. invalid argument(s)", K(cs), K(cs->cset));
    }
  }
  return ret;
}
|
|
|
|
// Returns the result of the charset handler's lengthsp() for (str, str_len).
// NOTE(review): presumably the byte length without trailing spaces -- confirm
// against the handler implementation.
// Returns 0 and logs an error when the collation type is invalid or the
// charset has no handler.
size_t ObCharset::strlen_byte_no_sp(const ObCollationType collation_type,
                                    const char *str,
                                    int64_t str_len)
{
  size_t ret = 0;
  const bool type_out_of_range = (collation_type <= CS_TYPE_INVALID || collation_type >= CS_TYPE_MAX);
  if (OB_UNLIKELY(type_out_of_range) || OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    LOG_ERROR("unexpected error. invalid argument(s)", K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_NOT_NULL(cs->cset)) {
      ret = cs->cset->lengthsp(cs, str, str_len);
    } else {
      LOG_ERROR("unexpected error. invalid argument(s)", K(cs), K(cs->cset));
    }
  }
  return ret;
}
|
|
|
|
// Computes the well-formed length of `str` via the charset handler and turns
// any handler-reported error into OB_ERR_INCORRECT_STRING_VALUE.
//
// @param well_formed_len  [out] handler result; 0 when str_len <= 0; not
//                         written on argument or handler lookup errors
// @return OB_SUCCESS; OB_ERR_UNEXPECTED for an invalid collation or a missing
//         handler; OB_INVALID_ARGUMENT when str is NULL but str_len is
//         non-zero; OB_ERR_INCORRECT_STRING_VALUE when the handler sets its
//         error flag
int ObCharset::well_formed_len(ObCollationType collation_type, const char *str,
                               int64_t str_len, int64_t &well_formed_len)
{
  int ret = OB_SUCCESS;
  const bool type_out_of_range = (collation_type <= CS_TYPE_INVALID || collation_type >= CS_TYPE_MAX);
  if (OB_UNLIKELY(type_out_of_range) || OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_ERR_UNEXPECTED;
    LOG_ERROR("unexpected error. invalid argument(s)", K(collation_type));
  } else if (OB_UNLIKELY(NULL == str && 0 != str_len)) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid argument, str is null and str_len is nonzero",
             KP(str), K(str_len), K(ret));
  } else if (str_len <= 0) {
    // Nothing to scan.
    well_formed_len = 0;
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_ISNULL(cs->cset)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_ERROR("unexpected error. invalid argument(s)", K(cs), K(cs->cset));
    } else {
      int32_t error = 0;
      const char *str_end = str + str_len;
      well_formed_len = cs->cset->well_formed_len(cs, str, str_end, UINT64_MAX, &error);
      if (0 != error) {
        ret = OB_ERR_INCORRECT_STRING_VALUE;
        LOG_WARN("well_formed_len failed. invalid char found",
                 K(ret), K(error), "str", ObString(str_len, str), KPHEX(str, str_len));
      }
    }
  }
  return ret;
}
|
|
|
|
|
|
int ObCharset::well_formed_len(ObCollationType collation_type, const char *str,
|
|
int64_t str_len, int64_t &well_formed_len, int32_t &well_formed_error)
|
|
{
|
|
int ret = OB_SUCCESS;
|
|
if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
|
|
collation_type >= CS_TYPE_MAX) ||
|
|
OB_ISNULL(ObCharset::charset_arr[collation_type])) {
|
|
ret = OB_ERR_UNEXPECTED;
|
|
LOG_ERROR("unexpected error. invalid argument(s)", K(collation_type));
|
|
} else if (OB_UNLIKELY(NULL == str && 0 != str_len)) {
|
|
ret = OB_INVALID_ARGUMENT;
|
|
LOG_WARN("invalid argument, str is null and str_len is nonzero",
|
|
KP(str), K(str_len), K(ret));
|
|
} else {
|
|
ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
|
|
if (OB_ISNULL(cs->cset)) {
|
|
ret = OB_ERR_UNEXPECTED;
|
|
LOG_ERROR("unexpected error. invalid argument(s)", K(cs), K(cs->cset));
|
|
} else {
|
|
well_formed_len = cs->cset->well_formed_len(cs, str, str + str_len, UINT64_MAX, &well_formed_error);
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
// Returns the result of the charset handler's charpos() for `length` over
// [str, str + str_len), clamped to str_len.
//
// @param ret  optional out-parameter; set to OB_ERROR_OUT_OF_RANGE when the
//             handler's result had to be clamped, otherwise left untouched
// @return the (possibly clamped) position, or 0 when the collation type is
//         invalid or the charset has no handler
size_t ObCharset::charpos(const ObCollationType collation_type,
                          const char *str,
                          const int64_t str_len,
                          const int64_t length,
                          int *ret)
{
  size_t res_pos = 0;
  const bool type_out_of_range = (collation_type <= CS_TYPE_INVALID || collation_type >= CS_TYPE_MAX);
  if (OB_UNLIKELY(type_out_of_range) || OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    LOG_ERROR_RET(OB_INVALID_ARGUMENT, "unexpected error. invalid argument(s)", K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_NOT_NULL(cs->cset)) {
      res_pos = cs->cset->charpos(cs, str, str + str_len, length);
      if (res_pos > str_len) {
        // The handler's result is past the end of the buffer: clamp and report.
        res_pos = str_len;
        if (OB_NOT_NULL(ret)) {
          *ret = OB_ERROR_OUT_OF_RANGE;
        }
      }
    } else {
      LOG_ERROR_RET(OB_INVALID_ARGUMENT, "unexpected error. invalid argument(s)", K(cs), K(cs->cset));
    }
  }
  return res_pos;
}
|
|
|
|
// Forwards to the charset handler's max_bytes_charpos() over
// [str, str + str_len) with the given byte budget.
//
// @param max_bytes  byte budget passed to the handler
// @param char_len   [out] count reported by the handler; written only when the
//                   handler runs
// @return the handler's return value, or 0 when the collation type is invalid
//         or the charset has no handler
size_t ObCharset::max_bytes_charpos(const ObCollationType collation_type,
                                    const char *str,
                                    const int64_t str_len,
                                    const int64_t max_bytes,
                                    int64_t &char_len)
{
  size_t ret = 0;
  const bool type_out_of_range = (collation_type <= CS_TYPE_INVALID || collation_type >= CS_TYPE_MAX);
  if (OB_UNLIKELY(type_out_of_range) || OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    LOG_ERROR("unexpected error. invalid argument(s)", K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_NOT_NULL(cs->cset)) {
      // The handler reports the count as size_t; copy it into the int64_t out-param.
      size_t handler_char_len = 0;
      ret = cs->cset->max_bytes_charpos(cs, str, str + str_len, max_bytes, &handler_char_len);
      char_len = handler_char_len;
    } else {
      LOG_ERROR("unexpected error. invalid argument(s)", K(cs), K(cs->cset));
    }
  }
  return ret;
}
|
|
|
|
// Matches `str` against the wildcard pattern `wildstr` using the collation
// handler's wildcmp().
//
// @param escape, w_one, w_many  escape and wildcard code points forwarded to the handler
// @return true only when the handler reports a match (returns 0); false on
//         mismatch, invalid collation type or missing handler
bool ObCharset::wildcmp(ObCollationType collation_type,
                        const ObString &str,
                        const ObString &wildstr,
                        int32_t escape, int32_t w_one, int32_t w_many)
{
  bool ret = false;
  const bool type_out_of_range = (collation_type <= CS_TYPE_INVALID || collation_type >= CS_TYPE_MAX);
  if (OB_UNLIKELY(type_out_of_range) || OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    LOG_ERROR("unexpected error. invalid argument(s)", K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_NOT_NULL(cs->coll)) {
      /*
      ** Handler result:
      **   0 if matched
      **  -1 if not matched with wildcard
      **   1 if matched with wildcard
      */
      ret = (0 == cs->coll->wildcmp(cs, str.ptr(), str.ptr() + str.length(),
                                    wildstr.ptr(), wildstr.ptr() + wildstr.length(),
                                    escape, w_one, w_many));
    } else {
      LOG_ERROR("unexpected error. invalid argument(s)", K(cs), K(cs->coll));
    }
  }
  return ret;
}
|
|
|
|
// Decodes the first character of `mb` into a wide character via the charset
// handler's mb_wc().
//
// @param wc  [out] decoded character; written only on success
// @return OB_SUCCESS; OB_ERR_UNEXPECTED for an invalid collation or a missing
//         handler; OB_ERR_INCORRECT_STRING_VALUE when the handler returns a
//         value <= 0
int ObCharset::mb_wc(ObCollationType collation_type,
                     const ObString &mb, int32_t &wc)
{
  int ret = OB_SUCCESS;
  const bool type_out_of_range = (collation_type <= CS_TYPE_INVALID || collation_type >= CS_TYPE_MAX);
  if (OB_UNLIKELY(type_out_of_range) || OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_ERR_UNEXPECTED;
    LOG_ERROR("unexpected error. invalid argument(s)",
              K(ret), K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_ISNULL(cs->cset)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_ERROR("unexpected error. invalid argument(s)", K(cs), K(cs->cset));
    } else {
      ob_wc_t decoded;
      const uchar *begin = reinterpret_cast<const uchar*>(mb.ptr());
      const uchar *end = reinterpret_cast<const uchar*>(mb.ptr() + mb.length());
      const int consumed = cs->cset->mb_wc(cs, &decoded, begin, end);
      if (consumed > 0) {
        wc = static_cast<int32_t>(decoded);
      } else {
        ret = OB_ERR_INCORRECT_STRING_VALUE;
      }
    }
  }
  return ret;
}
|
|
|
|
// Decodes the first character of the buffer [mb, mb + mb_size) into a wide
// character via the charset handler's mb_wc().
//
// @param length  [out] the handler's positive return value; written only on success
// @param wc      [out] decoded character; written only on success
// @return OB_SUCCESS; OB_ERR_UNEXPECTED for an invalid collation or a missing
//         handler; OB_ERR_INCORRECT_STRING_VALUE when the handler returns a
//         value <= 0
int ObCharset::mb_wc(ObCollationType collation_type,
                     const char *mb,
                     const int64_t mb_size,
                     int32_t &length,
                     int32_t &wc)
{
  int ret = OB_SUCCESS;
  const bool type_out_of_range = (collation_type <= CS_TYPE_INVALID || collation_type >= CS_TYPE_MAX);
  if (OB_UNLIKELY(type_out_of_range) || OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_ERR_UNEXPECTED;
    LOG_ERROR("unexpected error. invalid argument(s)",
              K(ret), K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_ISNULL(cs->cset)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_ERROR("unexpected error. invalid argument(s)", K(cs), K(cs->cset));
    } else {
      ob_wc_t decoded;
      const uchar *begin = reinterpret_cast<const uchar*>(mb);
      const uchar *end = reinterpret_cast<const uchar*>(mb + mb_size);
      const int consumed = cs->cset->mb_wc(cs, &decoded, begin, end);
      if (consumed > 0) {
        wc = static_cast<int32_t>(decoded);
        length = static_cast<int32_t>(consumed);
      } else {
        ret = OB_ERR_INCORRECT_STRING_VALUE;
      }
    }
  }
  return ret;
}
|
|
|
|
// Sums the display width of the characters in `mb`.
//
// Each character is decoded with the charset handler's mb_wc() and measured
// with mk_wcwidth_cjk() for CJK charsets or mk_wcwidth() otherwise; a width
// <= 0 is counted as 1. Scanning stops quietly when the remaining bytes cannot
// hold a complete character.
//
// @param width  [out] accumulated width; reset to 0 on entry
// @return OB_SUCCESS; OB_ERR_UNEXPECTED for an invalid collation, a missing
//         handler, or when mb_wc() returns OB_CS_ILSEQ
int ObCharset::display_len(ObCollationType collation_type,
                           const ObString &mb, int64_t &width)
{
  int ret = OB_SUCCESS;
  width = 0;
  const bool type_out_of_range = (collation_type <= CS_TYPE_INVALID || collation_type >= CS_TYPE_MAX);
  if (OB_UNLIKELY(type_out_of_range) || OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_ERR_UNEXPECTED;
    LOG_ERROR("unexpected error. invalid argument(s)",
              K(ret), K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_ISNULL(cs->cset)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_ERROR("unexpected error. invalid argument(s)", K(cs), K(cs->cset));
    } else {
      const uchar *buf = reinterpret_cast<const uchar*>(mb.ptr());
      const int64_t buf_size = mb.length();
      int64_t char_pos = 0;

      while (char_pos < buf_size) {
        ob_wc_t wc;
        const int consumed = cs->cset->mb_wc(cs, &wc, buf + char_pos, buf + buf_size);

        if (OB_UNLIKELY(consumed == OB_CS_ILSEQ)) {
          ret = OB_ERR_UNEXPECTED;
          LOG_WARN("Failed to call mb_wc", K(ret), "func_ret", consumed);
          break;
        }
        if (consumed < 0) {
          // The remaining buffer is too small for another character.
          break;
        }

        // Get the displayed width; never count less than one column.
        int w = ObCharset::is_cjk_charset(collation_type) ? mk_wcwidth_cjk(wc) : mk_wcwidth(wc);
        if (w <= 0) {
          w = 1;
        }
        if (char_pos + consumed > buf_size) {
          break;
        }
        width += w;
        char_pos += consumed;
      }
    }
  }
  return ret;
}
|
|
|
|
// Finds the byte offset of the longest prefix of [mb, mb + mb_size) whose
// accumulated display width does not exceed max_width.
//
// Characters are decoded with the charset handler's mb_wc() and measured with
// mk_wcwidth_cjk() for CJK charsets or mk_wcwidth() otherwise; a width <= 0 is
// counted as 1.
//
// @param char_pos         [out] byte offset just past the last accepted
//                         character; reset to 0 once the handler is resolved
// @param total_width_ret  optional [out] width of the accepted prefix; written
//                         only on success
// @return OB_SUCCESS; OB_ERR_UNEXPECTED for an invalid collation, a missing
//         handler, or when mb_wc() returns OB_CS_ILSEQ
int ObCharset::max_display_width_charpos(ObCollationType collation_type, const char *mb, const int64_t mb_size,
                                         const int64_t max_width, int64_t &char_pos, int64_t *total_width_ret)
{
  int ret = OB_SUCCESS;
  const bool type_out_of_range = (collation_type <= CS_TYPE_INVALID || collation_type >= CS_TYPE_MAX);
  if (OB_UNLIKELY(type_out_of_range) || OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_ERR_UNEXPECTED;
    LOG_ERROR("unexpected error. invalid argument(s)",
              K(ret), K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_ISNULL(cs->cset)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_ERROR("unexpected error. invalid argument(s)", K(cs), K(cs->cset));
    } else {
      char_pos = 0;
      const uchar *buf = reinterpret_cast<const uchar*>(mb);
      int64_t total_width = 0;

      while (char_pos < mb_size) {
        ob_wc_t wc;
        const int consumed = cs->cset->mb_wc(cs, &wc, buf + char_pos, buf + mb_size);

        if (OB_UNLIKELY(consumed == OB_CS_ILSEQ)) {
          ret = OB_ERR_UNEXPECTED;
          LOG_WARN("Failed to call mb_wc", K(ret), "func_ret", consumed);
          break;
        }
        if (consumed < 0) {
          // The remaining buffer is too small for another character.
          break;
        }

        // Get the displayed width; never count less than one column.
        int w = ObCharset::is_cjk_charset(collation_type) ? mk_wcwidth_cjk(wc) : mk_wcwidth(wc);
        if (w <= 0) {
          w = 1;
        }
        // Stop at the first character that overruns the buffer or the width budget.
        if (char_pos + consumed > mb_size || total_width + w > max_width) {
          break;
        }
        total_width += w;
        char_pos += consumed;
      }

      if (OB_SUCC(ret) && NULL != total_width_ret) {
        *total_width_ret = total_width;
      }
    }
  }
  return ret;
}
|
|
|
|
|
|
// Encodes the wide character `wc` into `buff` using the charset handler's wc_mb().
//
// @param buff, buff_len  output buffer and its capacity in bytes
// @param length          [out] the handler's positive return value; written
//                        only on success
// @return OB_SUCCESS; OB_ERR_UNEXPECTED for an invalid collation or a missing
//         handler; OB_ERR_INCORRECT_STRING_VALUE when the handler returns a
//         value <= 0
int ObCharset::wc_mb(ObCollationType collation_type, int32_t wc, char *buff, int32_t buff_len, int32_t &length)
{
  int ret = OB_SUCCESS;
  const bool type_out_of_range = (collation_type <= CS_TYPE_INVALID || collation_type >= CS_TYPE_MAX);
  if (OB_UNLIKELY(type_out_of_range) || OB_ISNULL(ObCharset::charset_arr[collation_type])) {
    ret = OB_ERR_UNEXPECTED;
    LOG_ERROR("unexpected error. invalid argument(s)", K(ret), K(collation_type));
  } else {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (OB_ISNULL(cs) || OB_ISNULL(cs->cset)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_ERROR("unexpected error. invalid argument(s)", K(cs), K(ret));
    } else {
      uchar *out_begin = reinterpret_cast<uchar*>(buff);
      uchar *out_end = reinterpret_cast<uchar*>(buff + buff_len);
      const int written = cs->cset->wc_mb(cs, wc, out_begin, out_end);
      if (written > 0) {
        length = written;
      } else {
        ret = OB_ERR_INCORRECT_STRING_VALUE;
      }
    }
  }
  return ret;
}
|
|
|
|
// Maps a charset type to its lower-case name.
// Returns "invalid_type" for any value not listed below.
const char *ObCharset::charset_name(ObCharsetType charset_type)
{
  switch (charset_type) {
    case CHARSET_BINARY:
      return "binary";
    case CHARSET_UTF8MB4:
      return "utf8mb4";
    case CHARSET_GBK:
      return "gbk";
    case CHARSET_UTF16:
      return "utf16";
    case CHARSET_GB18030:
      return "gb18030";
    case CHARSET_LATIN1:
      return "latin1";
    case CHARSET_GB18030_2022:
      return "gb18030_2022";
    default:
      break;
  }
  return "invalid_type";
}
|
|
|
|
// Returns the name of the charset that the given collation belongs to, as
// resolved by charset_type_by_coll() and charset_name(ObCharsetType).
const char *ObCharset::charset_name(ObCollationType collation_type)
{
  const ObCharsetType owning_charset = charset_type_by_coll(collation_type);
  return charset_name(owning_charset);
}
|
|
|
|
// Returns the name stored in the charset table entry for `collation_type`.
// Returns "invalid_type" when the type is outside [CS_TYPE_INVALID, CS_TYPE_MAX)
// or the table entry is NULL.
const char *ObCharset::collation_name(ObCollationType collation_type)
{
  const bool in_table_range = (collation_type >= CS_TYPE_INVALID && collation_type < CS_TYPE_MAX);
  if (in_table_range) {
    ObCharsetInfo *entry = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    if (NULL != entry) {
      return entry->name;
    }
  }
  return "invalid_type";
}
|
|
|
|
// Rejects implicit conversion between the GB18030 and GB18030_2022 charsets,
// in either direction.
//
// @return OB_CANT_AGGREGATE_2COLLATIONS for that pair, OB_SUCCESS otherwise
int ObCharset::check_valid_implicit_convert(ObCollationType src_type, ObCollationType dst_type)
{
  int ret = OB_SUCCESS;
  const ObCharsetType src_cs = ObCharset::charset_type_by_coll(src_type);
  const ObCharsetType dst_cs = ObCharset::charset_type_by_coll(dst_type);
  const bool old_to_new = (CHARSET_GB18030 == src_cs && CHARSET_GB18030_2022 == dst_cs);
  const bool new_to_old = (CHARSET_GB18030_2022 == src_cs && CHARSET_GB18030 == dst_cs);
  if (old_to_new || new_to_old) {
    ret = OB_CANT_AGGREGATE_2COLLATIONS;
    LOG_WARN("implict cast between GB18030 and GB18030_2022 not allowed", K(ret));
  }
  return ret;
}
|
|
|
|
// Looks up the collation name for `collation_type` and stores it in `coll_name`.
//
// @param coll_name  [out] set from the table entry's name on success; untouched on failure
// @return OB_SUCCESS, or OB_INVALID_ARGUMENT when the type is outside
//         [CS_TYPE_INVALID, CS_TYPE_MAX) or the table entry is NULL
int ObCharset::collation_name(ObCollationType collation_type, ObString &coll_name)
{
  int ret = OB_SUCCESS;
  ObCharsetInfo *entry = NULL;
  const bool in_table_range = (collation_type >= CS_TYPE_INVALID && collation_type < CS_TYPE_MAX);
  if (in_table_range) {
    entry = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
  }
  if (OB_NOT_NULL(entry)) {
    coll_name = ObString(entry->name);
  } else {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid collation type", K(ret), K(collation_type));
  }
  return ret;
}
|
|
|
|
const char* ObCharset::collation_level(const ObCollationLevel cs_level)
|
|
{
|
|
const char* ret = "unknown_collation_level";
|
|
switch(cs_level) {
|
|
case CS_LEVEL_EXPLICIT: {
|
|
ret = "EXPLICIT";
|
|
break;
|
|
}
|
|
case CS_LEVEL_NONE: {
|
|
ret = "NONE";
|
|
break;
|
|
}
|
|
case CS_LEVEL_IMPLICIT: {
|
|
ret = "IMPLICIT";
|
|
break;
|
|
}
|
|
case CS_LEVEL_SYSCONST: {
|
|
ret = "SYSCONST";
|
|
break;
|
|
}
|
|
case CS_LEVEL_COERCIBLE: {
|
|
ret = "COERCIBLE";
|
|
break;
|
|
}
|
|
case CS_LEVEL_NUMERIC: {
|
|
ret = "NUMERIC";
|
|
break;
|
|
}
|
|
case CS_LEVEL_IGNORABLE: {
|
|
ret = "IGNORABLE";
|
|
break;
|
|
}
|
|
case CS_LEVEL_INVALID: {
|
|
ret = "INVALID";
|
|
break;
|
|
}
|
|
default: {
|
|
break;
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
|
|
// Resolves a charset name (compared case-insensitively) to its charset type.
// Returns CHARSET_INVALID when the name matches none of the known charsets.
ObCharsetType ObCharset::charset_type(const ObString &cs_name)
{
  // Candidates are tried in order; the first match wins.
  const struct {
    const char *name;
    ObCharsetType type;
  } known_charsets[] = {
    {"utf8", CHARSET_UTF8MB4},  // utf8 is an alias of utf8mb4
    {ob_charset_utf8mb4_bin.csname, CHARSET_UTF8MB4},
    {ob_charset_bin.csname, CHARSET_BINARY},
    {ob_charset_gbk_bin.csname, CHARSET_GBK},
    {ob_charset_utf16_general_ci.csname, CHARSET_UTF16},
    {ob_charset_gb18030_bin.csname, CHARSET_GB18030},
    {ob_charset_latin1.csname, CHARSET_LATIN1},
    {ob_charset_gb18030_2022_bin.csname, CHARSET_GB18030_2022},
  };
  const int64_t candidate_cnt = sizeof(known_charsets) / sizeof(known_charsets[0]);

  ObCharsetType matched = CHARSET_INVALID;
  for (int64_t i = 0; i < candidate_cnt; ++i) {
    if (0 == cs_name.case_compare(known_charsets[i].name)) {
      matched = known_charsets[i].type;
      break;
    }
  }
  return matched;
}
|
|
|
|
// Resolves an Oracle-style charset name (compared case-insensitively) to its
// charset type. Returns CHARSET_INVALID when the name is not recognised.
ObCharsetType ObCharset::charset_type_by_name_oracle(const ObString &cs_name)
{
  // Candidates are tried in order; the first match wins.
  const struct {
    const char *name;
    ObCharsetType type;
  } oracle_names[] = {
    {"AL32UTF8", CHARSET_UTF8MB4},
    {"UTF8", CHARSET_UTF8MB4},
    {"AL16UTF16", CHARSET_UTF16},
    {"ZHS16GBK", CHARSET_GBK},
    {"ZHS32GB18030", CHARSET_GB18030},
    {"WE8MSWIN1252", CHARSET_LATIN1},
    {"ZHS32GB18030_2022", CHARSET_GB18030_2022},
  };
  const int64_t candidate_cnt = sizeof(oracle_names) / sizeof(oracle_names[0]);

  ObCharsetType matched = CHARSET_INVALID;
  for (int64_t i = 0; i < candidate_cnt; ++i) {
    if (0 == cs_name.case_compare(oracle_names[i].name)) {
      matched = oracle_names[i].type;
      break;
    }
  }
  return matched;
}
|
|
|
|
// C-string convenience overload of charset_type(const ObString &).
//
// @param cs_name  NUL-terminated charset name; NULL is rejected with an error log
// @return the matching charset type, or CHARSET_INVALID when cs_name is NULL
//         or names no known charset
ObCharsetType ObCharset::charset_type(const char *cs_name)
{
  ObCharsetType ct = CHARSET_INVALID;
  if (OB_ISNULL(cs_name)) {
    // `ret` is not an error code in this function, so the _RET log variant is
    // used; ct is logged once (the original passed K(ct) twice).
    LOG_ERROR_RET(OB_INVALID_ARGUMENT, "unexpected error. invalid argument(s)",
                  K(ct), KP(cs_name));
  } else {
    ObString cs_name_str = ObString::make_string(cs_name);
    ct = charset_type(cs_name_str);
  }
  return ct;
}
|
|
|
|
// Resolves a collation name (compared case-insensitively) to its collation
// type. Accepts the "utf8_*" aliases of the utf8mb4 collations and "any_cs".
// Returns CS_TYPE_INVALID when the name is not recognised.
ObCollationType ObCharset::collation_type(const ObString &cs_name)
{
  // Candidates are tried in order; the first match wins.
  const struct {
    const char *name;
    ObCollationType type;
  } known_collations[] = {
    {"utf8_bin", CS_TYPE_UTF8MB4_BIN},
    {"utf8_general_ci", CS_TYPE_UTF8MB4_GENERAL_CI},
    {ob_charset_utf8mb4_bin.name, CS_TYPE_UTF8MB4_BIN},
    {ob_charset_utf8mb4_general_ci.name, CS_TYPE_UTF8MB4_GENERAL_CI},
    {ob_charset_bin.name, CS_TYPE_BINARY},
    {ob_charset_gbk_chinese_ci.name, CS_TYPE_GBK_CHINESE_CI},
    {ob_charset_gbk_bin.name, CS_TYPE_GBK_BIN},
    {ob_charset_utf16_general_ci.name, CS_TYPE_UTF16_GENERAL_CI},
    {ob_charset_utf16_bin.name, CS_TYPE_UTF16_BIN},
    {ob_charset_gb18030_bin.name, CS_TYPE_GB18030_BIN},
    {ob_charset_gb18030_chinese_ci.name, CS_TYPE_GB18030_CHINESE_CI},
    {ob_charset_latin1_bin.name, CS_TYPE_LATIN1_BIN},
    {ob_charset_latin1.name, CS_TYPE_LATIN1_SWEDISH_CI},
    {ob_charset_gb18030_chinese_cs.name, CS_TYPE_GB18030_CHINESE_CS},
    {"any_cs", CS_TYPE_ANY},
    {ob_charset_gb18030_2022_bin.name, CS_TYPE_GB18030_2022_BIN},
    {ob_charset_gb18030_2022_pinyin_ci.name, CS_TYPE_GB18030_2022_PINYIN_CI},
    {ob_charset_gb18030_2022_pinyin_cs.name, CS_TYPE_GB18030_2022_PINYIN_CS},
    {ob_charset_gb18030_2022_radical_ci.name, CS_TYPE_GB18030_2022_RADICAL_CI},
    {ob_charset_gb18030_2022_radical_cs.name, CS_TYPE_GB18030_2022_RADICAL_CS},
    {ob_charset_gb18030_2022_stroke_ci.name, CS_TYPE_GB18030_2022_STROKE_CI},
    {ob_charset_gb18030_2022_stroke_cs.name, CS_TYPE_GB18030_2022_STROKE_CS},
  };
  const int64_t candidate_cnt = sizeof(known_collations) / sizeof(known_collations[0]);

  ObCollationType matched = CS_TYPE_INVALID;
  for (int64_t i = 0; i < candidate_cnt; ++i) {
    if (0 == cs_name.case_compare(known_collations[i].name)) {
      matched = known_collations[i].type;
      break;
    }
  }
  return matched;
}
|
|
|
|
// C-string convenience overload: wraps cs_name with ObString::make_string()
// and defers to collation_type(const ObString &).
ObCollationType ObCharset::collation_type(const char* cs_name)
{
  const ObString wrapped_name = ObString::make_string(cs_name);
  return collation_type(wrapped_name);
}
|
|
|
|
// Tells whether `collation_type` is one of the collations accepted for
// `charset_type`. Unknown charsets accept nothing.
bool ObCharset::is_valid_collation(ObCharsetType charset_type, ObCollationType collation_type)
{
  bool is_accepted = false;
  switch (charset_type) {
    case CHARSET_UTF8MB4:
      is_accepted = (CS_TYPE_UTF8MB4_BIN == collation_type
                     || CS_TYPE_UTF8MB4_GENERAL_CI == collation_type);
      break;
    case CHARSET_BINARY:
      is_accepted = (CS_TYPE_BINARY == collation_type);
      break;
    case CHARSET_GBK:
      is_accepted = (CS_TYPE_GBK_BIN == collation_type
                     || CS_TYPE_GBK_CHINESE_CI == collation_type);
      break;
    case CHARSET_UTF16:
      is_accepted = (CS_TYPE_UTF16_GENERAL_CI == collation_type
                     || CS_TYPE_UTF16_BIN == collation_type);
      break;
    case CHARSET_GB18030:
      is_accepted = (CS_TYPE_GB18030_CHINESE_CI == collation_type
                     || CS_TYPE_GB18030_BIN == collation_type);
      break;
    case CHARSET_LATIN1:
      is_accepted = (CS_TYPE_LATIN1_SWEDISH_CI == collation_type
                     || CS_TYPE_LATIN1_BIN == collation_type);
      break;
    case CHARSET_GB18030_2022:
      is_accepted = is_gb18030_2022(collation_type);
      break;
    default:
      break;
  }
  return is_accepted;
}
|
|
// Resolves the collation to use for an NLSSORT parameter value (compared
// case-insensitively) under the given charset.
//
// - "BINARY" maps to the charset's binary collation.
// - The *_M names map to fixed GB18030 / GB18030_2022 case-sensitive collations.
// - The remaining recognised names are computed from a per-family begin mark
//   plus the charset's offset from CHARSET_BINARY; CHARSET_LATIN1 gets no
//   collation from this path.
//
// @return the resolved collation, or CS_TYPE_INVALID when the parameter is not
//         recognised, the charset is invalid, or no collation applies
ObCollationType ObCharset::get_coll_type_by_nlssort_param(ObCharsetType charset_type,
                                                          const ObString &nlssort_param)
{
  // Binary collation of each charset, indexed by ObCharsetType.
  static ObCollationType bin_coll_map[CHARSET_MAX] = {
    CS_TYPE_INVALID,
    CS_TYPE_BINARY,
    CS_TYPE_UTF8MB4_BIN,
    CS_TYPE_GBK_BIN,
    CS_TYPE_UTF16_BIN,
    CS_TYPE_GB18030_BIN,
    CS_TYPE_LATIN1_BIN,
    CS_TYPE_GB18030_2022_BIN,
  };
  // Begin mark of each non-binary collation family, indexed by ObNLSCollation.
  static ObCollationType non_bin_coll_marks[NLS_COLLATION_MAX] = {
    CS_TYPE_INVALID,
    CS_TYPE_PINYIN_BEGIN_MARK,
    CS_TYPE_RADICAL_BEGIN_MARK,
    CS_TYPE_STROKE_BEGIN_MARK,
  };
  // Recognised parameter names, tried in order; the first match wins.
  const struct {
    const char *param;
    ObNLSCollation nls_coll;
  } known_params[] = {
    {"SCHINESE_PINYIN_M", NLS_COLLATION_SCHINESE_PINYIN_M},
    {"SCHINESE_PINYIN2_M", NLS_COLLATION_SCHINESE_PINYIN2_M},
    {"SCHINESE_RADICAL2_M", NLS_COLLATION_SCHINESE_RADICAL2_M},
    {"SCHINESE_STROKE2_M", NLS_COLLATION_SCHINESE_STROKE2_M},
    {"UCA0900_SCHINESE_PINYIN", NLS_COLLATION_SCHINESE_PINYIN_900},
    {"UCA0900_SCHINESE_RADICAL", NLS_COLLATION_SCHINESE_RADICAL_900},
    {"UCA0900_SCHINESE_STROKE", NLS_COLLATION_SCHINESE_STROKE_900},
    {"BINARY", NLS_COLLATION_BINARY},
  };
  const int64_t param_cnt = sizeof(known_params) / sizeof(known_params[0]);

  ObNLSCollation nls_coll_type = NLS_COLLATION_INVALID;
  for (int64_t i = 0; i < param_cnt; ++i) {
    if (0 == nlssort_param.case_compare(known_params[i].param)) {
      nls_coll_type = known_params[i].nls_coll;
      break;
    }
  }

  ObCollationType coll_type = CS_TYPE_INVALID;
  if (is_valid_nls_collation(nls_coll_type) && is_valid_charset(charset_type)) {
    if (NLS_COLLATION_BINARY == nls_coll_type) {
      coll_type = bin_coll_map[charset_type];
    } else if (NLS_COLLATION_SCHINESE_PINYIN_M == nls_coll_type) {
      coll_type = CS_TYPE_GB18030_CHINESE_CS;
    } else if (NLS_COLLATION_SCHINESE_PINYIN2_M == nls_coll_type) {
      coll_type = CS_TYPE_GB18030_2022_PINYIN_CS;
    } else if (NLS_COLLATION_SCHINESE_RADICAL2_M == nls_coll_type) {
      coll_type = CS_TYPE_GB18030_2022_RADICAL_CS;
    } else if (NLS_COLLATION_SCHINESE_STROKE2_M == nls_coll_type) {
      coll_type = CS_TYPE_GB18030_2022_STROKE_CS;
    } else if (CHARSET_LATIN1 != charset_type) {
      // Family begin mark plus the charset's offset from CHARSET_BINARY.
      coll_type = static_cast<ObCollationType>(
          non_bin_coll_marks[nls_coll_type] + (charset_type - CHARSET_BINARY));
    }
  }
  return coll_type;
}
|
|
|
|
// Tells whether the integer `collation_type_int` denotes one of the collations
// listed below or any collation accepted by is_gb18030_2022().
bool ObCharset::is_valid_collation(int64_t collation_type_int)
{
  const ObCollationType collation_type = static_cast<ObCollationType>(collation_type_int);
  switch (collation_type) {
    case CS_TYPE_UTF8MB4_GENERAL_CI:
    case CS_TYPE_UTF8MB4_BIN:
    case CS_TYPE_BINARY:
    case CS_TYPE_GBK_BIN:
    case CS_TYPE_GBK_CHINESE_CI:
    case CS_TYPE_UTF16_BIN:
    case CS_TYPE_UTF16_GENERAL_CI:
    case CS_TYPE_GB18030_BIN:
    case CS_TYPE_GB18030_CHINESE_CI:
    case CS_TYPE_GB18030_CHINESE_CS:
    case CS_TYPE_LATIN1_SWEDISH_CI:
    case CS_TYPE_LATIN1_BIN:
      return true;
    default:
      break;
  }
  // Everything else is valid only if it is a GB18030_2022 collation.
  return is_gb18030_2022(collation_type);
}
|
|
|
|
// Maps a collation type to the charset it belongs to.
// Returns CHARSET_INVALID for any collation not listed below.
ObCharsetType ObCharset::charset_type_by_coll(ObCollationType collation_type)
{
  switch (collation_type) {
    // utf8mb4 family
    case CS_TYPE_UTF8MB4_GENERAL_CI:
    case CS_TYPE_UTF8MB4_BIN:
    case CS_TYPE_UTF8MB4_ZH_0900_AS_CS:
    case CS_TYPE_UTF8MB4_ZH2_0900_AS_CS:
    case CS_TYPE_UTF8MB4_ZH3_0900_AS_CS:
    case CS_TYPE_UTF8MB4_UNICODE_CI:
      return CHARSET_UTF8MB4;

    case CS_TYPE_BINARY:
      return CHARSET_BINARY;

    // gbk family
    case CS_TYPE_GBK_CHINESE_CI:
    case CS_TYPE_GBK_ZH_0900_AS_CS:
    case CS_TYPE_GBK_ZH2_0900_AS_CS:
    case CS_TYPE_GBK_ZH3_0900_AS_CS:
    case CS_TYPE_GBK_BIN:
      return CHARSET_GBK;

    // utf16 family
    case CS_TYPE_UTF16_BIN:
    case CS_TYPE_UTF16_ZH_0900_AS_CS:
    case CS_TYPE_UTF16_ZH2_0900_AS_CS:
    case CS_TYPE_UTF16_ZH3_0900_AS_CS:
    case CS_TYPE_UTF16_GENERAL_CI:
    case CS_TYPE_UTF16_UNICODE_CI:
      return CHARSET_UTF16;

    // gb18030 family
    case CS_TYPE_GB18030_ZH_0900_AS_CS:
    case CS_TYPE_GB18030_ZH2_0900_AS_CS:
    case CS_TYPE_GB18030_ZH3_0900_AS_CS:
    case CS_TYPE_GB18030_CHINESE_CS:
    case CS_TYPE_GB18030_CHINESE_CI:
    case CS_TYPE_GB18030_BIN:
      return CHARSET_GB18030;

    // latin1 family
    case CS_TYPE_LATIN1_SWEDISH_CI:
    case CS_TYPE_LATIN1_BIN:
      return CHARSET_LATIN1;

    // gb18030_2022 family
    case CS_TYPE_GB18030_2022_BIN:
    case CS_TYPE_GB18030_2022_PINYIN_CI:
    case CS_TYPE_GB18030_2022_PINYIN_CS:
    case CS_TYPE_GB18030_2022_RADICAL_CI:
    case CS_TYPE_GB18030_2022_RADICAL_CS:
    case CS_TYPE_GB18030_2022_STROKE_CI:
    case CS_TYPE_GB18030_2022_STROKE_CS:
    case CS_TYPE_GB18030_2022_ZH_0900_AS_CS:
    case CS_TYPE_GB18030_2022_ZH2_0900_AS_CS:
    case CS_TYPE_GB18030_2022_ZH3_0900_AS_CS:
      return CHARSET_GB18030_2022;

    default:
      break;
  }
  return CHARSET_INVALID;
}
|
|
|
|
// Maps a charset type to the corresponding Oracle NLS charset id.
// Returns CHARSET_INVALID_ID for charsets without a mapping.
ObNlsCharsetId ObCharset::charset_type_to_ora_charset_id(ObCharsetType cs_type)
{
  switch (cs_type) {
    case CHARSET_UTF8MB4:
      return CHARSET_AL32UTF8_ID;
    case CHARSET_GBK:
      return CHARSET_ZHS16GBK_ID;
    case CHARSET_GB18030:
      return CHARSET_ZHS32GB18030_ID;
    case CHARSET_UTF16:
      return CHARSET_AL16UTF16_ID;
    case CHARSET_LATIN1:
      return CHARSET_WE8MSWIN1252_ID;
    case CHARSET_GB18030_2022:
      return CHARSET_ZHS32GB18030_2022_ID;
    default:
      break;
  }
  return CHARSET_INVALID_ID;
}
|
|
|
|
// Maps an Oracle NLS charset id to the corresponding charset type; the inverse
// of charset_type_to_ora_charset_id().
// Returns CHARSET_INVALID for ids without a mapping.
ObCharsetType ObCharset::ora_charset_type_to_charset_type(ObNlsCharsetId charset_id)
{
  ObCharsetType cs_type = CHARSET_INVALID;
  switch (charset_id)
  {
  case CHARSET_AL32UTF8_ID:
    cs_type = CHARSET_UTF8MB4;
    break;
  case CHARSET_ZHS16GBK_ID:
    cs_type = CHARSET_GBK;
    break;
  case CHARSET_ZHS32GB18030_ID:
    cs_type = CHARSET_GB18030;
    break;
  case CHARSET_AL16UTF16_ID:
    cs_type = CHARSET_UTF16;
    break;
  case CHARSET_WE8MSWIN1252_ID:
    cs_type = CHARSET_LATIN1;
    // This break was missing: the case fell through to the next label and
    // WE8MSWIN1252 was reported as CHARSET_GB18030_2022.
    break;
  case CHARSET_ZHS32GB18030_2022_ID:
    cs_type = CHARSET_GB18030_2022;
    break;
  default:
    break;
  }
  return cs_type;
}
|
|
|
|
// Returns true when nls_collation lies strictly between the
// NLS_COLLATION_INVALID and NLS_COLLATION_MAX sentinels.
bool ObCharset::is_valid_nls_collation(ObNLSCollation nls_collation)
{
  const bool above_lower_bound = NLS_COLLATION_INVALID < nls_collation;
  const bool below_upper_bound = nls_collation < NLS_COLLATION_MAX;
  return above_lower_bound && below_upper_bound;
}
|
|
|
|
int ObCharset::charset_name_by_coll(const ObString &coll_name, ObString &cs_name)
|
|
{
|
|
int ret = OB_SUCCESS;
|
|
ObCollationType coll_type = collation_type(coll_name);
|
|
if (OB_UNLIKELY(CS_TYPE_INVALID == coll_type)) {
|
|
ret = OB_ERR_UNKNOWN_COLLATION;
|
|
LOG_WARN("invalid collation type", K(ret), K(coll_name));
|
|
} else if (OB_FAIL(charset_name_by_coll(coll_type, cs_name))) {
|
|
LOG_WARN("fail to get charset type by collation type", K(ret), K(coll_type), K(coll_name));
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
// Resolves the charset name that a collation type belongs to.
//
// @param collation_type  collation to look up
// @param cs_name         [out] charset name; only written on success
// @return OB_SUCCESS;
//         OB_ERR_UNKNOWN_COLLATION if collation_type is CS_TYPE_INVALID;
//         OB_ERR_UNKNOWN_CHARSET if the collation maps to no charset;
//         OB_ERR_UNEXPECTED if the charset name comes back as "invalid_type"
int ObCharset::charset_name_by_coll(ObCollationType collation_type, ObString &cs_name)
{
  int ret = OB_SUCCESS;
  ObCharsetType charset_type = CHARSET_INVALID;
  if (OB_UNLIKELY(CS_TYPE_INVALID == collation_type)) {
    ret = OB_ERR_UNKNOWN_COLLATION;
    LOG_WARN("invalid collation type", K(ret), K(collation_type));
  } else if (OB_UNLIKELY(CHARSET_INVALID == (charset_type = charset_type_by_coll(collation_type)))) {
    ret = OB_ERR_UNKNOWN_CHARSET;
    LOG_WARN("has no charset type of this collation type", K(ret), K(collation_type));
  } else {
    ObString resolved_name = ObString(charset_name(charset_type));
    if (OB_UNLIKELY(resolved_name == "invalid_type")) {
      // a valid charset type must always have a printable name
      ret = OB_ERR_UNEXPECTED;
      LOG_ERROR("charset str is invalid_type", K(ret), K(charset_type), K(collation_type));
    } else {
      cs_name = resolved_name;
    }
  }
  return ret;
}
|
|
|
|
int ObCharset::calc_collation(
|
|
const ObCollationLevel collation_level1,
|
|
const ObCollationType collation_type1,
|
|
const ObCollationLevel collation_level2,
|
|
const ObCollationType collation_type2,
|
|
ObCollationLevel &res_level,
|
|
ObCollationType &res_type)
|
|
{
|
|
return ObCharset::result_collation(collation_level1, collation_type1,
|
|
collation_level2, collation_type2,
|
|
res_level, res_type);
|
|
}
|
|
|
|
int ObCharset::result_collation(
|
|
const ObCollationLevel collation_level1,
|
|
const ObCollationType collation_type1,
|
|
const ObCollationLevel collation_level2,
|
|
const ObCollationType collation_type2,
|
|
ObCollationLevel &res_level,
|
|
ObCollationType &res_type)
|
|
{
|
|
int ret = OB_SUCCESS;
|
|
if (OB_UNLIKELY(CS_LEVEL_INVALID == collation_level1
|
|
|| CS_LEVEL_INVALID == collation_level2
|
|
|| CS_TYPE_INVALID == collation_type1
|
|
|| CS_TYPE_INVALID == collation_type2)) {
|
|
ret = OB_ERR_UNEXPECTED;
|
|
LOG_WARN("invalid collation level or type", K(collation_level1), K(collation_type1), K(collation_level2), K(collation_type2));
|
|
} else if (collation_level1 == collation_level2) {
|
|
if (CS_LEVEL_EXPLICIT == collation_level1 && collation_type1 != collation_type2) {
|
|
// ERROR 1267 (HY000): Illegal mix of collations (utf8_general_ci,EXPLICIT) and (utf8_bin,EXPLICIT) for operation '='
|
|
ret = OB_CANT_AGGREGATE_2COLLATIONS;
|
|
// LOG_USER_ERROR(ret);
|
|
} else {
|
|
// just consider two collations: bin & general_ci.
|
|
// we must change the code below if we need to support more collations.
|
|
res_level = collation_level1;
|
|
res_type = (collation_type1 == collation_type2) ? collation_type1 : CS_TYPE_UTF8MB4_BIN;
|
|
}
|
|
} else if (collation_level1 < collation_level2) {
|
|
res_level = collation_level1;
|
|
res_type = collation_type1;
|
|
} else {
|
|
res_level = collation_level2;
|
|
res_type = collation_type2;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
int ObCharset::aggregate_collation(
|
|
const ObCollationLevel collation_level1,
|
|
const ObCollationType collation_type1,
|
|
const ObCollationLevel collation_level2,
|
|
const ObCollationType collation_type2,
|
|
ObCollationLevel &res_level,
|
|
ObCollationType &res_type)
|
|
{
|
|
int ret = OB_SUCCESS;
|
|
if (OB_UNLIKELY(
|
|
CS_LEVEL_INVALID == collation_level1
|
|
|| CS_LEVEL_INVALID == collation_level2
|
|
|| !is_valid_collation(collation_type1)
|
|
|| !is_valid_collation(collation_type2))) {
|
|
ret = OB_ERR_UNEXPECTED;
|
|
LOG_WARN ("invalid collation level or type",
|
|
K(ret), K(collation_level1), K(collation_type1), K(collation_level2), K(collation_type2));
|
|
} else {
|
|
/** 先比较level,level小的优先级大,使用相应的结果。
|
|
* 如果优先级相同,binary和string比较,统一用binary比较
|
|
* 如果都是string,按照规则进行处理
|
|
*/
|
|
ObCharsetType cs1 = charset_type_by_coll(collation_type1);
|
|
ObCharsetType cs2 = charset_type_by_coll(collation_type2);
|
|
if (collation_level1 < collation_level2) {
|
|
res_type = collation_type1;
|
|
res_level = collation_level1;
|
|
} else if (collation_level2 < collation_level1) {
|
|
res_type = collation_type2;
|
|
res_level = collation_level2;
|
|
} else if (CS_TYPE_BINARY == collation_type1) {
|
|
res_level = collation_level1;
|
|
res_type = collation_type1;
|
|
} else if (CS_TYPE_BINARY == collation_type2) {
|
|
res_level = collation_level2;
|
|
res_type = collation_type2;
|
|
} else if (cs1 != cs2) {
|
|
/**
|
|
* 左右字符集不相同的情况
|
|
* 主要以下情况
|
|
* utf8mb4和utf16:使用utf16
|
|
* utf8mb4和gbk:使用utf8mb4
|
|
* utf16和gbk:使用utf16
|
|
* utf8mb4和gb18030:使用utf8mb4
|
|
* utf16和gb18030:使用utf16
|
|
* gbk和gb18030:使用gb18030
|
|
* gb18030_2022 与 gb18030 的 AGGREGATE 暂定禁止
|
|
* 以上任一字符集X与latin1的组合结果都为X,latin1目前地位最低
|
|
*/
|
|
|
|
int res = AGGREGATE_2CHARSET[cs1][cs2];
|
|
if (res == 1) {
|
|
res_type = collation_type1;
|
|
res_level = collation_level1;
|
|
} else if (res == 2) {
|
|
res_type = collation_type2;
|
|
res_level = collation_level2;
|
|
} else {
|
|
// 所有不能转换的情况都到这里
|
|
ret = OB_CANT_AGGREGATE_2COLLATIONS;
|
|
}
|
|
} else {
|
|
//处理相同字符集的情况,每种字符集单独考虑
|
|
if (collation_type1 == collation_type2) {
|
|
res_type = collation_type1;
|
|
res_level = collation_level1;
|
|
} else if (CS_LEVEL_EXPLICIT == collation_level1) {
|
|
ret = OB_CANT_AGGREGATE_2COLLATIONS;
|
|
// ERROR 1267 (HY000): Illegal mix of collations (utf8_general_ci,EXPLICIT) and (utf8_bin,EXPLICIT) for operation '='
|
|
// LOG_USER_ERROR(ret);
|
|
} else if (charset_type_by_coll(collation_type1) == CHARSET_UTF8MB4) {
|
|
if (collation_type1 == CS_TYPE_UTF8MB4_BIN || collation_type2 == CS_TYPE_UTF8MB4_BIN) {
|
|
res_type = CS_TYPE_UTF8MB4_BIN;
|
|
res_level = (CS_TYPE_UTF8MB4_BIN == collation_type1) ? collation_level1 : collation_level2;
|
|
} else {
|
|
// utf8mb4_unicode_ci和utf8mb4_general_ci的情况报错,和mysql兼容
|
|
ret = OB_CANT_AGGREGATE_2COLLATIONS;
|
|
}
|
|
} else if (charset_type_by_coll(collation_type1) == CHARSET_GBK) {
|
|
res_type = CS_TYPE_GBK_BIN;
|
|
res_level = (CS_TYPE_GBK_BIN == collation_type1) ? collation_level1 : collation_level2;
|
|
} else if (charset_type_by_coll(collation_type1) == CHARSET_UTF16) {
|
|
if (collation_type1 == CS_TYPE_UTF16_BIN || collation_type2 == CS_TYPE_UTF16_BIN) {
|
|
res_type = CS_TYPE_UTF16_BIN;
|
|
res_level = (CS_TYPE_UTF16_BIN == collation_type1) ? collation_level1 : collation_level2;
|
|
} else {
|
|
// utf16_unicode_ci和utf16_general_ci直接报错,不应该出现这种情况
|
|
ret = OB_CANT_AGGREGATE_2COLLATIONS;
|
|
}
|
|
} else if (charset_type_by_coll(collation_type1) == CHARSET_GB18030) {
|
|
res_type = CS_TYPE_GB18030_BIN;
|
|
res_level = (CS_TYPE_GB18030_BIN == collation_type1) ? collation_level1 : collation_level2;
|
|
} else if (charset_type_by_coll(collation_type1) == CHARSET_LATIN1) {
|
|
if (collation_type1 == CS_TYPE_LATIN1_BIN || collation_type2 == CS_TYPE_LATIN1_BIN) {
|
|
res_type = CS_TYPE_LATIN1_BIN;
|
|
res_level = (CS_TYPE_LATIN1_BIN == collation_type1) ? collation_level1 : collation_level2;
|
|
} else {
|
|
//未来可能支持latin1_german,与latin1_swedish不兼容
|
|
ret = OB_CANT_AGGREGATE_2COLLATIONS;
|
|
}
|
|
} else if (charset_type_by_coll(collation_type1) == CHARSET_GB18030_2022) {
|
|
res_type = CS_TYPE_GB18030_2022_BIN;
|
|
res_level = (CS_TYPE_GB18030_2022_BIN == collation_type1) ? collation_level1 : collation_level2;
|
|
} else {
|
|
ret = OB_ERR_UNEXPECTED;
|
|
LOG_ERROR("Unexpected charset", K(collation_type1), K(collation_type2), KCSTRING(lbt()));
|
|
}
|
|
}
|
|
|
|
if (OB_SUCC(ret)) {
|
|
ObCharsetType res_cs = charset_type_by_coll(res_type);
|
|
if (CHARSET_GB18030 == res_cs) {
|
|
if (CHARSET_GB18030_2022 == cs1 || CHARSET_GB18030_2022 == cs2) {
|
|
ret = OB_CANT_AGGREGATE_2COLLATIONS;
|
|
}
|
|
} else if (CHARSET_GB18030_2022 == res_cs) {
|
|
if (CHARSET_GB18030 == cs1 || CHARSET_GB18030 == cs2) {
|
|
ret = OB_CANT_AGGREGATE_2COLLATIONS;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (OB_FAIL(ret)) {
|
|
LOG_WARN("Illegal mix of collations", K(ret),
|
|
"type1", ObCharset::collation_name(collation_type1),
|
|
"level1", ObCharset::collation_level(collation_level1),
|
|
"type2", ObCharset::collation_name(collation_type2),
|
|
"level2", ObCharset::collation_level(collation_level2));
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
// Tells whether the collation has the OB_CS_BINSORT state flag set.
// An out-of-range or unregistered collation is logged as an error and
// reported as false.
bool ObCharset::is_bin_sort(ObCollationType collation_type)
{
  bool ret = false;
  const bool type_in_range = (CS_TYPE_INVALID < collation_type) && (collation_type < CS_TYPE_MAX);
  // the table is only indexed once the range check has passed
  if (OB_LIKELY(type_in_range) && NULL != ObCharset::charset_arr[collation_type]) {
    ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
    ret = ((cs->state & OB_CS_BINSORT) != 0);
  } else {
    LOG_ERROR("unexpected error. invalid argument(s)",
              K(ret), K(collation_type));
  }
  return ret;
}
|
|
|
|
ObCharsetType ObCharset::default_charset_type_ = CHARSET_UTF8MB4;
|
|
ObCollationType ObCharset::default_collation_type_ = CS_TYPE_UTF8MB4_GENERAL_CI;
|
|
|
|
// Returns the current value of the static default charset member.
ObCharsetType ObCharset::get_default_charset()
{
  const ObCharsetType current_default = ObCharset::default_charset_type_;
  return current_default;
}
|
|
|
|
// Returns the default collation of a charset, e.g. utf8mb4_general_ci for
// utf8mb4.
//
// @param charset_type  charset to look up
// @return the default collation, or CS_TYPE_INVALID for a charset that is not
//         listed below
ObCollationType ObCharset::get_default_collation(ObCharsetType charset_type)
{
  switch (charset_type) {
    case CHARSET_UTF8MB4:
      return CS_TYPE_UTF8MB4_GENERAL_CI;
    case CHARSET_BINARY:
      return CS_TYPE_BINARY;
    case CHARSET_GBK:
      return CS_TYPE_GBK_CHINESE_CI;
    case CHARSET_UTF16:
      return CS_TYPE_UTF16_GENERAL_CI;
    case CHARSET_GB18030:
      return CS_TYPE_GB18030_CHINESE_CI;
    case CHARSET_LATIN1:
      return CS_TYPE_LATIN1_SWEDISH_CI;
    case CHARSET_GB18030_2022:
      return CS_TYPE_GB18030_2022_PINYIN_CI;
    default:
      return CS_TYPE_INVALID;
  }
}
|
|
|
|
// Returns the default collation of a charset for the given compatibility mode:
// the Oracle-mode default when is_oracle_mode is true, otherwise the default
// from get_default_collation(ObCharsetType).
ObCollationType ObCharset::get_default_collation_by_mode(ObCharsetType charset_type,
                                                         bool is_oracle_mode)
{
  ObCollationType default_coll = CS_TYPE_INVALID;
  if (is_oracle_mode) {
    default_coll = get_default_collation_oracle(charset_type);
  } else {
    default_coll = get_default_collation(charset_type);
  }
  return default_coll;
}
|
|
|
|
// Returns the Oracle-mode default collation of a charset. For every charset
// listed here this is its *_BIN collation (CS_TYPE_BINARY for binary).
//
// @param charset_type  charset to look up
// @return the default collation, or CS_TYPE_INVALID for an unlisted charset
ObCollationType ObCharset::get_default_collation_oracle(ObCharsetType charset_type)
{
  ObCollationType collation_type = CS_TYPE_INVALID;
  if (CHARSET_UTF8MB4 == charset_type) {
    collation_type = CS_TYPE_UTF8MB4_BIN;
  } else if (CHARSET_BINARY == charset_type) {
    collation_type = CS_TYPE_BINARY;
  } else if (CHARSET_GBK == charset_type) {
    collation_type = CS_TYPE_GBK_BIN;
  } else if (CHARSET_UTF16 == charset_type) {
    collation_type = CS_TYPE_UTF16_BIN;
  } else if (CHARSET_GB18030 == charset_type) {
    collation_type = CS_TYPE_GB18030_BIN;
  } else if (CHARSET_LATIN1 == charset_type) {
    collation_type = CS_TYPE_LATIN1_BIN;
  } else if (CHARSET_GB18030_2022 == charset_type) {
    collation_type = CS_TYPE_GB18030_2022_BIN;
  } else {
    // unlisted charset: keep CS_TYPE_INVALID
  }
  return collation_type;
}
|
|
|
|
// Error-reporting variant of the default-collation lookup.
//
// @param charset_type    charset to look up
// @param collation_type  [out] default collation; left untouched on failure
// @return OB_SUCCESS, or OB_INVALID_ARGUMENT for a charset not listed below
int ObCharset::get_default_collation(ObCharsetType charset_type, ObCollationType &collation_type)
{
  int ret = OB_SUCCESS;
  if (CHARSET_UTF8MB4 == charset_type) {
    collation_type = CS_TYPE_UTF8MB4_GENERAL_CI;
  } else if (CHARSET_BINARY == charset_type) {
    collation_type = CS_TYPE_BINARY;
  } else if (CHARSET_GBK == charset_type) {
    collation_type = CS_TYPE_GBK_CHINESE_CI;
  } else if (CHARSET_UTF16 == charset_type) {
    collation_type = CS_TYPE_UTF16_GENERAL_CI;
  } else if (CHARSET_GB18030 == charset_type) {
    collation_type = CS_TYPE_GB18030_CHINESE_CI;
  } else if (CHARSET_LATIN1 == charset_type) {
    collation_type = CS_TYPE_LATIN1_SWEDISH_CI;
  } else if (CHARSET_GB18030_2022 == charset_type) {
    collation_type = CS_TYPE_GB18030_2022_PINYIN_CI;
  } else {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid charset type", K(ret), K(charset_type));
  }
  return ret;
}
|
|
|
|
// Returns the binary collation belonging to a charset.
//
// @param charset_type  charset to look up
// @return the *_BIN collation (CS_TYPE_BINARY for the binary charset), or
//         CS_TYPE_INVALID for a charset not listed below
ObCollationType ObCharset::get_bin_collation(ObCharsetType charset_type)
{
  ObCollationType bin_coll = CS_TYPE_INVALID;
  switch (charset_type) {
    case CHARSET_UTF8MB4:      bin_coll = CS_TYPE_UTF8MB4_BIN;      break;
    case CHARSET_BINARY:       bin_coll = CS_TYPE_BINARY;           break;
    case CHARSET_GBK:          bin_coll = CS_TYPE_GBK_BIN;          break;
    case CHARSET_UTF16:        bin_coll = CS_TYPE_UTF16_BIN;        break;
    case CHARSET_GB18030:      bin_coll = CS_TYPE_GB18030_BIN;      break;
    case CHARSET_LATIN1:       bin_coll = CS_TYPE_LATIN1_BIN;       break;
    case CHARSET_GB18030_2022: bin_coll = CS_TYPE_GB18030_2022_BIN; break;
    default:                   /* keep CS_TYPE_INVALID */           break;
  }
  return bin_coll;
}
|
|
|
|
int ObCharset::get_default_collation(const ObCollationType &in, ObCollationType &out)
|
|
{
|
|
int ret = OB_SUCCESS;
|
|
ObCharsetType charset_type = CHARSET_INVALID;
|
|
if (OB_UNLIKELY(in == CS_TYPE_INVALID)) {
|
|
ret = OB_ERR_UNEXPECTED;
|
|
} else if (OB_UNLIKELY(CHARSET_INVALID == (charset_type = ObCharset::charset_type_by_coll(in)))) {
|
|
ret = OB_ERR_UNEXPECTED;
|
|
} else if (OB_UNLIKELY(CS_TYPE_INVALID == (out = (lib::is_mysql_mode() ?
|
|
ObCharset::get_default_collation(charset_type)
|
|
: ObCharset::get_default_collation_oracle(charset_type))))) {
|
|
ret = OB_ERR_UNEXPECTED;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
// Returns the fixed system collation, utf8mb4_general_ci.
ObCollationType ObCharset::get_system_collation()
{
  const ObCollationType system_collation = CS_TYPE_UTF8MB4_GENERAL_CI;
  return system_collation;
}
|
|
|
|
int ObCharset::first_valid_char(
|
|
const ObCollationType collation_type,
|
|
const char *buf,
|
|
const int64_t buf_size,
|
|
int64_t &char_len)
|
|
{
|
|
int ret = OB_SUCCESS;
|
|
if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
|
|
collation_type >= CS_TYPE_MAX) ||
|
|
OB_ISNULL(ObCharset::charset_arr[collation_type])) {
|
|
ret = OB_ERR_UNEXPECTED;
|
|
LOG_ERROR("unexpected error. invalid argument(s)",
|
|
K(ret), K(collation_type));
|
|
} else if (OB_UNLIKELY(NULL == buf)) {
|
|
ret = OB_NOT_INIT;
|
|
LOG_ERROR("Null buffer passed in", K(ret), KP(buf));
|
|
} else if (buf_size <= 0) {
|
|
char_len = 0;
|
|
} else {
|
|
int error = 0;
|
|
int64_t len = 0;
|
|
ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
|
|
if (OB_ISNULL(cs->cset)) {
|
|
ret = OB_ERR_UNEXPECTED;
|
|
LOG_ERROR("unexpected error. invalid argument(s)", K(cs), K(cs->cset));
|
|
} else {
|
|
len = static_cast<int64_t>(cs->cset->well_formed_len(cs, buf, buf + buf_size, 1, &error));
|
|
if (OB_LIKELY(0 == error)) {
|
|
char_len = len;
|
|
} else {
|
|
ret = OB_INVALID_ARGUMENT;
|
|
LOG_WARN("invalid encoding found");
|
|
}
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
int ObCharset::last_valid_char(
|
|
const ObCollationType collation_type,
|
|
const char *buf,
|
|
const int64_t buf_size,
|
|
int64_t &char_len)
|
|
{
|
|
int ret = OB_SUCCESS;
|
|
ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
|
|
|
|
if (OB_ISNULL(cs)) {
|
|
ret = OB_INVALID_ARGUMENT;
|
|
LOG_WARN("collation type is invalid", K(collation_type), K(ret));
|
|
} else {
|
|
if (buf_size <= 0 || OB_ISNULL(buf)) {
|
|
char_len = 0;
|
|
} else {
|
|
int64_t len = 0;
|
|
for (len = cs->mbminlen; len <= cs->mbmaxlen; ++len) {
|
|
int error = 0;
|
|
int real_len =
|
|
cs->cset->well_formed_len(cs, buf + buf_size - len, buf + buf_size, len, &error);
|
|
if (0 == error && real_len == len) {
|
|
char_len = len;
|
|
break;
|
|
}
|
|
}
|
|
if (len > cs->mbmaxlen) {
|
|
ret = OB_INVALID_ARGUMENT;
|
|
LOG_WARN("invalid encoding found", K(ret), "str", ObString(buf_size, buf));
|
|
}
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
// Completes or validates a (charset, collation) pair.
//
//  - both invalid        -> OB_ERR_UNEXPECTED
//  - only charset given  -> collation is filled with the charset's default
//  - only collation given-> charset is derived from the collation
//  - both given          -> they must match, else OB_ERR_COLLATION_MISMATCH
//
// @param charset_type    [in,out] charset, filled in when invalid on entry
// @param collation_type  [in,out] collation, filled in when invalid on entry
int ObCharset::check_and_fill_info(ObCharsetType &charset_type, ObCollationType &collation_type)
{
  int ret = OB_SUCCESS;
  const bool charset_missing = (CHARSET_INVALID == charset_type);
  const bool collation_missing = (CS_TYPE_INVALID == collation_type);
  if (charset_missing && collation_missing) {
    ret = OB_ERR_UNEXPECTED;
  } else if (charset_missing) {
    charset_type = ObCharset::charset_type_by_coll(collation_type);
  } else if (collation_missing) {
    collation_type = ObCharset::get_default_collation(charset_type);
  } else if (!ObCharset::is_valid_collation(charset_type, collation_type)) {
    ret = OB_ERR_COLLATION_MISMATCH;
    LOG_WARN("invalid collation info", K(charset_type), K(collation_type));
  }
  return ret;
}
|
|
|
|
// Returns true when collation_type is one of the collations listed below
// (utf8mb4_general_ci, gbk_chinese_ci, utf16_general_ci, gb18030_chinese_ci,
// latin1_swedish_ci, gb18030_2022_pinyin_ci, binary), i.e. the values that
// get_default_collation(ObCharsetType) hands out.
bool ObCharset::is_default_collation(ObCollationType collation_type)
{
  const bool ret = (CS_TYPE_UTF8MB4_GENERAL_CI == collation_type)
                   || (CS_TYPE_GBK_CHINESE_CI == collation_type)
                   || (CS_TYPE_UTF16_GENERAL_CI == collation_type)
                   || (CS_TYPE_GB18030_CHINESE_CI == collation_type)
                   || (CS_TYPE_LATIN1_SWEDISH_CI == collation_type)
                   || (CS_TYPE_GB18030_2022_PINYIN_CI == collation_type)
                   || (CS_TYPE_BINARY == collation_type);
  return ret;
}
|
|
|
|
|
|
// Returns true when collation_type is the default collation of charset_type.
// A charset without a default collation never matches.
bool ObCharset::is_default_collation(ObCharsetType charset_type, ObCollationType collation_type)
{
  const ObCollationType default_collation_type = get_default_collation(charset_type);
  const bool has_default = (CS_TYPE_INVALID != default_collation_type);
  return has_default && (default_collation_type == collation_type);
}
|
|
|
|
int ObCharset::strcmp(const ObCollationType collation_type, const ObString &l_str,
|
|
const ObString &r_str)
|
|
{
|
|
int32_t ret = 0;
|
|
if (l_str.empty()) {
|
|
if (!r_str.empty()) {
|
|
ret = -1;
|
|
}
|
|
} else if (r_str.empty()) {
|
|
ret = 1;
|
|
} else {
|
|
ret = ObCharset::strcmp(collation_type, l_str.ptr(), l_str.length(), r_str.ptr(), r_str.length());
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
size_t ObCharset::casedn(const ObCollationType collation_type, ObString &src)
|
|
{
|
|
size_t size = 0;
|
|
if (!src.empty()) {
|
|
size = casedn(collation_type, src.ptr(), src.length(), src.ptr(), src.length());
|
|
src.set_length(static_cast<int32_t>(size));
|
|
}
|
|
return size;
|
|
}
|
|
|
|
size_t ObCharset::caseup(const ObCollationType collation_type, ObString &src)
|
|
{
|
|
size_t size = 0;
|
|
if (!src.empty()) {
|
|
size = caseup(collation_type, src.ptr(), src.length(), src.ptr(), src.length());
|
|
src.set_length(static_cast<int32_t>(size));
|
|
}
|
|
return size;
|
|
}
|
|
|
|
int ObCharset::toupper(const ObCollationType collation_type,
|
|
const ObString &src, ObString &dst,
|
|
ObIAllocator &allocator)
|
|
{
|
|
int ret = OB_SUCCESS;
|
|
const ObCharsetInfo *cs_info = NULL;
|
|
if (OB_ISNULL(cs_info = get_charset(collation_type))) {
|
|
ret = OB_INVALID_ARGUMENT;
|
|
LOG_WARN("invalid collation type", K(ret), K(collation_type));
|
|
} else {
|
|
int casemulti = cs_info->caseup_multiply;
|
|
if (1 == casemulti) {
|
|
if (OB_FAIL(ob_write_string(allocator, src, dst))) {
|
|
LOG_WARN("fail to copy string", K(ret), K(src));
|
|
} else {
|
|
size_t size = cs_info->cset->caseup(cs_info, dst.ptr(), dst.length(), dst.ptr(), dst.length());
|
|
dst.assign_ptr(dst.ptr(), static_cast<ObString::obstr_size_t>(size));
|
|
}
|
|
} else {
|
|
char *buf = NULL;
|
|
int64_t buf_len = src.length() * casemulti;
|
|
if (OB_ISNULL(buf = static_cast<char*>(allocator.alloc(buf_len)))) {
|
|
ret = OB_ALLOCATE_MEMORY_FAILED;
|
|
LOG_WARN("fail to alloc memory", K(ret));
|
|
} else {
|
|
size_t size = cs_info->cset->caseup(cs_info, const_cast<char*>(src.ptr()), src.length(), buf, buf_len);
|
|
dst.assign_ptr(buf, static_cast<ObString::obstr_size_t>(size));
|
|
}
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
|
|
int ObCharset::tolower(const ObCollationType collation_type,
|
|
const ObString &src, ObString &dst,
|
|
ObIAllocator &allocator)
|
|
{
|
|
int ret = OB_SUCCESS;
|
|
const ObCharsetInfo *cs_info = NULL;
|
|
if (OB_ISNULL(cs_info = get_charset(collation_type))) {
|
|
ret = OB_INVALID_ARGUMENT;
|
|
LOG_WARN("invalid collation type", K(ret), K(collation_type));
|
|
} else {
|
|
int casemulti = cs_info->casedn_multiply;
|
|
if (1 == casemulti) {
|
|
if (OB_FAIL(ob_write_string(allocator, src, dst))) {
|
|
LOG_WARN("fail to copy string", K(ret), K(src));
|
|
} else {
|
|
size_t size = cs_info->cset->casedn(cs_info, dst.ptr(), dst.length(), dst.ptr(), dst.length());
|
|
dst.assign_ptr(dst.ptr(), static_cast<ObString::obstr_size_t>(size));
|
|
}
|
|
} else {
|
|
char *buf = NULL;
|
|
int64_t buf_len = src.length() * casemulti;
|
|
if (OB_ISNULL(buf = static_cast<char*>(allocator.alloc(buf_len)))) {
|
|
ret = OB_ALLOCATE_MEMORY_FAILED;
|
|
LOG_WARN("fail to alloc memory", K(ret));
|
|
} else {
|
|
size_t size = cs_info->cset->casedn(cs_info, const_cast<char*>(src.ptr()), src.length(), buf, buf_len);
|
|
dst.assign_ptr(buf, static_cast<ObString::obstr_size_t>(size));
|
|
}
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
|
|
bool ObCharset::case_insensitive_equal(const ObString &one,
|
|
const ObString &another,
|
|
const ObCollationType &collation_type) {
|
|
return 0 == strcmp(collation_type, one, another);
|
|
}
|
|
|
|
bool ObCharset::case_sensitive_equal(const ObString &one, const ObString &another)
|
|
{
|
|
return 0 == strcmp(CS_TYPE_UTF8MB4_BIN, one, another);
|
|
}
|
|
|
|
//当租户模式为mysql时,不敏感匹配,租户模式为oracle时,敏感匹配
|
|
bool ObCharset::case_compat_mode_equal(const ObString &one, const ObString &another)
|
|
{
|
|
return lib::is_oracle_mode() ?
|
|
case_sensitive_equal(one, another) :
|
|
case_insensitive_equal(one, another);
|
|
}
|
|
/* for db objects' name use, like column names, table names; on oracle mode, trailing spaces are always part of the hash calc
|
|
* although trailing spaces are not allowed in db object's name, "a" and "a " are two different names in Oracle
|
|
* if you want to use this hash fun in other places, please contact @maoli */
|
|
uint64_t ObCharset::hash(const ObCollationType collation_type, const ObString &str,
|
|
uint64_t seed, hash_algo hash_algo)
|
|
{
|
|
uint64_t ret = 0;
|
|
if (!str.empty()) {
|
|
ret = ObCharset::hash(collation_type, str.ptr(), str.length(),
|
|
seed, lib::is_oracle_mode(), hash_algo);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
/* for db objects' name use, like column names, table names; on oracle mode, trailing spaces are always part of the hash calc
|
|
* although trailing spaces are not allowed in db object's name, "a" and "a " are two different names in Oracle
|
|
* if you want to use this hash fun in other places, please contact @xiaofeng.lby */
|
|
uint64_t ObCharset::hash(
|
|
const ObCollationType collation_type, const ObString &str,
|
|
uint64_t seed, const bool calc_end_space, hash_algo hash_algo)
|
|
{
|
|
uint64_t ret = 0;
|
|
if (!str.empty()) {
|
|
ret = ObCharset::hash(collation_type, str.ptr(), str.length(),
|
|
seed, calc_end_space, hash_algo);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
bool ObCharset::case_mode_equal(const ObNameCaseMode case_mode, const ObString &one,
|
|
const ObString &another)
|
|
{
|
|
bool is_equal = false;
|
|
if (OB_UNLIKELY(OB_NAME_CASE_INVALID >= case_mode ||
|
|
case_mode >= OB_NAME_CASE_MAX)) {
|
|
LOG_ERROR_RET(OB_INVALID_ARGUMENT, "unexpected error. invalid cast_mode",
|
|
K(case_mode));
|
|
} else {
|
|
ObCollationType collation_type = CS_TYPE_INVALID;
|
|
if (OB_ORIGIN_AND_SENSITIVE == case_mode) {
|
|
collation_type = CS_TYPE_UTF8MB4_BIN;
|
|
} else if (OB_ORIGIN_AND_INSENSITIVE == case_mode ||
|
|
OB_LOWERCASE_AND_INSENSITIVE == case_mode) {
|
|
collation_type = CS_TYPE_UTF8MB4_GENERAL_CI;
|
|
}
|
|
|
|
if (0 == strcmp(collation_type, one, another)) {
|
|
is_equal = true;
|
|
}
|
|
}
|
|
return is_equal;
|
|
}
|
|
|
|
bool ObCharset::is_space(const ObCollationType collation_type, char c)
|
|
{
|
|
bool ret = false;
|
|
if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
|
|
collation_type >= CS_TYPE_MAX) ||
|
|
OB_ISNULL(ObCharset::charset_arr[collation_type])) {
|
|
LOG_ERROR("unexpected error. invalid argument(s)",
|
|
K(ret), K(collation_type));
|
|
} else {
|
|
ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
|
|
ret = ob_isspace(cs, c);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
bool ObCharset::is_graph(const ObCollationType collation_type, char c)
|
|
{
|
|
bool ret = false;
|
|
if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
|
|
collation_type >= CS_TYPE_MAX) ||
|
|
OB_ISNULL(ObCharset::charset_arr[collation_type])) {
|
|
LOG_ERROR("unexpected error. invalid argument(s)",
|
|
K(ret), K(collation_type));
|
|
} else {
|
|
ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
|
|
ret = ob_isgraph(cs, c);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
bool ObCharset::usemb(const ObCollationType collation_type)
|
|
{
|
|
bool ret = false;
|
|
if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
|
|
collation_type >= CS_TYPE_MAX) ||
|
|
OB_ISNULL(ObCharset::charset_arr[collation_type])) {
|
|
LOG_ERROR("unexpected error. invalid argument(s)",
|
|
K(ret), K(collation_type));
|
|
} else {
|
|
ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
|
|
ret = use_mb(cs);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
int ObCharset::is_mbchar(const ObCollationType collation_type, const char *str, const char *end)
|
|
{
|
|
bool ret = false;
|
|
if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
|
|
collation_type >= CS_TYPE_MAX) ||
|
|
OB_ISNULL(ObCharset::charset_arr[collation_type])) {
|
|
LOG_ERROR("unexpected error. invalid argument(s)",
|
|
K(ret), K(collation_type));
|
|
} else {
|
|
ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
|
|
ret = ob_ismbchar(cs, str, end);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
// Looks up the charset descriptor registered for a collation.
//
// @return the entry of charset_arr for collation_type (which may itself be
//         NULL), or NULL after logging an error when collation_type is out
//         of range
const ObCharsetInfo *ObCharset::get_charset(const ObCollationType collation_type)
{
  ObCharsetInfo *ret = NULL;
  if (OB_LIKELY(collation_type > CS_TYPE_INVALID && collation_type < CS_TYPE_MAX)) {
    ret = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
  } else {
    LOG_ERROR_RET(OB_INVALID_ARGUMENT, "unexpected error. invalid argument(s)",
                  K(collation_type));
  }
  return ret;
}
|
|
|
|
int ObCharset::get_mbmaxlen_by_coll(const ObCollationType collation_type, int64_t &mbmaxlen)
|
|
{
|
|
int ret = OB_SUCCESS;
|
|
if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
|
|
collation_type >= CS_TYPE_MAX) ||
|
|
OB_ISNULL(ObCharset::charset_arr[collation_type])) {
|
|
ret = OB_ERR_UNEXPECTED;
|
|
LOG_ERROR("unexpected error. invalid argument(s)",
|
|
K(ret), K(collation_type));
|
|
} else {
|
|
ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
|
|
mbmaxlen = cs->mbmaxlen;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
int ObCharset::get_mbminlen_by_coll(const ObCollationType collation_type, int64_t &mbminlen)
|
|
{
|
|
int ret = OB_SUCCESS;
|
|
if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
|
|
collation_type >= CS_TYPE_MAX) ||
|
|
OB_ISNULL(ObCharset::charset_arr[collation_type])) {
|
|
ret = OB_ERR_UNEXPECTED;
|
|
LOG_ERROR("unexpected error. invalid argument(s)",
|
|
K(ret), K(collation_type));
|
|
} else {
|
|
ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
|
|
mbminlen = cs->mbminlen;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
/*in order to prevent a char from be splitted into 2 blocks
|
|
We have to get the right bound of a string in terms a block
|
|
Take "我爱你" as an example
|
|
if len_limit_in_byte = 8 which means that the max size of a block is 8 Bytes
|
|
since '我' and '爱' takes 6 Bytes in total already.
|
|
and '你' takes 3 Bytes.
|
|
if we assign the '你' to the block
|
|
then the total length will be 9 which is greater than 8
|
|
so , byte_num = 6 and char_num = 2 will be returned.
|
|
and '你' has to be assigned to another block.
|
|
|
|
Please note that:
|
|
|
|
byte_num and char_num should not be used if the status returned by this func is not ob_success!
|
|
|
|
*/
|
|
|
|
int ObCharset::fit_string(const ObCollationType collation_type,
|
|
const char *str,
|
|
const int64_t str_len,
|
|
const int64_t len_limit_in_byte,
|
|
int64_t &byte_num,
|
|
int64_t &char_num)
|
|
{
|
|
int ret = OB_SUCCESS;
|
|
if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID ||
|
|
collation_type >= CS_TYPE_MAX) ||
|
|
len_limit_in_byte <= 0 ||
|
|
str_len <= 0 ||
|
|
OB_ISNULL(str) ||
|
|
OB_ISNULL(ObCharset::charset_arr[collation_type])) {
|
|
ret = OB_ERR_UNEXPECTED;
|
|
LOG_ERROR("unexpected error. invalid argument(s)",
|
|
K(collation_type), KP(str), K(str_len), K(len_limit_in_byte));
|
|
} else {
|
|
ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
|
|
byte_num = 0;
|
|
char_num = 0;
|
|
int64_t max_len = std::min(str_len, len_limit_in_byte);
|
|
const char *buf_start = str;
|
|
const char *buf_end = str + str_len;
|
|
int64_t char_len = 0;
|
|
int error = 0;
|
|
while(buf_start < buf_end) {
|
|
char_len = static_cast<int64_t>(cs->cset->well_formed_len(cs, buf_start, buf_end, 1, &error));
|
|
if (OB_UNLIKELY(0 != error || char_len <= 0)) {
|
|
ret = OB_INVALID_ARGUMENT;
|
|
break;
|
|
} else if (OB_UNLIKELY(byte_num > max_len - char_len)) {
|
|
break;
|
|
} else {
|
|
byte_num += char_len;
|
|
buf_start += char_len;
|
|
++char_num;
|
|
}
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
inline bool ObCharset::is_argument_valid(const ObCharsetInfo *cs, const char *str, int64_t str_len)
|
|
{
|
|
//the unexpected case is str is null while str_len is not zero at the same time
|
|
//Yeah, this is obvious. But... Wait a second !
|
|
//What if str is null and str_len is zero which means empty string?
|
|
//Do not worry at all. the routine called (like cs->cset->xxxx) will deal with this
|
|
bool is_arg_valid = true;
|
|
if (lib::is_diagnose_info_enabled()) {
|
|
if ((OB_ISNULL(str) && OB_UNLIKELY(0 != str_len)) ||
|
|
OB_UNLIKELY(str_len < 0) ||
|
|
OB_ISNULL(cs) ||
|
|
OB_ISNULL(cs->cset)) {
|
|
is_arg_valid = false;
|
|
const ObFatalErrExtraInfoGuard *extra_info = ObFatalErrExtraInfoGuard::get_thd_local_val_ptr();
|
|
BACKTRACE_RET(ERROR, OB_INVALID_ARGUMENT, true, "invalid argument. charset info = %p, str = %p, str_len = %ld, extra_info=(%s)", cs, str, str_len, (NULL == extra_info) ? NULL : to_cstring(*extra_info));
|
|
}
|
|
}
|
|
return is_arg_valid;
|
|
}
|
|
inline bool ObCharset::is_argument_valid(const ObCollationType collation_type, const char *str1, int64_t str_len1, const char *str2, int64_t str_len2)
|
|
{
|
|
bool is_arg_valid = true;
|
|
if (lib::is_diagnose_info_enabled()) {
|
|
if (OB_UNLIKELY(collation_type <= CS_TYPE_INVALID || collation_type >= CS_TYPE_MAX) ||
|
|
OB_ISNULL(ObCharset::charset_arr[collation_type]) ||
|
|
OB_UNLIKELY(str_len1 < 0) ||
|
|
OB_UNLIKELY(str_len2 < 0) ||
|
|
(OB_ISNULL(str1) && OB_UNLIKELY(0 != str_len1)) ||
|
|
(OB_ISNULL(str2) && OB_UNLIKELY(0 != str_len2))) {
|
|
is_arg_valid = false;
|
|
const ObFatalErrExtraInfoGuard *extra_info = ObFatalErrExtraInfoGuard::get_thd_local_val_ptr();
|
|
BACKTRACE_RET(ERROR, OB_INVALID_ARGUMENT, true, "invalid argument."
|
|
"collation_type = %d,"
|
|
"str1 = %p,"
|
|
"str1_len = %ld,"
|
|
"str2 = %p,"
|
|
"str2_len = %ld,"
|
|
"extra_info=(%s)", collation_type, str1, str_len1, str2, str_len2,
|
|
(NULL == extra_info) ? NULL : to_cstring(*extra_info));
|
|
} else {
|
|
ObCharsetInfo *cs = static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type]);
|
|
if (OB_ISNULL(cs->cset) || OB_ISNULL(cs->coll)) {
|
|
is_arg_valid = false;
|
|
BACKTRACE_RET(ERROR, OB_INVALID_ARGUMENT, true, "invalid argument."
|
|
"collation_type = %d,"
|
|
"str1 = %p,"
|
|
"str1_len = %ld,"
|
|
"str2 = %p,"
|
|
"str2_len = %ld,"
|
|
"charset handler = %p,"
|
|
"collation handler = %p", collation_type, str1, str_len1, str2, str_len2, cs->cset, cs->coll);
|
|
}
|
|
}
|
|
}
|
|
return is_arg_valid;
|
|
}
|
|
|
|
int ObCharset::get_aggregate_len_unit(const ObCollationType collation_type, bool &len_in_byte)
|
|
{
|
|
int ret = OB_SUCCESS;
|
|
len_in_byte = false;
|
|
ObCharsetType res_charset = ObCharset::charset_type_by_coll(collation_type);
|
|
if (CHARSET_UTF8MB4 == res_charset
|
|
|| CHARSET_LATIN1 == res_charset
|
|
|| CHARSET_UTF16 == res_charset
|
|
|| CHARSET_GBK == res_charset
|
|
|| CHARSET_GB18030 == res_charset
|
|
|| CHARSET_GB18030_2022 == res_charset) {
|
|
len_in_byte = false;
|
|
} else if (CHARSET_BINARY == res_charset) {
|
|
len_in_byte = true;
|
|
} else {
|
|
ret = OB_ERR_UNEXPECTED;
|
|
LOG_WARN("unexpected charset", K(ret), K(res_charset), K(collation_type));
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
// Converts between character sets: from_type is the source character set, to_type is the target character set.
|
|
int ObCharset::charset_convert(const ObCollationType from_type,
|
|
const char *from_str,
|
|
const uint32_t from_len,
|
|
const ObCollationType to_type,
|
|
char *to_str,
|
|
int64_t to_len,
|
|
uint32_t &result_len,
|
|
bool trim_incomplete_tail,
|
|
bool report_error /*true*/,
|
|
const ob_wc_t replaced_char /*'?'*/) {
|
|
int ret = OB_SUCCESS;
|
|
if (NULL == from_str || from_len <=0) {
|
|
result_len = 0;
|
|
} else if (OB_UNLIKELY(from_type <= CS_TYPE_INVALID
|
|
|| from_type >= CS_TYPE_MAX
|
|
|| to_type <= CS_TYPE_INVALID
|
|
|| to_type >= CS_TYPE_MAX
|
|
|| (OB_ISNULL(to_str)
|
|
|| OB_UNLIKELY(to_len <= 0 || to_len > UINT32_MAX)))) {
|
|
ret = OB_INVALID_ARGUMENT;
|
|
LOG_WARN("invalid convert", K(ret), K(from_type), K(to_type),
|
|
K(ObString(from_len, from_str)), KP(to_str), K(from_len), K(to_len), KCSTRING(lbt()));
|
|
} else {
|
|
ObCharsetInfo *from_cs = static_cast<ObCharsetInfo*>(ObCharset::charset_arr[from_type]);
|
|
ObCharsetInfo *to_cs = static_cast<ObCharsetInfo*>(ObCharset::charset_arr[to_type]);
|
|
ObCharsetType src_cs = ObCharset::charset_type_by_coll(from_type);
|
|
ObCharsetType dst_cs = ObCharset::charset_type_by_coll(to_type);
|
|
if ((src_cs == CHARSET_GB18030 && dst_cs == CHARSET_GB18030_2022) ||
|
|
(src_cs == CHARSET_GB18030_2022 && dst_cs == CHARSET_GB18030)) {
|
|
/** GB18030 and GB18030_2022 have the same code points,
|
|
* but they have different mapping to unicode.
|
|
* So, we do charset_convert from the charset to the same charset*/
|
|
to_cs = from_cs;
|
|
}
|
|
if (OB_ISNULL(from_cs) || OB_ISNULL(to_cs)) {
|
|
ret = OB_ERR_UNEXPECTED;
|
|
LOG_WARN("unexpected collation type", K(ret), K(from_type), K(to_type));
|
|
} else {
|
|
uint errors = 0;
|
|
result_len = ob_convert(to_str, static_cast<uint32_t>(to_len), to_cs, from_str, from_len, from_cs,
|
|
trim_incomplete_tail, replaced_char, &errors);
|
|
if (OB_UNLIKELY(errors != 0 && report_error)) {
|
|
ret = OB_ERR_INCORRECT_STRING_VALUE;
|
|
LOG_WARN("ob_convert failed", K(ret), K(errors),
|
|
K(from_type), K(to_type),
|
|
"from_charset", from_cs->csname, "to_charset", to_cs->csname,
|
|
K(ObString(from_len, from_str)),
|
|
K(to_len), KPHEX(from_str, from_len));
|
|
}
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/**
 * Converts `in` from the charset of src_cs_type to the charset of dst_cs_type
 * and stores the result in `out`, allocating from `alloc` when a new buffer is
 * needed.
 *
 * Cases, in the order they are tested:
 *  - empty input, identical source/target charset, or binary target: `out`
 *    aliases `in`, or is a copy of it when COPY_STRING_ON_SAME_CHARSET is set
 *    in convert_flag;
 *  - binary source: the bytes are copied and left-padded with zero bytes so
 *    the length becomes a multiple of the target charset's mbminlen;
 *  - otherwise: the string is transcoded into a buffer of
 *    in.length() * mbmaxlen(dst) bytes. If that fails and
 *    REPLACE_UNKNOWN_CHARACTER is set in convert_flag, the input is converted
 *    one character at a time and every character that cannot be converted is
 *    replaced by the target charset's '?'.
 *
 * @param alloc         allocator for the result buffer
 * @param in            source string
 * @param src_cs_type   source collation
 * @param dst_cs_type   target collation
 * @param out           [out] converted string
 * @param convert_flag  bit set of COPY_STRING_ON_SAME_CHARSET /
 *                      REPLACE_UNKNOWN_CHARACTER
 * @param action_flag   [out, optional] gets REPLACE_UNKNOWN_CHARACTER or-ed in
 *                      when the replacement path was taken
 * @return OB_SUCCESS, OB_INVALID_ARGUMENT, OB_ALLOCATE_MEMORY_FAILED,
 *         OB_SIZE_OVERFLOW, or the error of the underlying conversion.
 */
int ObCharset::charset_convert(ObIAllocator &alloc,
                               const ObString &in,
                               const ObCollationType src_cs_type,
                               const ObCollationType dst_cs_type,
                               ObString &out,
                               int64_t convert_flag,
                               int64_t *action_flag)
{
  int ret = OB_SUCCESS;
  if (!is_valid_collation(src_cs_type) || !is_valid_collation(dst_cs_type)) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid collation type", K(ret), K(src_cs_type), K(dst_cs_type));
  } else {
    if (0 == in.length()
        || charset_type_by_coll(src_cs_type) == charset_type_by_coll(dst_cs_type)
        || charset_type_by_coll(dst_cs_type) == CHARSET_BINARY) {
      // No transcoding required: alias the input or hand back a copy.
      if (!(convert_flag & COPY_STRING_ON_SAME_CHARSET)) {
        out = in;
      } else {
        if (OB_FAIL(ob_write_string(alloc, in, out))) {
          LOG_WARN("fail to write string", K(ret), K(in));
        }
      }
    } else if (charset_type_by_coll(src_cs_type) == CHARSET_BINARY) {
      char *buf = nullptr;
      int32_t align_offset = 0;
      int32_t res_buf_len = 0;
      int mbminlen = ObCharset::get_charset(dst_cs_type)->mbminlen;
      // Number of leading zero bytes needed to reach a multiple of mbminlen.
      if (mbminlen > 0 && in.length() % mbminlen != 0) {
        align_offset = mbminlen - in.length() % mbminlen;
      }
      res_buf_len = in.length() + align_offset;
      if (OB_ISNULL(buf = static_cast<char*>(alloc.alloc(res_buf_len)))) {
        ret = OB_ALLOCATE_MEMORY_FAILED;
        out.reset();
        LOG_WARN("allocate memory failed", K(ret), K(in), K(align_offset));
      } else {
        MEMCPY(buf + align_offset, in.ptr(), in.length());
        MEMSET(buf, 0, align_offset);
        out.assign_ptr(buf, res_buf_len);
      }
    } else {
      int64_t maxmb_len = 0;
      if (OB_FAIL(ObCharset::get_mbmaxlen_by_coll(dst_cs_type, maxmb_len))) {
        LOG_WARN("failed to get mbmaxlen by coll", K(ret), K(dst_cs_type));
      } else {
        // Worst-case output size. Computed in 64 bits: storing the product in
        // a uint32_t would silently wrap for very large inputs and produce an
        // undersized buffer.
        const int64_t res_buf_len = static_cast<int64_t>(in.length()) * maxmb_len;
        uint32_t res_len = 0;
        char *res_buf = NULL;
        if (OB_UNLIKELY(res_buf_len > UINT32_MAX)) {
          // The buffer-based charset_convert overload rejects capacities above
          // UINT32_MAX, so fail before allocating anything.
          ret = OB_SIZE_OVERFLOW;
          LOG_WARN("result buffer size over flow", K(ret), K(res_buf_len), K(maxmb_len),
                   "in_len", in.length());
        } else if (OB_ISNULL(res_buf = static_cast<char *>(alloc.alloc(res_buf_len)))) {
          ret = OB_ALLOCATE_MEMORY_FAILED;
          LOG_WARN("alloc memory failed", K(ret));
        } else {
          if (OB_SUCC(charset_convert(src_cs_type, in.ptr(), in.length(),
                                      dst_cs_type, res_buf, res_buf_len, res_len))) {
            out.assign_ptr(res_buf, res_len);
          } else {
            //handle replace unknown character
            LOG_WARN("convert charset failed",
                     K(ret), K(in), K(src_cs_type), K(dst_cs_type),
                     KPHEX(in.ptr(), in.length()));
            if (!!(convert_flag & REPLACE_UNKNOWN_CHARACTER)) {
              if (OB_NOT_NULL(action_flag)) {
                *action_flag |= REPLACE_UNKNOWN_CHARACTER;
              }
              int32_t in_offset = 0;
              int64_t res_buf_offset = 0;
              ObString question_mark = ObCharsetUtils::get_const_str(dst_cs_type, '?');
              // Convert one source character per iteration; stop once there is
              // no longer room for a replacement mark in the output buffer.
              while (in_offset < in.length()
                     && res_buf_offset + question_mark.length() <= res_buf_len) {
                ret = OB_SUCCESS;
                // Byte length of the next source character.
                int64_t offset = ObCharset::charpos(src_cs_type, in.ptr() + in_offset,
                                                    in.length() - in_offset, 1, &ret);
                if (OB_SUCC(ret)) {
                  ret = ObCharset::charset_convert(src_cs_type, in.ptr() + in_offset, offset,
                                                   dst_cs_type, res_buf + res_buf_offset, res_buf_len - res_buf_offset, res_len);
                }
                in_offset += offset;
                if (OB_SUCCESS == ret) {
                  res_buf_offset += res_len;
                } else {
                  // Unconvertible character: emit the replacement mark instead.
                  MEMCPY(res_buf + res_buf_offset, question_mark.ptr(), question_mark.length());
                  res_buf_offset += question_mark.length();
                }
              }
              if (in_offset < in.length()) {
                // The loop ended because the output buffer filled up first.
                ret = OB_SIZE_OVERFLOW;
                LOG_WARN("buf size over flow", K(ret), K(in), KPHEX(in.ptr(), in.length()));
              } else {
                res_len = res_buf_offset;
                out.assign_ptr(res_buf, res_len);
                ret = OB_SUCCESS;
              }
            }
          }
        }
      }
    }
  }
  return ret;
}
|
|
|
|
/**
 * Builds a copy of `input` followed by pad_whitespace_length padding
 * characters (OB_PADDING_CHAR).
 *
 * For a UTF16 collation every padding character occupies two bytes: a zero
 * byte followed by OB_PADDING_CHAR; for all other collations it is one byte.
 *
 * @param allocator              allocator for the result buffer
 * @param coll_type              collation of `input`
 * @param input                  string to pad
 * @param pad_whitespace_length  number of padding characters, must be > 0
 * @param result                 [out] padded string; untouched on failure
 * @return OB_SUCCESS, OB_INVALID_ARGUMENT (non-positive pad length),
 *         OB_SIZE_OVERFLOW (padded length does not fit in int32_t) or
 *         OB_ALLOCATE_MEMORY_FAILED.
 */
int ObCharset::whitespace_padding(ObIAllocator &allocator,
                                  const ObCollationType coll_type,
                                  const ObString &input,
                                  const int64_t pad_whitespace_length,
                                  ObString &result)
{
  int ret = OB_SUCCESS;
  char *buf = NULL;
  const bool is_utf16 = charset_type_by_coll(coll_type) == CHARSET_UTF16;
  const int64_t bytes_per_pad = is_utf16 ? 2 : 1;
  if (OB_UNLIKELY(pad_whitespace_length <= 0)) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid len", K(ret), K(pad_whitespace_length));
  } else if (OB_UNLIKELY(pad_whitespace_length > (INT32_MAX - input.length()) / bytes_per_pad)) {
    // The total length is stored in an int32_t; reject values that would wrap
    // instead of allocating a buffer of the wrong size.
    ret = OB_SIZE_OVERFLOW;
    LOG_WARN("padded length overflow", K(ret), K(pad_whitespace_length),
             "input_len", input.length());
  } else {
    const int32_t buf_len =
        static_cast<int32_t>(input.length() + pad_whitespace_length * bytes_per_pad);
    if (OB_ISNULL(buf = static_cast<char*>(allocator.alloc(buf_len)))) {
      ret = OB_ALLOCATE_MEMORY_FAILED;
      LOG_WARN("no memory", K(ret), K(buf_len));
    } else {
      MEMMOVE(buf, input.ptr(), input.length());
      if (!is_utf16) {
        MEMSET(buf + input.length(), OB_PADDING_CHAR, pad_whitespace_length);
      } else {
        //UTF16 space is 0x0020
        for (int i = input.length(); i + 1 < buf_len; i+=2) {
          buf[i] = '\0';
          buf[i+1] = OB_PADDING_CHAR;
        }
        LOG_DEBUG("UTF16 padding", K(pad_whitespace_length), K(input));
      }
      result = ObString(buf_len, buf_len, buf);
    }
  }
  return ret;
}
|
|
|
|
/**
 * Tells whether the charset registered for collation_type has the
 * OB_CS_NONASCII bit set in its state.
 *
 * @param collation_type  index into ObCharset::charset_arr
 * @return true when the bit is set; false when it is clear or when the
 *         collation is out of range / has no registered charset (an error is
 *         logged in that case).
 */
bool ObCharset::is_cs_nonascii(ObCollationType collation_type)
{
  bool is_cs_nonascii = false;
  const bool type_in_range =
      !OB_UNLIKELY(collation_type <= CS_TYPE_INVALID || collation_type >= CS_TYPE_MAX);
  // Only index the table when the collation is within bounds.
  ObCharsetInfo *cs = type_in_range
      ? static_cast<ObCharsetInfo *>(ObCharset::charset_arr[collation_type])
      : NULL;
  if (OB_ISNULL(cs)) {
    LOG_ERROR_RET(OB_INVALID_ARGUMENT, "unexpected error. invalid argument(s)",
                  K(collation_type));
  } else {
    is_cs_nonascii = (0 != (cs->state & OB_CS_NONASCII));
  }
  return is_cs_nonascii;
}
|
|
|
|
/**
 * Tells whether the charset of collation_type is one of GBK, GB18030 or
 * GB18030_2022.
 */
bool ObCharset::is_cjk_charset(ObCollationType collation_type)
{
  bool is_cjk = false;
  switch (ObCharset::charset_type_by_coll(collation_type)) {
    case CHARSET_GBK:
    case CHARSET_GB18030:
    case CHARSET_GB18030_2022:
      is_cjk = true;
      break;
    default:
      break;
  }
  return is_cjk;
}
|
|
|
|
/**
 * Tells whether the charset of collation_type is accepted by this check:
 * UTF8MB4, LATIN1, GBK, GB18030, GB18030_2022 or BINARY.
 */
bool ObCharset::is_valid_connection_collation(ObCollationType collation_type)
{
  bool is_valid = false;
  switch (ObCharset::charset_type_by_coll(collation_type)) {
    case CHARSET_UTF8MB4:
    case CHARSET_LATIN1:
    case CHARSET_GBK:
    case CHARSET_GB18030:
    case CHARSET_GB18030_2022:
    case CHARSET_BINARY:
      is_valid = true;
      break;
    default:
      break;
  }
  return is_valid;
}
|
|
|
|
/**
 * Maps a charset type to the charset name string used for it here
 * (e.g. CHARSET_UTF8MB4 -> "AL32UTF8").
 *
 * @return the name as a string literal, or NULL for a charset type that has
 *         no entry in the table below.
 */
const char *ObCharset::get_oracle_charset_name_by_charset_type(ObCharsetType charset_type)
{
  switch (charset_type) {
    case CHARSET_UTF8MB4:
      return "AL32UTF8";
    case CHARSET_GBK:
      return "ZHS16GBK";
    case CHARSET_UTF16:
      return "AL16UTF16";
    case CHARSET_GB18030:
      return "ZHS32GB18030";
    case CHARSET_GB18030_2022:
      return "ZHS32GB18030_2022";
    case CHARSET_LATIN1:
      return "WE8MSWIN1252";
    default:
      break;
  }
  // No name is defined for any other charset type.
  return NULL;
}
|
|
|
|
/**
 * Maps a charset type to its ObNlsCharsetId, returned as an int.
 *
 * @return the matching id, or ObNlsCharsetId::CHARSET_INVALID_ID (as int) for
 *         a charset type that has no entry in the table below.
 */
int ObCharset::get_nls_charset_id_by_charset_type(ObCharsetType charset_type)
{
  ObNlsCharsetId nls_id = ObNlsCharsetId::CHARSET_INVALID_ID;
  switch (charset_type) {
    case CHARSET_UTF8MB4:      nls_id = ObNlsCharsetId::CHARSET_AL32UTF8_ID;          break;
    case CHARSET_GBK:          nls_id = ObNlsCharsetId::CHARSET_ZHS16GBK_ID;          break;
    case CHARSET_UTF16:        nls_id = ObNlsCharsetId::CHARSET_AL16UTF16_ID;         break;
    case CHARSET_GB18030:      nls_id = ObNlsCharsetId::CHARSET_ZHS32GB18030_ID;      break;
    case CHARSET_LATIN1:       nls_id = ObNlsCharsetId::CHARSET_WE8MSWIN1252_ID;      break;
    case CHARSET_GB18030_2022: nls_id = ObNlsCharsetId::CHARSET_ZHS32GB18030_2022_ID; break;
    default:
      // Keep CHARSET_INVALID_ID.
      break;
  }
  return static_cast<int>(nls_id);
}
|
|
|
|
|
|
int ObCharset::init_charset()
|
|
{
|
|
int ret = OB_SUCCESS;
|
|
if (OB_FAIL(init_gb18030_2022())) {
|
|
LOG_WARN("failed to init gb18030 2022", K(ret));
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
|
|
// Per-charset table of the encoded form of each ASCII code point
// (0..INT8_MAX), indexed as [charset type][ASCII code]. The entries are
// filled in by ObCharsetUtils::init().
ObString ObCharsetUtils::const_str_for_ascii_[CHARSET_MAX][INT8_MAX + 1];
|
|
|
|
/**
 * Shrinks `str` in place to end at the position returned by
 * skip_trailing_space().
 *
 * @param str           [in,out] string to trim; keeps its start pointer
 * @param charset_type  charset of `str`; selects the UTF16 mode of
 *                      skip_trailing_space() when it is CHARSET_UTF16
 * @return OB_SUCCESS, or OB_ERR_UNEXPECTED if the returned end position lies
 *         before the start of the string.
 */
int ObCharsetUtils::remove_char_endspace(ObString &str,
                                         const ObCharsetType &charset_type) {
  int ret = OB_SUCCESS;
  const bool is_utf16 = (CHARSET_UTF16 == charset_type);
  const char *end = reinterpret_cast<const char *>(
      skip_trailing_space(reinterpret_cast<const uchar *>(str.ptr()),
                          str.length(),
                          is_utf16 ? 1 : 0));
  if (end < str.ptr()) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("str len < 0", K(ret));
  } else {
    str.assign_ptr(str.ptr(), end - str.ptr());
  }
  return ret;
}
|
|
|
|
/**
 * Fills const_str_for_ascii_: for every valid charset, each ASCII code point
 * (0..INT8_MAX) is encoded with the charset's default collation and the bytes
 * are stored in memory obtained from `allocator`.
 *
 * @param allocator  allocator that owns the stored byte sequences
 * @return OB_SUCCESS, OB_ERR_UNEXPECTED (a valid charset has no valid default
 *         collation), OB_ALLOCATE_MEMORY_FAILED, or the error of
 *         ObCharset::wc_mb().
 */
int ObCharsetUtils::init(ObIAllocator &allocator)
{
  int ret = OB_SUCCESS;
  const int64_t buf_len = 32;
  char buf[buf_len] = {0};  // scratch space for one encoded character
  const lib::ObMemAttr attr(common::OB_SYS_TENANT_ID, "CharsetUtil");

  for (int cs_i = CHARSET_INVALID; cs_i < CHARSET_MAX; ++cs_i) {
    const ObCharsetType charset_type = static_cast<ObCharsetType>(cs_i);
    if (!ObCharset::is_valid_charset(charset_type)) {
      continue;
    }
    const ObCollationType coll_type = ObCharset::get_default_collation(charset_type);
    if (!ObCharset::is_valid_collation(coll_type)) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("invalid collation type", K(ret), K(charset_type), K(coll_type));
    }
    // Skipped entirely once ret holds an error.
    for (int ascii_wc = 0; OB_SUCC(ret) && ascii_wc <= INT8_MAX; ascii_wc++) {
      int result_len = 0;
      if (OB_FAIL(ObCharset::wc_mb(coll_type, ascii_wc, buf, buf_len, result_len))) {
        LOG_WARN("fail to convert ascii to multi byte char", K(ret), K(buf_len));
      } else {
        char *sys_buf = static_cast<char*>(allocator.alloc(result_len, attr));
        if (OB_ISNULL(sys_buf)) {
          ret = OB_ALLOCATE_MEMORY_FAILED;
          LOG_WARN("fail to allocate mem", K(ret), K(result_len));
        } else {
          MEMCPY(sys_buf, buf, result_len);
          const_str_for_ascii_[charset_type][ascii_wc].assign_ptr(sys_buf, result_len);
        }
      }
    }
  }
  return ret;
}
|
|
|
|
/**
 * Loop-friendly wrapper around next_character(encoding, wchar).
 *
 * @param encoding  [out] bytes of the character that was read
 * @param wchar     [out] its wide-character value
 * @param ret       [out] OB_SUCCESS when a character was read or the end of
 *                  the string was reached (OB_ITER_END is mapped to
 *                  OB_SUCCESS); otherwise the error of the underlying call
 * @return true only when a character was read.
 */
bool ObStringScanner::next_character(ObString &encoding, int32_t &wchar, int &ret)
{
  ret = next_character(encoding, wchar);

  const bool has_next = (OB_SUCCESS == ret);
  if (OB_ITER_END == ret) {
    // Reaching the end is not an error for the caller.
    ret = OB_SUCCESS;
  } else if (!has_next) {
    LOG_WARN("fail to get next character", K(ret), K(*this));
  }
  return has_next;
}
|
|
|
|
/**
 * Decodes the character at the front of the remaining string and advances
 * past it.
 *
 * @param encoding  [out] view of the bytes that make up the character
 * @param wchar     [out] decoded wide-character value
 * @return OB_SUCCESS, OB_ITER_END when nothing is left, or
 *         OB_ERR_INCORRECT_STRING_VALUE when ObCharset::mb_wc() fails.
 */
int ObStringScanner::next_character(ObString &encoding, int32_t &wchar)
{
  int ret = OB_SUCCESS;
  int32_t length = 0;

  // str_ is advanced through a non-const reference.
  ObString &str = const_cast<ObString &>(str_);

  if (str.empty()) {
    ret = OB_ITER_END;
  } else {
    ret = ObCharset::mb_wc(collation_type_, str.ptr(), str.length(), length, wchar);
    if (OB_SUCCESS != ret) {
      // Every decode failure is reported as an incorrect string value.
      ret = OB_ERR_INCORRECT_STRING_VALUE;
      LOG_WARN("fail to call mb_wc", K(ret), KPHEX(str.ptr(), str.length()));
    } else {
      encoding.assign_ptr(str.ptr(), length);
      LOG_DEBUG("next_character", K(ret), KPHEX(str.ptr(), str.length()));
      str += length;
    }
  }
  return ret;
}
|
|
|
|
#undef CHARSET_INIT_MEM_ATTR
|
|
|
|
} // namespace common
|
|
} // namespace oceanbase
|