/** * Copyright (c) 2021 OceanBase * OceanBase CE is licensed under Mulan PubL v2. * You can use this software according to the terms and conditions of the Mulan PubL v2. * You may obtain a copy of Mulan PubL v2 at: * http://license.coscl.org.cn/MulanPubL-2.0 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. * See the Mulan PubL v2 for more details. */ #include #include #include #include #include #include "lib/charset/ob_charset.h" #include "lib/string/ob_string.h" #include "lib/utility/ob_print_utils.h" #include "gtest/gtest.h" #include #include using namespace oceanbase::common; #define CUR_RESULT_FILE_SUFFIX ".record" #define STD_RESULT_FILE_SUFFIX ".result" class TestCharsetRandom : public ::testing::Test { public: TestCharsetRandom(); virtual ~TestCharsetRandom(); virtual void SetUp(); virtual void TearDown(); template void for_each_utf8(func handle); protected: void gen_random_unicode_string(const int len, char* res, int& real_len); int random_range(const int low, const int high); }; TestCharsetRandom::TestCharsetRandom() {} TestCharsetRandom::~TestCharsetRandom() {} void TestCharsetRandom::SetUp() { srand((unsigned)time(NULL)); } void TestCharsetRandom::TearDown() {} int TestCharsetRandom::random_range(const int low, const int high) { return std::rand() % (high - low) + low; } void TestCharsetRandom::gen_random_unicode_string(const int len, char* res, int& real_len) { int pos = 0; int unicode_point = 0; std::wstring_convert, char32_t> converter; for (int i = 0; i < len; ++i) { const int bytes = random_range(1, 7); if (bytes < 4) { unicode_point = random_range(0, 127); } else if (bytes < 6) { unicode_point = random_range(0xFF, 0xFFFF); } else if (bytes < 7) { unicode_point = random_range(0XFFFF, 0X10FFFF); } std::string utf_str = converter.to_bytes(unicode_point); // fprintf(stdout, "code_point=%d\n", unicode_point); // fprintf(stdout, "utf8_str=%s\n", utf_str.c_str()); MEMCPY(res + pos, &utf_str[0], utf_str.length()); pos += utf_str.length(); } real_len = pos; } int unicode_to_utf8(ob_wc_t c, unsigned char* utf8string) { if (c <= 0x7F) { utf8string[0] = c; return 1; } else if (c <= 0x7FF) { utf8string[0] = 0xC0 | ((c >> 6) & 0x1F); utf8string[1] = 0x80 | (c & 0x3F); return 2; } else if (c <= 0xFFFF) { utf8string[0] = 0xE0 | ((c >> 12) & 0x0F); utf8string[1] = 0x80 | ((c >> 6) & 0x3F); utf8string[2] = 0x80 | (c & 0x3F); return 3; } else { utf8string[0] = 0xF0 | ((c >> 18) & 0x07); utf8string[1] = 0x80 | ((c >> 12) & 0x3F); utf8string[2] = 0x80 | ((c >> 6) & 0x3F); utf8string[3] = 0x80 | (c & 0x3F); return 4; } return 0; } template void TestCharsetRandom::for_each_utf8(func handle) { char buf[4]; ObString str(4, 0, buf); for (ob_wc_t wchar = 0; wchar < 0x110000; wchar++) { int len = unicode_to_utf8(wchar, (unsigned char*)buf); ASSERT_TRUE(0 != len); str.set_length(len); handle(str, wchar); } } struct TestReusltFileGuard { TestReusltFileGuard(const char* test_name) : fp_(nullptr) { std::string file_path; file_path.append("./"); file_path.append(test_name); file_path.append(CUR_RESULT_FILE_SUFFIX); fp_ = fopen(file_path.c_str(), "w"); } ~TestReusltFileGuard() { if (nullptr != fp_) { fclose(fp_); fp_ = nullptr; } } FILE* get_fp() { return fp_; } FILE* fp_; }; void compare_result(const char* test_name) { std::string cur_res_file_path, std_res_file_path; cur_res_file_path.append("./"); cur_res_file_path.append(test_name); cur_res_file_path.append(CUR_RESULT_FILE_SUFFIX); std_res_file_path.append("./"); std_res_file_path.append(test_name); std_res_file_path.append(STD_RESULT_FILE_SUFFIX); std::ifstream cur_res(cur_res_file_path, std::ios::binary); ASSERT_TRUE(cur_res.is_open()); std::ifstream std_res(std_res_file_path, std::ios::binary); ASSERT_TRUE(std_res.is_open()); std::string cur_line; std::string std_line; int line_no = 0; while (std::getline(std_res, std_line)) { line_no++; ASSERT_TRUE(std::getline(cur_res, cur_line)); if (0 != std_line.compare(cur_line)) { fprintf(stdout, "not consistent result detected at line %d:\n" "cur_line:%s\n" "std_line:%s\n", line_no, cur_line.c_str(), std_line.c_str()); ASSERT_TRUE(0); } } } TEST_F(TestCharsetRandom, test_wellformed_len_random) { const int64_t max_len = 100; const int64_t max_random_times = 1000; char buf[(max_len + 10) * 4 + 1]; for (int64_t char_len = 0; char_len <= max_len; char_len++) { for (int random_times = max_random_times; random_times > 0; random_times--) { int real_len = 0; int64_t well_formed_len = 0; gen_random_unicode_string(char_len, buf, real_len); // debug value std::string str(buf, real_len); // ismbchar() - detects whether the given string is a multi-byte sequence do { bool is_mbchar_utf8 = (char_len > 0 && ((unsigned char*)buf)[0] > 0x7F); ASSERT_TRUE(ObCharset::is_mbchar(CS_TYPE_BINARY, buf, buf + real_len) == 0); ASSERT_TRUE(ObCharset::is_mbchar(CS_TYPE_UTF8MB4_GENERAL_CI, buf, buf + real_len) == is_mbchar_utf8); ASSERT_TRUE(ObCharset::is_mbchar(CS_TYPE_UTF8MB4_BIN, buf, buf + real_len) == is_mbchar_utf8); } while (0); // numchars() - returns number of characters in the given string, e.g. in SQL function CHAR_LENGTH(). do { ASSERT_TRUE(ObCharset::strlen_char(CS_TYPE_BINARY, buf, real_len) == real_len); ASSERT_TRUE(ObCharset::strlen_char(CS_TYPE_UTF8MB4_GENERAL_CI, buf, real_len) == char_len); ASSERT_TRUE(ObCharset::strlen_char(CS_TYPE_UTF8MB4_BIN, buf, real_len) == char_len); } while (0); // charpos() - calculates the offset of the given position in the string. // Used in SQL functions LEFT(), RIGHT(), SUBSTRING(), do { ASSERT_TRUE(ObCharset::charpos(CS_TYPE_BINARY, buf, real_len, real_len) == real_len); ASSERT_TRUE(ObCharset::charpos(CS_TYPE_UTF8MB4_GENERAL_CI, buf, real_len, char_len) == real_len); ASSERT_TRUE(ObCharset::charpos(CS_TYPE_UTF8MB4_BIN, buf, real_len, char_len) == real_len); } while (0); // max_bytes_charpos() - calculates the offset of the given byte position in the string. do { int64_t char_pos = 0; ASSERT_TRUE(ObCharset::max_bytes_charpos(CS_TYPE_BINARY, buf, real_len, real_len, char_pos) == real_len); ASSERT_TRUE(char_pos == real_len); ASSERT_TRUE( ObCharset::max_bytes_charpos(CS_TYPE_UTF8MB4_GENERAL_CI, buf, real_len, real_len, char_pos) == real_len); ASSERT_TRUE(char_pos == char_len); ASSERT_TRUE(ObCharset::max_bytes_charpos(CS_TYPE_UTF8MB4_BIN, buf, real_len, real_len, char_pos) == real_len); ASSERT_TRUE(char_pos == char_len); } while (0); // well_formed_len() // - returns length of a given multi-byte string in bytes // Used in INSERTs to shorten the given string so it // a) is "well formed" according to the given character set // b) can fit into the given data type do { ASSERT_TRUE(0 == ObCharset::well_formed_len(CS_TYPE_BINARY, buf, real_len, well_formed_len)); ASSERT_TRUE(well_formed_len == real_len); ASSERT_TRUE(0 == ObCharset::well_formed_len(CS_TYPE_UTF8MB4_GENERAL_CI, buf, real_len, well_formed_len)); ASSERT_TRUE(well_formed_len == real_len); ASSERT_TRUE(0 == ObCharset::well_formed_len(CS_TYPE_UTF8MB4_BIN, buf, real_len, well_formed_len)); ASSERT_TRUE(well_formed_len == real_len); } while (0); // lengthsp() - returns the length of the given string without trailing spaces. do { int gen_space_len = random_range(0, 10); int ori_space_len = 0; while (ori_space_len < real_len && buf[real_len - ori_space_len - 1] == 0x20) ori_space_len++; MEMSET(buf + real_len, 0x20, gen_space_len); ASSERT_TRUE( ObCharset::strlen_byte_no_sp(CS_TYPE_BINARY, buf, real_len + gen_space_len) == real_len + gen_space_len); ASSERT_TRUE(ObCharset::strlen_byte_no_sp(CS_TYPE_UTF8MB4_GENERAL_CI, buf, real_len + gen_space_len) == real_len - ori_space_len); ASSERT_TRUE(ObCharset::strlen_byte_no_sp(CS_TYPE_UTF8MB4_GENERAL_CI, buf, real_len + gen_space_len) == real_len - ori_space_len); } while (0); // mb_wc - converts the left multi-byte sequence into its Unicode code. // wc_mb - converts the given Unicode code into multi-byte sequence. // caseup - converts the given string to lowercase using length // casedn - converts the given string to lowercase using length // fill() - writes the given Unicode value into the given string // with the given length. Used to pad the string, usually // with space character, according to the given charset. // String-to-number conversion routines // scan() - to skip leading spaces in the given string. // Used when a string value is inserted into a numeric field. // COLLATION HANDLER // strnncoll() - compares two strings according to the given collation // strnncollsp() - like the above but ignores trailing spaces for PAD SPACE // collations. For NO PAD collations, identical to strnncoll. // strnxfrm() - makes a sort key suitable for memcmp() corresponding // to the given string // like_range() - creates a LIKE range, for optimizer // wildcmp() - wildcard comparison, for LIKE // strcasecmp() - 0-terminated string comparison // instr() - finds the first substring appearance in the string // hash_sort() - calculates hash value taking into account // the collation rules, e.g. case-insensitivity, // accent sensitivity, etc. } } } int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); }