295 lines
10 KiB
C++
295 lines
10 KiB
C++
/**
|
|
* Copyright (c) 2021 OceanBase
|
|
* OceanBase CE is licensed under Mulan PubL v2.
|
|
* You can use this software according to the terms and conditions of the Mulan PubL v2.
|
|
* You may obtain a copy of Mulan PubL v2 at:
|
|
* http://license.coscl.org.cn/MulanPubL-2.0
|
|
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
* See the Mulan PubL v2 for more details.
|
|
*/
|
|
|
|
#include <pthread.h>
|
|
#include <stdio.h>
|
|
#include <time.h>
|
|
#include <sys/time.h>
|
|
#include <codecvt>
|
|
#include "lib/charset/ob_charset.h"
|
|
#include "lib/string/ob_string.h"
|
|
#include "lib/utility/ob_print_utils.h"
|
|
#include "gtest/gtest.h"
|
|
#include <iostream>
|
|
#include <fstream>
|
|
|
|
using namespace oceanbase::common;
|
|
|
|
// Suffix of the result file produced by the current run: TestReusltFileGuard
// opens "./<test_name>" + this suffix for writing and compare_result reads it back.
#define CUR_RESULT_FILE_SUFFIX ".record"
|
|
// Suffix of the baseline file that compare_result reads line by line and
// compares the current result against.
#define STD_RESULT_FILE_SUFFIX ".result"
|
|
|
|
class TestCharsetRandom : public ::testing::Test {
|
|
public:
|
|
TestCharsetRandom();
|
|
virtual ~TestCharsetRandom();
|
|
virtual void SetUp();
|
|
virtual void TearDown();
|
|
template <typename func>
|
|
void for_each_utf8(func handle);
|
|
|
|
protected:
|
|
void gen_random_unicode_string(const int len, char* res, int& real_len);
|
|
int random_range(const int low, const int high);
|
|
};
|
|
|
|
// The fixture has no state of its own to initialise.
TestCharsetRandom::TestCharsetRandom() = default;
|
|
|
|
// Nothing to release: the fixture owns no resources.
TestCharsetRandom::~TestCharsetRandom() = default;
|
|
|
|
// Re-seeds the C random number generator from the wall clock before each test.
void TestCharsetRandom::SetUp()
{
  const unsigned seed = static_cast<unsigned>(time(nullptr));
  srand(seed);
}
|
|
|
|
// No per-test cleanup is required.
void TestCharsetRandom::TearDown()
{
}
|
|
|
|
/**
 * Returns a pseudo-random integer in the half-open range [low, high).
 *
 * @param low   inclusive lower bound
 * @param high  exclusive upper bound
 * @return a value v with low <= v < high; `low` when the range is empty
 *         (high <= low), which avoids the modulo-by-zero that
 *         `rand() % (high - low)` would otherwise hit.
 */
int TestCharsetRandom::random_range(const int low, const int high)
{
  if (high <= low) {
    return low;
  }
  return std::rand() % (high - low) + low;
}
|
|
|
|
void TestCharsetRandom::gen_random_unicode_string(const int len, char* res, int& real_len)
|
|
{
|
|
int pos = 0;
|
|
int unicode_point = 0;
|
|
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
|
|
for (int i = 0; i < len; ++i) {
|
|
const int bytes = random_range(1, 7);
|
|
if (bytes < 4) {
|
|
unicode_point = random_range(0, 127);
|
|
} else if (bytes < 6) {
|
|
unicode_point = random_range(0xFF, 0xFFFF);
|
|
} else if (bytes < 7) {
|
|
unicode_point = random_range(0XFFFF, 0X10FFFF);
|
|
}
|
|
std::string utf_str = converter.to_bytes(unicode_point);
|
|
// fprintf(stdout, "code_point=%d\n", unicode_point);
|
|
// fprintf(stdout, "utf8_str=%s\n", utf_str.c_str());
|
|
MEMCPY(res + pos, &utf_str[0], utf_str.length());
|
|
pos += utf_str.length();
|
|
}
|
|
real_len = pos;
|
|
}
|
|
|
|
/**
 * Encodes one code point as UTF-8.
 *
 * @param c           code point value to encode
 * @param utf8string  output buffer; must have room for up to 4 bytes
 * @return the number of bytes written (1 to 4), or 0 when `c` is greater than
 *         0x10FFFF and therefore has no UTF-8 encoding. Nothing is written in
 *         that case.
 *
 * Values in the surrogate range are not filtered; they are encoded with the
 * generic 3-byte pattern like any other value up to 0xFFFF.
 */
int unicode_to_utf8(ob_wc_t c, unsigned char* utf8string)
{
  int len = 0;
  if (c <= 0x7F) {
    utf8string[0] = static_cast<unsigned char>(c);
    len = 1;
  } else if (c <= 0x7FF) {
    utf8string[0] = static_cast<unsigned char>(0xC0 | ((c >> 6) & 0x1F));
    utf8string[1] = static_cast<unsigned char>(0x80 | (c & 0x3F));
    len = 2;
  } else if (c <= 0xFFFF) {
    utf8string[0] = static_cast<unsigned char>(0xE0 | ((c >> 12) & 0x0F));
    utf8string[1] = static_cast<unsigned char>(0x80 | ((c >> 6) & 0x3F));
    utf8string[2] = static_cast<unsigned char>(0x80 | (c & 0x3F));
    len = 3;
  } else if (c <= 0x10FFFF) {
    utf8string[0] = static_cast<unsigned char>(0xF0 | ((c >> 18) & 0x07));
    utf8string[1] = static_cast<unsigned char>(0x80 | ((c >> 12) & 0x3F));
    utf8string[2] = static_cast<unsigned char>(0x80 | ((c >> 6) & 0x3F));
    utf8string[3] = static_cast<unsigned char>(0x80 | (c & 0x3F));
    len = 4;
  }
  // len stays 0 for values above 0x10FFFF so callers can detect the failure.
  return len;
}
|
|
|
|
template <typename func>
|
|
void TestCharsetRandom::for_each_utf8(func handle)
|
|
{
|
|
char buf[4];
|
|
ObString str(4, 0, buf);
|
|
|
|
for (ob_wc_t wchar = 0; wchar < 0x110000; wchar++) {
|
|
int len = unicode_to_utf8(wchar, (unsigned char*)buf);
|
|
ASSERT_TRUE(0 != len);
|
|
str.set_length(len);
|
|
handle(str, wchar);
|
|
}
|
|
}
|
|
|
|
struct TestReusltFileGuard {
|
|
TestReusltFileGuard(const char* test_name) : fp_(nullptr)
|
|
{
|
|
std::string file_path;
|
|
file_path.append("./");
|
|
file_path.append(test_name);
|
|
file_path.append(CUR_RESULT_FILE_SUFFIX);
|
|
fp_ = fopen(file_path.c_str(), "w");
|
|
}
|
|
~TestReusltFileGuard()
|
|
{
|
|
if (nullptr != fp_) {
|
|
fclose(fp_);
|
|
fp_ = nullptr;
|
|
}
|
|
}
|
|
FILE* get_fp()
|
|
{
|
|
return fp_;
|
|
}
|
|
FILE* fp_;
|
|
};
|
|
|
|
void compare_result(const char* test_name)
|
|
{
|
|
std::string cur_res_file_path, std_res_file_path;
|
|
cur_res_file_path.append("./");
|
|
cur_res_file_path.append(test_name);
|
|
cur_res_file_path.append(CUR_RESULT_FILE_SUFFIX);
|
|
std_res_file_path.append("./");
|
|
std_res_file_path.append(test_name);
|
|
std_res_file_path.append(STD_RESULT_FILE_SUFFIX);
|
|
|
|
std::ifstream cur_res(cur_res_file_path, std::ios::binary);
|
|
ASSERT_TRUE(cur_res.is_open());
|
|
std::ifstream std_res(std_res_file_path, std::ios::binary);
|
|
ASSERT_TRUE(std_res.is_open());
|
|
|
|
std::string cur_line;
|
|
std::string std_line;
|
|
int line_no = 0;
|
|
while (std::getline(std_res, std_line)) {
|
|
line_no++;
|
|
ASSERT_TRUE(std::getline(cur_res, cur_line));
|
|
if (0 != std_line.compare(cur_line)) {
|
|
fprintf(stdout,
|
|
"not consistent result detected at line %d:\n"
|
|
"cur_line:%s\n"
|
|
"std_line:%s\n",
|
|
line_no,
|
|
cur_line.c_str(),
|
|
std_line.c_str());
|
|
ASSERT_TRUE(0);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Randomized check of the ObCharset length/position helpers.
// For every character count in [0, max_len], max_random_times random UTF-8
// strings are generated and each helper is queried with CS_TYPE_BINARY,
// CS_TYPE_UTF8MB4_GENERAL_CI and CS_TYPE_UTF8MB4_BIN.
TEST_F(TestCharsetRandom, test_wellformed_len_random)
{
  const int64_t max_len = 100;
  const int64_t max_random_times = 1000;
  // Sized for 4 bytes per generated character, with slack for the trailing
  // spaces that the lengthsp() section appends after the generated data.
  char buf[(max_len + 10) * 4 + 1];

  for (int64_t char_len = 0; char_len <= max_len; char_len++) {
    for (int random_times = max_random_times; random_times > 0; random_times--) {
      int real_len = 0;
      int64_t well_formed_len = 0;
      gen_random_unicode_string(char_len, buf, real_len);

      // debug value
      std::string str(buf, real_len);

      // ismbchar() - detects whether the given string is a multi-byte sequence
      do {
        bool is_mbchar_utf8 = (char_len > 0 && ((unsigned char*)buf)[0] > 0x7F);
        ASSERT_TRUE(ObCharset::is_mbchar(CS_TYPE_BINARY, buf, buf + real_len) == 0);
        ASSERT_TRUE(ObCharset::is_mbchar(CS_TYPE_UTF8MB4_GENERAL_CI, buf, buf + real_len) == is_mbchar_utf8);
        ASSERT_TRUE(ObCharset::is_mbchar(CS_TYPE_UTF8MB4_BIN, buf, buf + real_len) == is_mbchar_utf8);
      } while (0);

      // numchars() - returns number of characters in the given string, e.g. in SQL function CHAR_LENGTH().
      do {
        ASSERT_TRUE(ObCharset::strlen_char(CS_TYPE_BINARY, buf, real_len) == real_len);
        ASSERT_TRUE(ObCharset::strlen_char(CS_TYPE_UTF8MB4_GENERAL_CI, buf, real_len) == char_len);
        ASSERT_TRUE(ObCharset::strlen_char(CS_TYPE_UTF8MB4_BIN, buf, real_len) == char_len);
      } while (0);

      // charpos() - calculates the offset of the given position in the string.
      //             Used in SQL functions LEFT(), RIGHT(), SUBSTRING(),
      do {
        ASSERT_TRUE(ObCharset::charpos(CS_TYPE_BINARY, buf, real_len, real_len) == real_len);
        ASSERT_TRUE(ObCharset::charpos(CS_TYPE_UTF8MB4_GENERAL_CI, buf, real_len, char_len) == real_len);
        ASSERT_TRUE(ObCharset::charpos(CS_TYPE_UTF8MB4_BIN, buf, real_len, char_len) == real_len);
      } while (0);

      // max_bytes_charpos() - calculates the offset of the given byte position in the string.
      do {
        int64_t char_pos = 0;
        ASSERT_TRUE(ObCharset::max_bytes_charpos(CS_TYPE_BINARY, buf, real_len, real_len, char_pos) == real_len);
        ASSERT_TRUE(char_pos == real_len);
        ASSERT_TRUE(
            ObCharset::max_bytes_charpos(CS_TYPE_UTF8MB4_GENERAL_CI, buf, real_len, real_len, char_pos) == real_len);
        ASSERT_TRUE(char_pos == char_len);
        ASSERT_TRUE(ObCharset::max_bytes_charpos(CS_TYPE_UTF8MB4_BIN, buf, real_len, real_len, char_pos) == real_len);
        ASSERT_TRUE(char_pos == char_len);
      } while (0);

      // well_formed_len()
      //   - returns length of a given multi-byte string in bytes
      //     Used in INSERTs to shorten the given string so it
      //     a) is "well formed" according to the given character set
      //     b) can fit into the given data type
      do {
        ASSERT_TRUE(0 == ObCharset::well_formed_len(CS_TYPE_BINARY, buf, real_len, well_formed_len));
        ASSERT_TRUE(well_formed_len == real_len);
        ASSERT_TRUE(0 == ObCharset::well_formed_len(CS_TYPE_UTF8MB4_GENERAL_CI, buf, real_len, well_formed_len));
        ASSERT_TRUE(well_formed_len == real_len);
        ASSERT_TRUE(0 == ObCharset::well_formed_len(CS_TYPE_UTF8MB4_BIN, buf, real_len, well_formed_len));
        ASSERT_TRUE(well_formed_len == real_len);
      } while (0);

      // lengthsp() - returns the length of the given string without trailing spaces.
      do {
        int gen_space_len = random_range(0, 10);
        // Count spaces the generator itself happened to place at the end, so
        // the expected stripped length accounts for them as well.
        int ori_space_len = 0;
        while (ori_space_len < real_len && buf[real_len - ori_space_len - 1] == 0x20)
          ori_space_len++;
        MEMSET(buf + real_len, 0x20, gen_space_len);
        ASSERT_TRUE(
            ObCharset::strlen_byte_no_sp(CS_TYPE_BINARY, buf, real_len + gen_space_len) == real_len + gen_space_len);
        ASSERT_TRUE(ObCharset::strlen_byte_no_sp(CS_TYPE_UTF8MB4_GENERAL_CI, buf, real_len + gen_space_len) ==
                    real_len - ori_space_len);
        // Third collation of the BINARY / GENERAL_CI / BIN triple used by
        // every other section of this test.
        ASSERT_TRUE(ObCharset::strlen_byte_no_sp(CS_TYPE_UTF8MB4_BIN, buf, real_len + gen_space_len) ==
                    real_len - ori_space_len);
      } while (0);

      // Not covered yet:
      // mb_wc       - converts the left multi-byte sequence into its Unicode code.
      // wc_mb       - converts the given Unicode code into multi-byte sequence.

      // caseup      - converts the given string to uppercase using length
      // casedn      - converts the given string to lowercase using length
      // fill()     - writes the given Unicode value into the given string
      //              with the given length. Used to pad the string, usually
      //              with space character, according to the given charset.
      // String-to-number conversion routines
      // scan()    - to skip leading spaces in the given string.
      //             Used when a string value is inserted into a numeric field.

      // COLLATION HANDLER
      // strnncoll()   - compares two strings according to the given collation
      // strnncollsp() - like the above but ignores trailing spaces for PAD SPACE
      //                 collations. For NO PAD collations, identical to strnncoll.
      // strnxfrm()    - makes a sort key suitable for memcmp() corresponding
      //                 to the given string
      // like_range()  - creates a LIKE range, for optimizer
      // wildcmp()     - wildcard comparison, for LIKE
      // strcasecmp()  - 0-terminated string comparison
      // instr()       - finds the first substring appearance in the string
      // hash_sort()   - calculates hash value taking into account
      //                 the collation rules, e.g. case-insensitivity,
      //                 accent sensitivity, etc.
    }
  }
}
|
|
|
|
int main(int argc, char** argv)
|
|
{
|
|
testing::InitGoogleTest(&argc, argv);
|
|
return RUN_ALL_TESTS();
|
|
}
|