Files
postgresql/src/common/unicode/case_test.c
Jeff Davis 90260e2ec6 Fix INITCAP() word boundaries for PG_UNICODE_FAST.
Word boundaries are based on whether a character is alphanumeric or
not. For the PG_UNICODE_FAST collation, alphanumeric includes
non-ASCII digits; whereas for the PG_C_UTF8 collation, it only
includes digits 0-9. Pass down the right information from the
pg_locale_t into initcap_wbnext to differentiate the behavior.

Reported-by: Noah Misch <noah@leadboat.com>
Reviewed-by: Noah Misch <noah@leadboat.com>
Discussion: https://postgr.es/m/20250417135841.33.nmisch@google.com
2025-04-21 12:34:58 -07:00

409 lines
10 KiB
C

/*-------------------------------------------------------------------------
* case_test.c
* Program to test Unicode case mapping functions.
*
* Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/common/unicode/case_test.c
*
*-------------------------------------------------------------------------
*/
#include "postgres_fe.h"
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wctype.h>
#ifdef USE_ICU
#include <unicode/ucasemap.h>
#include <unicode/uchar.h>
#endif
#include "common/unicode_case.h"
#include "common/unicode_category.h"
#include "common/unicode_version.h"
/*
 * Capacity of every fixed-size result buffer in the ICU comparison tests:
 * enough to hold largest source or result string, including NUL.
 */
#define BUFSZ 256
#ifdef USE_ICU
/* ICU case map used by icu_test_full(); opened and closed in main() */
static UCaseMap * casemap = NULL;
#endif
/*
 * Signature shared by the tfunc_* wrappers that test_convert() drives:
 * convert src into dst (capacity dstsize) and return a size_t that
 * test_convert() compares against the length of the expected result.
 * test_convert() passes srclen as either the byte length of src or -1 for a
 * NUL-terminated src.
 */
typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,
ssize_t srclen);
/*
 * Simple boundary iterator copied from pg_locale_builtin.c.
 *
 * Carries the scan position of initcap_wbnext() from one call to the next.
 */
struct WordBoundaryState
{
/* text being scanned; initcap_wbnext() decodes it as UTF-8 */
const char *str;
/* upper bound on offset; scanning also stops at a NUL byte */
size_t len;
/* byte offset of the next character to examine */
size_t offset;
/* forwarded to pg_u_isalnum() to select its alphanumeric definition */
bool posix;
/* false until the first boundary has been returned */
bool init;
/* alphanumeric status recorded at the most recent boundary */
bool prev_alnum;
};
/*
 * Word-boundary callback for unicode_strtitle().
 *
 * Resumes scanning at the saved offset and returns the byte offset of the
 * next character that starts a run: either the very first character seen, or
 * one whose alphanumeric status differs from the status recorded at the
 * previous boundary.  The saved offset is always left just past the last
 * character examined.  Once the text is exhausted (offset reaches len, or a
 * NUL byte is found) the stored len is returned.
 */
static size_t
initcap_wbnext(void *state)
{
	struct WordBoundaryState *wb = (struct WordBoundaryState *) state;

	for (;;)
	{
		size_t		start = wb->offset;
		pg_wchar	cp;
		bool		is_alnum;
		bool		at_boundary;

		/* out of input: no further boundaries */
		if (start >= wb->len || wb->str[start] == '\0')
			break;

		cp = utf8_to_unicode((unsigned char *) wb->str + start);
		is_alnum = pg_u_isalnum(cp, wb->posix);
		at_boundary = !wb->init || is_alnum != wb->prev_alnum;

		/* step over this character whether or not it is a boundary */
		wb->offset = start + unicode_utf8len(cp);

		if (at_boundary)
		{
			wb->init = true;
			wb->prev_alnum = is_alnum;
			return start;
		}
	}

	return wb->len;
}
#ifdef USE_ICU
/*
 * Check the simple (one codepoint in, one codepoint out) lower, title, upper
 * and fold mappings of "code" against ICU's u_tolower(), u_totitle(),
 * u_toupper() and u_foldCase().  On any difference, print both sets of
 * results and exit with status 1.
 */
static void
icu_test_simple(pg_wchar code)
{
	pg_wchar	pg_lower = unicode_lowercase_simple(code);
	pg_wchar	pg_title = unicode_titlecase_simple(code);
	pg_wchar	pg_upper = unicode_uppercase_simple(code);
	pg_wchar	pg_fold = unicode_casefold_simple(code);
	pg_wchar	icu_lower = u_tolower(code);
	pg_wchar	icu_title = u_totitle(code);
	pg_wchar	icu_upper = u_toupper(code);
	pg_wchar	icu_fold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
	bool		all_match;

	all_match = (pg_lower == icu_lower) &&
		(pg_title == icu_title) &&
		(pg_upper == icu_upper) &&
		(pg_fold == icu_fold);

	if (all_match)
		return;

	printf("case_test: FAILURE for codepoint 0x%06x\n", code);
	printf("case_test: Postgres lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n",
		   pg_lower, pg_title, pg_upper, pg_fold);
	printf("case_test: ICU lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n",
		   icu_lower, icu_title, icu_upper, icu_fold);
	printf("\n");
	exit(1);
}
/*
 * Exit with a diagnostic if an ICU case-mapping call reported a failure or
 * could not NUL-terminate its output; otherwise do nothing.  The outputs are
 * later compared with strcmp(), so an unterminated buffer must not get
 * through.
 */
static void
icu_check_status(const char *funcname, const char *str, UErrorCode status)
{
	if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
	{
		printf("case_test: %s failed for str='%s': %s\n",
			   funcname, str, u_errorName(status));
		exit(1);
	}
}

/*
 * Exit with a diagnostic if a Postgres case-mapping call reported a result
 * length that leaves no room for the terminating NUL in a BUFSZ buffer.
 */
static void
pg_check_needed(const char *funcname, const char *str, size_t needed)
{
	if (needed >= BUFSZ)
	{
		printf("case_test: %s result for str='%s' needs %zu bytes, buffer holds %d\n",
			   funcname, str, needed, BUFSZ);
		exit(1);
	}
}

/*
 * Run the string case mappings (lower, title, upper, fold) on the
 * NUL-terminated string "str" with both the Postgres functions and ICU, and
 * exit with status 1 if any pair of results differs.
 *
 * Every conversion is validated before the comparison: each result must fit
 * in its BUFSZ buffer including the terminating NUL, and each ICU call must
 * succeed.
 */
static void
icu_test_full(char *str)
{
	char		lower[BUFSZ];
	char		title[BUFSZ];
	char		upper[BUFSZ];
	char		fold[BUFSZ];
	char		icu_lower[BUFSZ];
	char		icu_title[BUFSZ];
	char		icu_upper[BUFSZ];
	char		icu_fold[BUFSZ];
	UErrorCode	status;

	/* full case mapping doesn't use posix semantics */
	struct WordBoundaryState wbstate = {
		.str = str,
		.len = strlen(str),
		.offset = 0,
		.posix = false,
		.init = false,
		.prev_alnum = false,
	};

	pg_check_needed("unicode_strlower", str,
					unicode_strlower(lower, BUFSZ, str, -1, true));
	pg_check_needed("unicode_strtitle", str,
					unicode_strtitle(title, BUFSZ, str, -1, true,
									 initcap_wbnext, &wbstate));
	pg_check_needed("unicode_strupper", str,
					unicode_strupper(upper, BUFSZ, str, -1, true));
	pg_check_needed("unicode_strfold", str,
					unicode_strfold(fold, BUFSZ, str, -1, true));

	/* status is reset before each call so one result cannot mask the next */
	status = U_ZERO_ERROR;
	ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
	icu_check_status("ucasemap_utf8ToLower", str, status);

	status = U_ZERO_ERROR;
	ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
	icu_check_status("ucasemap_utf8ToTitle", str, status);

	status = U_ZERO_ERROR;
	ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
	icu_check_status("ucasemap_utf8ToUpper", str, status);

	status = U_ZERO_ERROR;
	ucasemap_utf8FoldCase(casemap, icu_fold, BUFSZ, str, -1, &status);
	icu_check_status("ucasemap_utf8FoldCase", str, status);

	if (strcmp(lower, icu_lower) != 0)
	{
		printf("case_test: str='%s' lower='%s' icu_lower='%s'\n", str, lower,
			   icu_lower);
		exit(1);
	}
	if (strcmp(title, icu_title) != 0)
	{
		printf("case_test: str='%s' title='%s' icu_title='%s'\n", str, title,
			   icu_title);
		exit(1);
	}
	if (strcmp(upper, icu_upper) != 0)
	{
		printf("case_test: str='%s' upper='%s' icu_upper='%s'\n", str, upper,
			   icu_upper);
		exit(1);
	}
	if (strcmp(fold, icu_fold) != 0)
	{
		printf("case_test: str='%s' fold='%s' icu_fold='%s'\n", str, fold,
			   icu_fold);
		exit(1);
	}
}
/*
 * Walk every codepoint from 0 through 0x10ffff and compare both the simple
 * and the string case mappings with ICU's results.
 *
 * Codepoints that Postgres reports as unassigned are ignored.  Codepoints
 * that Postgres knows but ICU reports as unassigned are counted and skipped.
 * Any mismatch terminates the program from within the per-codepoint checks.
 */
static void
test_icu(void)
{
	int			n_tested = 0;
	int			n_skipped = 0;
	pg_wchar	code;

	for (code = 0; code <= 0x10ffff; code++)
	{
		/* zero-filled so the encoded codepoint is always NUL-terminated */
		char		utf8[5] = {0};
		pg_unicode_category pg_category = unicode_category(code);
		uint8_t		icu_category;

		if (pg_category == PG_U_UNASSIGNED)
			continue;

		icu_category = u_charType(code);
		if (icu_category == PG_U_UNASSIGNED)
		{
			n_skipped++;
			continue;
		}

		icu_test_simple(code);

		unicode_to_utf8(code, (unsigned char *) utf8);
		icu_test_full(utf8);

		n_tested++;
	}

	if (n_skipped > 0)
		printf("case_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n",
			   n_skipped);

	printf("case_test: ICU simple mapping test: %d codepoints successful\n",
		   n_tested);
}
#endif
/*
 * Allocate "size" bytes for a test buffer, exiting with a diagnostic when
 * memory is exhausted.  At least one byte is always requested so that a
 * zero-length buffer still yields a non-NULL pointer.
 */
static void *
test_convert_alloc(size_t size)
{
	void	   *ptr = malloc(size > 0 ? size : 1);

	if (ptr == NULL)
	{
		printf("case_test: out of memory\n");
		exit(1);
	}
	return ptr;
}

/*
 * Run one conversion function over test_string and verify the result against
 * "expected" in all four combinations of NUL-terminated / length-delimited
 * source and exactly-sized / NUL-terminated destination.
 *
 * tfunc:        conversion wrapper under test
 * test_string:  NUL-terminated input
 * expected:     NUL-terminated expected output
 *
 * In every combination the value returned by tfunc must equal
 * strlen(expected).  Any failure prints a diagnostic and exits with status 1.
 */
static void
test_convert(TestFunc tfunc, const char *test_string, const char *expected)
{
	size_t		src1len = strlen(test_string);
	ssize_t		src2len = -1;	/* NUL-terminated */
	size_t		dst1len = strlen(expected);
	size_t		dst2len = strlen(expected) + 1; /* NUL-terminated */
	char	   *src1 = test_convert_alloc(src1len);
	char	   *dst1 = test_convert_alloc(dst1len);
	char	   *src2 = strdup(test_string);
	char	   *dst2 = test_convert_alloc(dst2len);
	size_t		needed;

	if (src2 == NULL)
	{
		printf("case_test: out of memory\n");
		exit(1);
	}

	memcpy(src1, test_string, src1len); /* not NUL-terminated */

	/* neither source nor destination are NUL-terminated */
	memset(dst1, 0x7F, dst1len);
	needed = tfunc(dst1, dst1len, src1, src1len);
	if (needed != strlen(expected))
	{
		printf("case_test: convert_case test1 FAILURE: '%s' needed %zu expected %zu\n",
			   test_string, needed, strlen(expected));
		exit(1);
	}
	if (memcmp(dst1, expected, dst1len) != 0)
	{
		printf("case_test: convert_case test1 FAILURE: test: '%s' result: '%.*s' expected: '%s'\n",
			   test_string, (int) dst1len, dst1, expected);
		exit(1);
	}

	/* destination is NUL-terminated and source is not */
	memset(dst2, 0x7F, dst2len);
	needed = tfunc(dst2, dst2len, src1, src1len);
	if (needed != strlen(expected))
	{
		printf("case_test: convert_case test2 FAILURE: '%s' needed %zu expected %zu\n",
			   test_string, needed, strlen(expected));
		exit(1);
	}
	if (strcmp(dst2, expected) != 0)
	{
		printf("case_test: convert_case test2 FAILURE: test: '%s' result: '%s' expected: '%s'\n",
			   test_string, dst2, expected);
		exit(1);
	}

	/* source is NUL-terminated and destination is not */
	memset(dst1, 0x7F, dst1len);
	needed = tfunc(dst1, dst1len, src2, src2len);
	if (needed != strlen(expected))
	{
		printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n",
			   test_string, needed, strlen(expected));
		printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed);
		exit(1);
	}
	if (memcmp(dst1, expected, dst1len) != 0)
	{
		printf("case_test: convert_case test3 FAILURE: test: '%s' result: '%.*s' expected: '%s'\n",
			   test_string, (int) dst1len, dst1, expected);
		exit(1);
	}

	/* both source and destination are NUL-terminated */
	memset(dst2, 0x7F, dst2len);
	needed = tfunc(dst2, dst2len, src2, src2len);
	if (needed != strlen(expected))
	{
		printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n",
			   test_string, needed, strlen(expected));
		exit(1);
	}
	if (strcmp(dst2, expected) != 0)
	{
		printf("case_test: convert_case test4 FAILURE: test: '%s' result: '%s' expected: '%s'\n",
			   test_string, dst2, expected);
		exit(1);
	}

	free(src1);
	free(dst1);
	free(src2);
	free(dst2);
}
/*
 * TestFunc adapter for lowercasing: forwards its arguments to
 * unicode_strlower() with the final flag set to true and returns that
 * function's result unchanged.
 */
static size_t
tfunc_lower(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	size_t		needed;

	needed = unicode_strlower(dst, dstsize, src, srclen, true);
	return needed;
}
static size_t
tfunc_title(char *dst, size_t dstsize, const char *src,
ssize_t srclen)
{
struct WordBoundaryState wbstate = {
.str = src,
.len = srclen,
.offset = 0,
.init = false,
.prev_alnum = false,
};
return unicode_strtitle(dst, dstsize, src, srclen, true, initcap_wbnext,
&wbstate);
}
/*
 * TestFunc adapter for uppercasing: forwards its arguments to
 * unicode_strupper() with the final flag set to true and returns that
 * function's result unchanged.
 */
static size_t
tfunc_upper(char *dst, size_t dstsize, const char *src,
			ssize_t srclen)
{
	size_t		needed;

	needed = unicode_strupper(dst, dstsize, src, srclen, true);
	return needed;
}
/*
 * TestFunc adapter for case folding: forwards its arguments to
 * unicode_strfold() with the final flag set to true and returns that
 * function's result unchanged.
 */
static size_t
tfunc_fold(char *dst, size_t dstsize, const char *src,
		   ssize_t srclen)
{
	size_t		needed;

	needed = unicode_strfold(dst, dstsize, src, srclen, true);
	return needed;
}
/*
 * Run the fixed set of string conversion tests through test_convert(), and,
 * when built with ICU, cross-check a set of strings against ICU with
 * icu_test_full().  Any failure exits from within those helpers; on success
 * a confirmation line is printed.
 */
static void
test_convert_case(void)
{
	/* test string with no case changes */
	test_convert(tfunc_lower, "√∞", "√∞");
	/* test adjust-to-cased behavior */
	test_convert(tfunc_title, "abc 123xyz", "Abc 123xyz");
	/* test string with case changes */
	test_convert(tfunc_upper, "abc", "ABC");
	/* test string with case changes and byte length changes */
	test_convert(tfunc_lower, "ȺȺȺ", "ⱥⱥⱥ");
	/* test special case conversions */
	test_convert(tfunc_upper, "ß", "SS");
	test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
	test_convert(tfunc_upper, "ıiIİ", "IIIİ");
	test_convert(tfunc_fold, "ıiIİ", "ıiii\u0307");
	/* test final sigma */
	test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
	test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
	test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
	test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ");
	/* test that alphanumerics are word characters */
	test_convert(tfunc_title, "λλ", "Λλ");
	test_convert(tfunc_title, "1a", "1a");
	/* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */
	test_convert(tfunc_title, "\uFF11a", "\uFF11a");

#ifdef USE_ICU
	icu_test_full("");
	icu_test_full("ȺȺȺ");
	icu_test_full("ßßß");
	icu_test_full("√∞");
	icu_test_full("a b");
	icu_test_full("abc 123xyz");
	icu_test_full("σςΣ ΣΣΣ");
	icu_test_full("ıiIİ");
	icu_test_full("\uFF11a");
	/* test <alpha><iota_subscript><acute> */
	icu_test_full("\u0391\u0345\u0301");
#endif

	printf("case_test: convert_case: success\n");
}
/*
 * Program entry point.  Prints the Unicode version Postgres was built with,
 * runs the exhaustive ICU comparison when built with ICU (otherwise notes
 * that it is skipped), then runs the fixed conversion tests.  Test failures
 * exit with status 1 from within the tests; reaching the end means success.
 */
int
main(int argc, char **argv)
{
#ifdef USE_ICU
	UErrorCode	open_status = U_ZERO_ERROR;

	/*
	 * Open the case map with ICU's titlecase word break adjustment turned
	 * off, so that its output matches the expected behavior of
	 * unicode_strtitle().
	 */
	casemap = ucasemap_open("und", U_TITLECASE_NO_BREAK_ADJUSTMENT,
							&open_status);
	if (U_FAILURE(open_status))
	{
		printf("case_test: failure opening UCaseMap: %s\n",
			   u_errorName(open_status));
		exit(1);
	}
#endif

	printf("case_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);

#ifndef USE_ICU
	printf("case_test: ICU not available; skipping\n");
#else
	printf("case_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);
	test_icu();
#endif

	test_convert_case();

#ifdef USE_ICU
	ucasemap_close(casemap);
#endif

	return 0;
}