Files
postgresql/src/test/regress/sql/collate.utf8.sql
Jeff Davis f69319f2f1 Support C.UTF-8 locale in the new builtin collation provider.
The builtin C.UTF-8 locale has similar semantics to the libc locale of
the same name. That is, code point sort order (fast, memcmp-based)
combined with Unicode semantics for character operations such as
pattern matching, regular expressions, and
LOWER()/INITCAP()/UPPER(). The character semantics are based on
Unicode simple case mappings.

The builtin provider's C.UTF-8 offers several important advantages
over libc:

 * faster sorting -- benefits from additional optimizations such as
   abbreviated keys and varstrfastcmp_c
 * faster case conversion, e.g. LOWER(), at least compared with some
   libc implementations
 * available on all platforms with identical semantics, and the
   semantics are stable, testable, and documentable within a given
   Postgres major version

Being based on memcmp, the builtin C.UTF-8 locale does not offer
natural language sort order. But it is an improvement for most use
cases that might otherwise use libc's "C.UTF-8" locale, as well as
many use cases that use libc's "C" locale.

Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com
Reviewed-by: Daniel Vérité, Peter Eisentraut, Jeremy Schneider
2024-03-19 15:24:41 -07:00

68 lines
1.9 KiB
SQL

/*
* This test is for collations and character operations when using the
* builtin provider with the C.UTF-8 locale.
*/
/* skip test if not UTF8 server encoding */
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
\if :skip_test
\quit
\endif
SET client_encoding TO UTF8;
--
-- Test PG_C_UTF8
--
CREATE COLLATION regress_pg_c_utf8 (
provider = builtin, locale = 'C_UTF8'); -- fails
CREATE COLLATION regress_pg_c_utf8 (
provider = builtin, locale = 'C.UTF8');
DROP COLLATION regress_pg_c_utf8;
CREATE COLLATION regress_pg_c_utf8 (
provider = builtin, locale = 'C.UTF-8');
CREATE TABLE test_pg_c_utf8 (
t TEXT COLLATE PG_C_UTF8
);
INSERT INTO test_pg_c_utf8 VALUES
('abc DEF 123abc'),
('ábc sßs ßss DÉF'),
('DŽxxDŽ džxxDž Džxxdž'),
('ȺȺȺ'),
('ⱥⱥⱥ'),
('ⱥȺ');
SELECT
t, lower(t), initcap(t), upper(t),
length(convert_to(t, 'UTF8')) AS t_bytes,
length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
FROM test_pg_c_utf8;
DROP TABLE test_pg_c_utf8;
-- negative test: Final_Sigma not used for builtin locale C.UTF-8
SELECT lower('ΑΣ' COLLATE PG_C_UTF8);
SELECT lower('ΑͺΣͺ' COLLATE PG_C_UTF8);
SELECT lower('Α΄Σ΄' COLLATE PG_C_UTF8);
-- properties
SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8;
SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8;
SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8;
SELECT '=' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix
SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8;
SELECT '' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix
-- case mapping
SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8;
SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8;
SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8;
SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8;
SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed