mirror of
https://git.postgresql.org/git/postgresql.git
synced 2026-02-12 01:18:35 +08:00
The builtin C.UTF-8 locale has similar semantics to the libc locale of the same name. That is, code point sort order (fast, memcmp-based) combined with Unicode semantics for character operations such as pattern matching, regular expressions, and LOWER()/INITCAP()/UPPER(). The character semantics are based on Unicode simple case mappings.

The builtin provider's C.UTF-8 offers several important advantages over libc:

* faster sorting -- benefits from additional optimizations such as abbreviated keys and varstrfastcmp_c
* faster case conversion, e.g. LOWER(), at least compared with some libc implementations
* available on all platforms with identical semantics, and the semantics are stable, testable, and documentable within a given Postgres major version

Being based on memcmp, the builtin C.UTF-8 locale does not offer natural language sort order. But it is an improvement for most use cases that might otherwise use libc's "C.UTF-8" locale, as well as many use cases that use libc's "C" locale.

Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com
Reviewed-by: Daniel Vérité, Peter Eisentraut, Jeremy Schneider
68 lines
1.9 KiB
SQL
68 lines
1.9 KiB
SQL
/*
 * This test is for collations and character operations when using the
 * builtin provider with the C.UTF-8 locale.
 */

/*
 * skip test if not UTF8 server encoding
 *
 * \gset stores the boolean result in the psql variable "skip_test"; when it
 * is true, \quit ends the script before any statement below is run.
 */
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
\if :skip_test
\quit
\endif

-- The literals in this script are UTF-8 encoded, so the client encoding
-- must match.
SET client_encoding TO UTF8;

--
-- Test PG_C_UTF8
--

-- Locale-name validation for the builtin provider: the underscore spelling
-- is expected to be rejected.
CREATE COLLATION regress_pg_c_utf8 (
  provider = builtin, locale = 'C_UTF8'); -- fails
-- The spelling without a hyphen is accepted; drop the collation again so the
-- same name can be reused for the next spelling.
CREATE COLLATION regress_pg_c_utf8 (
  provider = builtin, locale = 'C.UTF8');
DROP COLLATION regress_pg_c_utf8;
-- The 'C.UTF-8' spelling is accepted as well.
CREATE COLLATION regress_pg_c_utf8 (
  provider = builtin, locale = 'C.UTF-8');

-- Fixture table: one text column using the builtin PG_C_UTF8 collation.
CREATE TABLE test_pg_c_utf8 (
  t TEXT COLLATE PG_C_UTF8
);
-- Sample strings: plain ASCII, accented Latin with sharp s, digraph letters
-- in their upper/title/lower forms, and a letter pair whose two cases have
-- different UTF-8 byte lengths (hence the *_bytes columns below).
INSERT INTO test_pg_c_utf8 (t) VALUES
  ('abc DEF 123abc'),
  ('ábc sßs ßss DÉF'),
  ('DŽxxDŽ džxxDž Džxxdž'),
  ('ȺȺȺ'),
  ('ⱥⱥⱥ'),
  ('ⱥȺ');

-- Show each string with its lower/initcap/upper conversion and the encoded
-- byte length of every variant.  ORDER BY makes the row order deterministic
-- and also exercises the collation's code point sort order.
SELECT
    t, lower(t), initcap(t), upper(t),
    length(convert_to(t, 'UTF8')) AS t_bytes,
    length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
    length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
    length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
  FROM test_pg_c_utf8
  ORDER BY t;

DROP TABLE test_pg_c_utf8;

-- negative test: Final_Sigma not used for builtin locale C.UTF-8
-- (case conversion uses simple case mappings only, so a capital sigma is
-- expected to lowercase to the regular small sigma in every position,
-- never to the word-final form)
SELECT lower('ΑΣ' COLLATE PG_C_UTF8);
SELECT lower('ΑͺΣͺ' COLLATE PG_C_UTF8);
SELECT lower('Α΄Σ΄' COLLATE PG_C_UTF8);

-- properties
-- (character-class matching in regular expressions; each predicate is
-- phrased, using ~ or !~, so that it should evaluate to true)

SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8;
SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8;
SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8;
SELECT '=' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix
SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8;
SELECT '൧' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix

-- case mapping
-- (case-insensitive regular expression matching, for literal patterns and
-- for bracket ranges, with ASCII and with Greek letters)

SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8;
SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8;
SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8;
SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8;
SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed