修复copy识别gbk/gb18030中文字符错误的bug

This commit is contained in:
luo_zihao5524
2023-07-07 15:52:39 +08:00
parent 25583ac6ff
commit 24d025b453
7 changed files with 300 additions and 3 deletions

View File

@ -6937,6 +6937,7 @@ static bool CopyReadLineTextTemplate(CopyState cstate)
for (;;) {
int prev_raw_ptr;
char c;
char sec = '\0';
/*
* Load more data if needed. Ideally we would just force four bytes
@ -6974,6 +6975,9 @@ static bool CopyReadLineTextTemplate(CopyState cstate)
/* OK to fetch a character */
prev_raw_ptr = raw_buf_ptr;
c = copy_raw_buf[raw_buf_ptr++];
if (raw_buf_ptr < copy_buf_len) {
sec = copy_raw_buf[raw_buf_ptr];
}
if (csv_mode) {
/*
@ -7265,10 +7269,12 @@ static bool CopyReadLineTextTemplate(CopyState cstate)
* high-bit set, so as an optimization we can avoid this block
* entirely if it is not set.
*/
if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c)) {
if ((cstate->encoding_embeds_ascii || cstate->file_encoding == PG_GBK || cstate->file_encoding == PG_GB18030)
&& IS_HIGHBIT_SET(c)) {
int mblen;
mblen_str[0] = c;
mblen_str[1] = sec;
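/*
 * The length lookup is handed the lead byte plus the following byte
 * (sec); presumably GB18030 needs that second byte to tell 2-byte from
 * 4-byte sequences -- TODO confirm against pg_encoding_mblen().
 * NOTE(review): sec stays '\0' when the next byte is not yet buffered;
 * confirm the computed length is still right at a buffer boundary.
 */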
mblen = pg_encoding_mblen(cstate->file_encoding, mblen_str);
IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1);

View File

@ -0,0 +1,37 @@
CREATE DATABASE db_gb18030 TEMPLATE template0 encoding 'GB18030' lc_ctype 'zh_CN.GB18030' lc_collate 'zh_CN.GB18030';
\c db_gb18030
create table t (id int,c varchar);
insert into t values (1,'����');
insert into t values (2,'���ԫ\');
insert into t values (3,'����3�\����4');
insert into t values (4,'�\����4');
insert into t values (5,'�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�[�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\');
insert into t values (6,'�9�0�9�1�9�2�9�3�9�4�9�5�9�5�9�6�9�7�9�8�9�0�9�1�9�2�9�3�9�4�9�5�9�6�9�7�9�8');
create table t_stdin (id int,c varchar);
create table t_csv (id int,c varchar);
create table t_text (id int,c varchar);
create table t_fixed (id int,c varchar);
create table t_binary (id int,c varchar);
COPY t TO '@abs_srcdir@/data/datanode1/t_csv.data' WITH(FORMAT 'csv');
COPY t TO '@abs_srcdir@/data/datanode1/t_text.data' WITH(FORMAT 'text');
COPY t TO '@abs_srcdir@/data/datanode1/t_fixed.data' fixed formatter(id(0,2), c(2,300));
COPY t TO '@abs_srcdir@/data/datanode1/t_binary.data' WITH(FORMAT 'binary');
COPY t_csv FROM '@abs_srcdir@/data/datanode1/t_csv.data' WITH(FORMAT 'csv');
COPY t_text FROM '@abs_srcdir@/data/datanode1/t_text.data' WITH(FORMAT 'text');
COPY t_fixed FROM '@abs_srcdir@/data/datanode1/t_fixed.data' fixed formatter(id(0,2), c(2,300));
COPY t_binary FROM '@abs_srcdir@/data/datanode1/t_binary.data' WITH(FORMAT 'binary');
SELECT * FROM t;
SELECT * FROM t_csv;
SELECT * FROM t_text;
SELECT * FROM t_fixed;
SELECT * FROM t_binary;
\d t;
\d t_csv;
\d t_text;
\d t_fixed;
\d t_binary;

View File

@ -0,0 +1,36 @@
CREATE DATABASE db_gbk_test TEMPLATE template0 encoding 'GBK' lc_ctype 'zh_CN.GBK' lc_collate 'zh_CN.GBK';
\c db_gbk_test
create table t (id int,c varchar);
insert into t values (1,'����');
insert into t values (2,'���ԫ\');
insert into t values (3,'����3�\����4');
insert into t values (4,'�\����4');
insert into t values (5,'�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�[�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\');
create table t_stdin (id int,c varchar);
create table t_csv (id int,c varchar);
create table t_text (id int,c varchar);
create table t_fixed (id int,c varchar);
create table t_binary (id int,c varchar);
COPY t TO '@abs_srcdir@/data/datanode1/t_csv.data' WITH(FORMAT 'csv');
COPY t TO '@abs_srcdir@/data/datanode1/t_text.data' WITH(FORMAT 'text');
COPY t TO '@abs_srcdir@/data/datanode1/t_fixed.data' fixed formatter(id(0,2), c(2,300));
COPY t TO '@abs_srcdir@/data/datanode1/t_binary.data' WITH(FORMAT 'binary');
COPY t_csv FROM '@abs_srcdir@/data/datanode1/t_csv.data' WITH(FORMAT 'csv');
COPY t_text FROM '@abs_srcdir@/data/datanode1/t_text.data' WITH(FORMAT 'text');
COPY t_fixed FROM '@abs_srcdir@/data/datanode1/t_fixed.data' fixed formatter(id(0,2), c(2,300));
COPY t_binary FROM '@abs_srcdir@/data/datanode1/t_binary.data' WITH(FORMAT 'binary');
SELECT * FROM t;
SELECT * FROM t_csv;
SELECT * FROM t_text;
SELECT * FROM t_fixed;
SELECT * FROM t_binary;
\d t;
\d t_csv;
\d t_text;
\d t_fixed;
\d t_binary;

View File

@ -0,0 +1,112 @@
CREATE DATABASE db_gb18030 TEMPLATE template0 encoding 'GB18030' lc_ctype 'zh_CN.GB18030' lc_collate 'zh_CN.GB18030';
\c db_gb18030
create table t (id int,c varchar);
insert into t values (1,'����');
insert into t values (2,'���ԫ\');
insert into t values (3,'����3�\����4');
insert into t values (4,'�\����4');
insert into t values (5,'�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�[�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\');
insert into t values (6,'�9�0�9�1�9�2�9�3�9�4�9�5�9�5�9�6�9�7�9�8�9�0�9�1�9�2�9�3�9�4�9�5�9�6�9�7�9�8');
create table t_stdin (id int,c varchar);
create table t_csv (id int,c varchar);
create table t_text (id int,c varchar);
create table t_fixed (id int,c varchar);
create table t_binary (id int,c varchar);
COPY t TO '@abs_srcdir@/data/datanode1/t_csv.data' WITH(FORMAT 'csv');
COPY t TO '@abs_srcdir@/data/datanode1/t_text.data' WITH(FORMAT 'text');
COPY t TO '@abs_srcdir@/data/datanode1/t_fixed.data' fixed formatter(id(0,2), c(2,300));
COPY t TO '@abs_srcdir@/data/datanode1/t_binary.data' WITH(FORMAT 'binary');
COPY t_csv FROM '@abs_srcdir@/data/datanode1/t_csv.data' WITH(FORMAT 'csv');
COPY t_text FROM '@abs_srcdir@/data/datanode1/t_text.data' WITH(FORMAT 'text');
COPY t_fixed FROM '@abs_srcdir@/data/datanode1/t_fixed.data' fixed formatter(id(0,2), c(2,300));
COPY t_binary FROM '@abs_srcdir@/data/datanode1/t_binary.data' WITH(FORMAT 'binary');
SELECT * FROM t;
id | c
----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 | ����
2 | ���ԫ\
3 | ����3�\����4
4 | �\����4
5 | �\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�[�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\
6 | �9�0�9�1�9�2�9�3�9�4�9�5�9�5�9�6�9�7�9�8�9�0�9�1�9�2�9�3�9�4�9�5�9�6�9�7�9�8
(6 rows)
SELECT * FROM t_csv;
id | c
----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 | ����
2 | ���ԫ\
3 | ����3�\����4
4 | �\����4
5 | �\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�[�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\
6 | �9�0�9�1�9�2�9�3�9�4�9�5�9�5�9�6�9�7�9�8�9�0�9�1�9�2�9�3�9�4�9�5�9�6�9�7�9�8
(6 rows)
SELECT * FROM t_text;
id | c
----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 | ����
2 | ���ԫ\
3 | ����3�\����4
4 | �\����4
5 | �\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�[�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\
6 | �9�0�9�1�9�2�9�3�9�4�9�5�9�5�9�6�9�7�9�8�9�0�9�1�9�2�9�3�9�4�9�5�9�6�9�7�9�8
(6 rows)
SELECT * FROM t_fixed;
id | c
----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 | ����
2 | ���ԫ\
3 | ����3�\����4
4 | �\����4
5 | �\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�[�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\
6 | �9�0�9�1�9�2�9�3�9�4�9�5�9�5�9�6�9�7�9�8�9�0�9�1�9�2�9�3�9�4�9�5�9�6�9�7�9�8
(6 rows)
SELECT * FROM t_binary;
id | c
----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 | ����
2 | ���ԫ\
3 | ����3�\����4
4 | �\����4
5 | �\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�[�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\
6 | �9�0�9�1�9�2�9�3�9�4�9�5�9�5�9�6�9�7�9�8�9�0�9�1�9�2�9�3�9�4�9�5�9�6�9�7�9�8
(6 rows)
\d t;
Table "public.t"
Column | Type | Modifiers
--------+-------------------+-----------
id | integer |
c | character varying |
\d t_csv;
Table "public.t_csv"
Column | Type | Modifiers
--------+-------------------+-----------
id | integer |
c | character varying |
\d t_text;
Table "public.t_text"
Column | Type | Modifiers
--------+-------------------+-----------
id | integer |
c | character varying |
\d t_fixed;
Table "public.t_fixed"
Column | Type | Modifiers
--------+-------------------+-----------
id | integer |
c | character varying |
\d t_binary;
Table "public.t_binary"
Column | Type | Modifiers
--------+-------------------+-----------
id | integer |
c | character varying |

View File

@ -0,0 +1,106 @@
CREATE DATABASE db_gbk_test TEMPLATE template0 encoding 'GBK' lc_ctype 'zh_CN.GBK' lc_collate 'zh_CN.GBK';
\c db_gbk_test
create table t (id int,c varchar);
insert into t values (1,'����');
insert into t values (2,'���ԫ\');
insert into t values (3,'����3�\����4');
insert into t values (4,'�\����4');
insert into t values (5,'�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�[�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\');
create table t_stdin (id int,c varchar);
create table t_csv (id int,c varchar);
create table t_text (id int,c varchar);
create table t_fixed (id int,c varchar);
create table t_binary (id int,c varchar);
COPY t TO '@abs_srcdir@/data/datanode1/t_csv.data' WITH(FORMAT 'csv');
COPY t TO '@abs_srcdir@/data/datanode1/t_text.data' WITH(FORMAT 'text');
COPY t TO '@abs_srcdir@/data/datanode1/t_fixed.data' fixed formatter(id(0,2), c(2,300));
COPY t TO '@abs_srcdir@/data/datanode1/t_binary.data' WITH(FORMAT 'binary');
COPY t_csv FROM '@abs_srcdir@/data/datanode1/t_csv.data' WITH(FORMAT 'csv');
COPY t_text FROM '@abs_srcdir@/data/datanode1/t_text.data' WITH(FORMAT 'text');
COPY t_fixed FROM '@abs_srcdir@/data/datanode1/t_fixed.data' fixed formatter(id(0,2), c(2,300));
COPY t_binary FROM '@abs_srcdir@/data/datanode1/t_binary.data' WITH(FORMAT 'binary');
SELECT * FROM t;
id | c
----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 | ����
2 | ���ԫ\
3 | ����3�\����4
4 | �\����4
5 | �\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�[�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\
(5 rows)
SELECT * FROM t_csv;
id | c
----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 | ����
2 | ���ԫ\
3 | ����3�\����4
4 | �\����4
5 | �\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�[�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\
(5 rows)
SELECT * FROM t_text;
id | c
----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 | ����
2 | ���ԫ\
3 | ����3�\����4
4 | �\����4
5 | �\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�[�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\
(5 rows)
SELECT * FROM t_fixed;
id | c
----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 | ����
2 | ���ԫ\
3 | ����3�\����4
4 | �\����4
5 | �\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�[�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\
(5 rows)
SELECT * FROM t_binary;
id | c
----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 | ����
2 | ���ԫ\
3 | ����3�\����4
4 | �\����4
5 | �\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�[�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\�\
(5 rows)
\d t;
Table "public.t"
Column | Type | Modifiers
--------+-------------------+-----------
id | integer |
c | character varying |
\d t_csv;
Table "public.t_csv"
Column | Type | Modifiers
--------+-------------------+-----------
id | integer |
c | character varying |
\d t_text;
Table "public.t_text"
Column | Type | Modifiers
--------+-------------------+-----------
id | integer |
c | character varying |
\d t_fixed;
Table "public.t_fixed"
Column | Type | Modifiers
--------+-------------------+-----------
id | integer |
c | character varying |
\d t_binary;
Table "public.t_binary"
Column | Type | Modifiers
--------+-------------------+-----------
id | integer |
c | character varying |

View File

@ -624,7 +624,7 @@ test: interval tinterval macaddr tstypes comments
# is concurrent safe.(duplicate)
# ----------
test: copyselect copy_error_log copy_support_transform copy_from_support_parallel
test: copy_new_gram
test: copy_new_gram
# copy_gbk_test and copy_gb18030_test both COPY to and from the same
# data/datanode1/t_{csv,text,fixed,binary}.data files, so they must not
# run in the same parallel group: keep each on its own line.
test: copy_gbk_test
test: copy_gb18030_test
#test: copy_eol
# ----------

View File

@ -186,7 +186,7 @@ test: interval tinterval macaddr tstypes comments
# is concurrent safe.(duplicate)
# ----------
test: copyselect copy_error_log copy_support_transform copy_from_support_parallel
test: copy_new_gram
test: copy_new_gram
# copy_gbk_test and copy_gb18030_test both COPY to and from the same
# data/datanode1/t_{csv,text,fixed,binary}.data files, so they must not
# run in the same parallel group: keep each on its own line.
test: copy_gbk_test
test: copy_gb18030_test
#test: copy_eol
# ----------