From 97d36b4f38cccda9796bad9bc2dcfa328ef20271 Mon Sep 17 00:00:00 2001 From: HHoflittlefish777 <77738092+HHoflittlefish777@users.noreply.github.com> Date: Sun, 3 Dec 2023 22:57:55 +0800 Subject: [PATCH] [fix](csv_reader) fix trim_double_quotes behavior change (#27882) --- be/src/util/slice.h | 20 +++++++ be/src/vec/exec/format/csv/csv_reader.cpp | 2 +- .../test_csv_with_double_quotes.out | 2 + .../test_double_quotes_with_enclose.csv | 1 + .../load_p0/stream_load/test_single_quote.csv | 1 + .../test_csv_with_double_quotes.groovy | 54 +++++++++++++++++++ 6 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 regression-test/data/load_p0/stream_load/test_double_quotes_with_enclose.csv create mode 100644 regression-test/data/load_p0/stream_load/test_single_quote.csv diff --git a/be/src/util/slice.h b/be/src/util/slice.h index dce933e558..ed119b8051 100644 --- a/be/src/util/slice.h +++ b/be/src/util/slice.h @@ -176,6 +176,26 @@ public: } return change; } + + /// Remove quote char '"' which should exist as first and last char. + /// + /// A slice holding only the two quote chars becomes empty. + /// + /// @note Only the base and bounds of the slice are changed; + /// the data is not modified. + /// + /// @return + /// true if both enclosing quotes were dropped, false if the slice is unchanged. + bool trim_double_quotes() { + int32_t begin = 0; + if (size >= 2 && (data[begin] == '"' && data[size - 1] == '"')) { + data += 1; + size -= 2; + return true; + } + return false; + } + /// Truncate the slice to the given number of bytes. 
/// /// @pre n <= size diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp index 9440794167..fced6bdd49 100644 --- a/be/src/vec/exec/format/csv/csv_reader.cpp +++ b/be/src/vec/exec/format/csv/csv_reader.cpp @@ -619,7 +619,7 @@ Status CsvReader::_create_decompressor() { template Status CsvReader::deserialize_nullable_string(IColumn& column, Slice& slice) { auto& null_column = assert_cast(column); - if (!(from_json && _options.converted_from_string && slice.trim_quote())) { + if (!(from_json && _options.converted_from_string && slice.trim_double_quotes())) { if (slice.size == 2 && slice[0] == '\\' && slice[1] == 'N') { null_column.insert_data(nullptr, 0); return Status::OK(); diff --git a/regression-test/data/load_p0/stream_load/test_csv_with_double_quotes.out b/regression-test/data/load_p0/stream_load/test_csv_with_double_quotes.out index 0ae5ebe7f7..13a0dbaee9 100644 --- a/regression-test/data/load_p0/stream_load/test_csv_with_double_quotes.out +++ b/regression-test/data/load_p0/stream_load/test_csv_with_double_quotes.out @@ -19,3 +19,5 @@ 7 8 3 abc 2022-12-01 2022-12-01T09:30:31 8 9 3 abc 2022-12-01 2022-12-01T09:30:31 +-- !sql_test_single_quote -- +'a' 21 'b' diff --git a/regression-test/data/load_p0/stream_load/test_double_quotes_with_enclose.csv b/regression-test/data/load_p0/stream_load/test_double_quotes_with_enclose.csv new file mode 100644 index 0000000000..d77096361a --- /dev/null +++ b/regression-test/data/load_p0/stream_load/test_double_quotes_with_enclose.csv @@ -0,0 +1 @@ +?"a"?,21,?"b"? 
\ No newline at end of file diff --git a/regression-test/data/load_p0/stream_load/test_single_quote.csv b/regression-test/data/load_p0/stream_load/test_single_quote.csv new file mode 100644 index 0000000000..621abae313 --- /dev/null +++ b/regression-test/data/load_p0/stream_load/test_single_quote.csv @@ -0,0 +1 @@ +'a',21,'b' \ No newline at end of file diff --git a/regression-test/suites/load_p0/stream_load/test_csv_with_double_quotes.groovy b/regression-test/suites/load_p0/stream_load/test_csv_with_double_quotes.groovy index 1743d28d11..03a31f7997 100644 --- a/regression-test/suites/load_p0/stream_load/test_csv_with_double_quotes.groovy +++ b/regression-test/suites/load_p0/stream_load/test_csv_with_double_quotes.groovy @@ -59,4 +59,58 @@ suite("test_csv_with_double_quotes", "p0") { sql "sync" qt_sql "select * from ${tableName} order by k1, k2" sql """ DROP TABLE IF EXISTS ${tableName} """ + + def create_table = { testTablex -> + sql """ + CREATE TABLE `${testTablex}` ( + `name` varchar(48) NULL, + `age` bigint(20) NULL, + `agent_id` varchar(256) NULL + ) ENGINE=OLAP + DUPLICATE KEY(`name`) + COMMENT 'OLAP' + DISTRIBUTED BY RANDOM BUCKETS 10 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "is_being_synced" = "false", + "storage_format" = "V2", + "light_schema_change" = "true", + "disable_auto_compaction" = "false", + "enable_single_replica_compaction" = "false" + ); + """ + } + + def tableName1 = "test_single_quotes" + try { + sql "DROP TABLE IF EXISTS ${tableName1}" + + create_table.call(tableName1) + + streamLoad { + table "${tableName1}" + + set 'column_separator', ',' + set 'trim_double_quotes', 'true' + + file 'test_single_quote.csv' + + check { result, exception, startTime, endTime -> + if (exception != null) { + throw exception + } + log.info("Stream load result: ${result}".toString()) + def json = parseJson(result) + assertEquals("success", json.Status.toLowerCase()) + assertEquals(1, json.NumberTotalRows) + assertEquals(0, 
json.NumberFilteredRows) + assertEquals(0, json.NumberUnselectedRows) + } + } + + qt_sql_test_single_quote "SELECT * FROM ${tableName1} order by name" + + } finally { + sql "DROP TABLE IF EXISTS ${tableName1}" + } }