[pick](json-serde)pick jsonb string deserialize with spec char (#38711)

## Proposed changes
backport: https://github.com/apache/doris/pull/37176
Issue Number: close #xxx

<!--Describe your changes.-->
This commit is contained in:
amory
2024-08-02 13:37:41 +08:00
committed by GitHub
parent d800434859
commit 9b07cd2069
5 changed files with 157 additions and 6 deletions

View File

@ -73,19 +73,59 @@ public:
auto result = check_column_const_set_readability(column, row_num);
ColumnPtr ptr = result.first;
row_num = result.second;
if (_nesting_level > 1) {
bw.write('"');
}
const auto& value = assert_cast<const ColumnType&>(*ptr).get_data_at(row_num);
bw.write(value.data, value.size);
if (_nesting_level > 1) {
bw.write('"');
}
if constexpr (std::is_same_v<ColumnType, ColumnString>) {
if (options.escape_char != 0) {
// we should make deal with some special characters in json str if we have escape_char
StringRef str_ref = value;
write_with_escaped_char_to_json(str_ref, bw);
} else {
bw.write(value.data, value.size);
}
} else {
bw.write(value.data, value.size);
}
if (_nesting_level > 1) {
bw.write('"');
}
return Status::OK();
}
inline void write_with_escaped_char_to_json(StringRef value, BufferWritable& bw) const {
for (char it : value) {
switch (it) {
case '\b':
bw.write("\\b", 2);
break;
case '\f':
bw.write("\\f", 2);
break;
case '\n':
bw.write("\\n", 2);
break;
case '\r':
bw.write("\\r", 2);
break;
case '\t':
bw.write("\\t", 2);
break;
case '\\':
bw.write("\\\\", 2);
break;
case '"':
bw.write("\\\"", 2);
break;
default:
bw.write(it);
}
}
}
Status serialize_column_to_json(const IColumn& column, int start_idx, int end_idx,
BufferWritable& bw, FormatOptions& options) const override {
SERIALIZE_COLUMN_TO_JSON();

View File

@ -785,6 +785,7 @@ struct ConvertImplGenericToJsonb {
auto tmp_col = ColumnString::create();
vectorized::DataTypeSerDe::FormatOptions options;
options.escape_char = '\\';
for (size_t i = 0; i < input_rows_count; i++) {
// convert to string
tmp_col->clear();

View File

@ -0,0 +1,4 @@
1 \N
2 ['{\'x\' : \'{"y" : 1}\', \'t\' : \'{"y" : 2}\'}', '{"x" : 1}']
3 ['foo\'bar', 'foo"bar', 'foo\\'bar', 'foo\'\'bar']
4 ['\/some\/cool\/url', '/some/cool/url', 'a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e']
Can't render this file because it contains an unexpected character in line 2 and column 17.

View File

@ -0,0 +1,27 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !select_1 --
1 \N
2 ["{'x' : '{"y" : 1}', 't' : '{"y" : 2}'}", "{"x" : 1}"]
3 ["foo'bar', 'foo"bar', 'foo\\'bar', 'foo''bar"]
4 ["/some/cool/url", "/some/cool/url", "a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e"]
-- !select_2 --
1 \N
2 ["{'x' : '{"y" : 1}', 't' : '{"y" : 2}'}", "{"x" : 1}"]
3 ["foo'bar', 'foo"bar', 'foo\\'bar', 'foo''bar"]
4 ["/some/cool/url", "/some/cool/url", "a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e"]
27 ["{"k1":"v1", "k2": 200}"]
28 ["{"a.b.c":{"k1.a1":"v31", "k2": 300},"a":"niu"}"]
29 [" \n\r", " \n\r"]
30 ["f\r\n", "f\r\n""]
-- !select_json --
1 \N
2 ["{'x' : '{\\"y\\" : 1}', 't' : '{\\"y\\" : 2}'}","{\\"x\\" : 1}"]
3 ["foo'bar', 'foo\\"bar', 'foo\\\\'bar', 'foo''bar"]
4 ["/some/cool/url","/some/cool/url","a\\\\_\\\\c\\\\l\\\\i\\\\c\\\\k\\\\h\\\\o\\\\u\\\\s\\\\e"]
27 ["{\\"k1\\":\\"v1\\", \\"k2\\": 200}"]
28 ["{\\"a.b.c\\":{\\"k1.a1\\":\\"v31\\", \\"k2\\": 300},\\"a\\":\\"niu\\"}"]
29 ["\\f\\n\\r","\\f\\n\\r"]
30 ["f\\b\\r\\n","f\\b\\r\\n\\""]

View File

@ -0,0 +1,79 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
import org.codehaus.groovy.runtime.IOGroovyMethods
suite("test_jsonb_cast", "p0") {
// define a sql table with array<text> which has some Escape Character and should also to cast to json
def testTable = "tbl_test_array_text_cast_jsonb"
def dataFile = "test_jsonb_cast.csv"
sql """ set experimental_enable_nereids_planner = true """
sql """ set enable_fallback_to_original_planner = true """
sql "DROP TABLE IF EXISTS ${testTable}"
sql """
CREATE TABLE IF NOT EXISTS ${testTable} (
id INT,
a ARRAY<TEXT>,
)
DUPLICATE KEY(id)
DISTRIBUTED BY HASH(id) BUCKETS 3
PROPERTIES("replication_num" = "1");
"""
// load the jsonb data from csv file
streamLoad {
table testTable
file dataFile // import csv file
time 10000 // limit inflight 10s
set 'strict_mode', 'true'
// if declared a check callback, the default check condition will ignore.
// So you must check all condition
check { result, exception, startTime, endTime ->
if (exception != null) {
throw exception
}
log.info("Stream load result: ${result}".toString())
def json = parseJson(result)
assertEquals(4, json.NumberTotalRows)
assertEquals(4, json.NumberLoadedRows)
assertTrue(json.LoadBytes > 0)
}
}
sql """ sync; """
// check result
qt_select_1 "SELECT * FROM ${testTable} ORDER BY id"
// insert into valid json rows
sql """INSERT INTO ${testTable} VALUES(27, ['{"k1":"v1", "k2": 200}'])"""
sql """INSERT INTO ${testTable} VALUES(28, ['{"a.b.c":{"k1.a1":"v31", "k2": 300},"a":"niu"}'])"""
sql """INSERT INTO ${testTable} VALUES(29, ['\f\n\r', "\f\n\r"])"""
sql """INSERT INTO ${testTable} VALUES(30, ["\\f\\b\\r\\n", '\\f\\b\\r\\n"'])"""
// check result
qt_select_2 "SELECT * FROM ${testTable} ORDER BY id"
// check cast as json
qt_select_json "SELECT id, cast(a as JSON) FROM ${testTable} ORDER BY id"
}