From 6c0af24e9dafeb9882ec6aecb682cffdbf8bf39c Mon Sep 17 00:00:00 2001 From: lihangyu <15605149486@163.com> Date: Thu, 13 Apr 2023 21:58:44 +0800 Subject: [PATCH] [Improve](simdjson reader) support UTF-8 unicode (with BOM) (#18585) --- .../vec/exec/format/json/new_json_reader.cpp | 48 ++++++++++++++----- .../load_p0/stream_load/simple_json_bom.json | 12 +++++ .../load_p0/stream_load/test_json_load.out | 13 +++++ .../load_p0/stream_load/test_json_load.groovy | 15 ++++++ 4 files changed, 76 insertions(+), 12 deletions(-) create mode 100644 regression-test/data/load_p0/stream_load/simple_json_bom.json diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp index 95e0fc0535..a5241aef3d 100644 --- a/be/src/vec/exec/format/json/new_json_reader.cpp +++ b/be/src/vec/exec/format/json/new_json_reader.cpp @@ -17,6 +17,8 @@ #include "vec/exec/format/json/new_json_reader.h" +#include + #include "common/compiler_util.h" #include "exprs/json_functions.h" #include "io/file_factory.h" @@ -28,6 +30,7 @@ #include "runtime/descriptors.h" #include "runtime/runtime_state.h" #include "util/defer_op.h" +#include "util/string_util.h" #include "vec/core/block.h" #include "vec/exec/format/file_reader/new_plain_text_line_reader.h" #include "vec/exec/scan/vscanner.h" @@ -1549,6 +1552,13 @@ Status NewJsonReader::_simdjson_parse_json_doc(size_t* size, bool* eof) { _simdjson_ondemand_padding_buffer.resize(*size + simdjson::SIMDJSON_PADDING); _padded_size = *size + simdjson::SIMDJSON_PADDING; } + // trim BOM since simdjson does not handle UTF-8 Unicode (with BOM) + if (*size >= 3 && static_cast(json_str[0]) == '\xEF' && + static_cast(json_str[1]) == '\xBB' && static_cast(json_str[2]) == '\xBF') { + // skip the first three BOM bytes + json_str += 3; + *size -= 3; + } memcpy(&_simdjson_ondemand_padding_buffer.front(), json_str, *size); auto error = _ondemand_json_parser @@ -1576,12 +1586,25 @@ Status NewJsonReader::_simdjson_parse_json_doc(size_t* size, bool* eof) { error, simdjson::error_message(error)); return return_quality_error(error_msg, std::string((char*)json_str, *size)); } - try { - // set json root - // if it is an array at top level, then we should iterate the entire array in - // ::_simdjson_handle_flat_array_complex_json - if (_parsed_json_root.size() != 0 && - _original_json_doc.type() == simdjson::ondemand::json_type::object) { + auto type_res = _original_json_doc.type(); + if (type_res.error() != simdjson::error_code::SUCCESS) { + fmt::memory_buffer error_msg; + fmt::format_to(error_msg, "Parse json data for JsonDoc failed. code: {}, error info: {}", + type_res.error(), simdjson::error_message(type_res.error())); + return return_quality_error(error_msg, std::string((char*)json_str, *size)); + } + simdjson::ondemand::json_type type = type_res.value(); + if (type != simdjson::ondemand::json_type::object && + type != simdjson::ondemand::json_type::array) { + fmt::memory_buffer error_msg; + fmt::format_to(error_msg, "Not an json object or json array"); + return return_quality_error(error_msg, std::string((char*)json_str, *size)); + } + if (_parsed_json_root.size() != 0 && type == simdjson::ondemand::json_type::object) { + try { + // set json root + // if it is an array at top level, then we should iterate the entire array in + // ::_simdjson_handle_flat_array_complex_json simdjson::ondemand::object object = _original_json_doc; Status st = JsonFunctions::extract_from_object(object, _parsed_json_root, &_json_value); if (!st.ok()) { @@ -1589,13 +1612,14 @@ Status NewJsonReader::_simdjson_parse_json_doc(size_t* size, bool* eof) { fmt::format_to(error_msg, "{}", st.to_string()); return return_quality_error(error_msg, std::string((char*)json_str, *size)); } - } else { - _json_value = _original_json_doc; + } catch (simdjson::simdjson_error& e) { + fmt::memory_buffer error_msg; + fmt::format_to(error_msg, "Encounter error while extract_from_object, error: {}", + e.what()); + return return_quality_error(error_msg, std::string((char*)json_str, *size)); } - } catch (simdjson::simdjson_error& e) { - fmt::memory_buffer error_msg; - fmt::format_to(error_msg, "Encounter error while extract_from_object, error: {}", e.what()); - return return_quality_error(error_msg, std::string((char*)json_str, *size)); + } else { + _json_value = _original_json_doc; } if (_json_value.type() == simdjson::ondemand::json_type::array && !_strip_outer_array) { diff --git a/regression-test/data/load_p0/stream_load/simple_json_bom.json b/regression-test/data/load_p0/stream_load/simple_json_bom.json new file mode 100644 index 0000000000..8c9b3c4558 --- /dev/null +++ b/regression-test/data/load_p0/stream_load/simple_json_bom.json @@ -0,0 +1,12 @@ +[ + {"id": 1, "city": "beijing", "code": 2345671}, + {"id": 2, "city": "shanghai", "code": 2345672}, + {"id": 3, "city": "guangzhou", "code": 2345673}, + {"id": 4, "city": "shenzhen", "code": 2345674}, + {"id": 5, "city": "hangzhou", "code": 2345675}, + {"id": 6, "city": "nanjing", "code": 2345676}, + {"id": 7, "city": "wuhan", "code": 2345677}, + {"id": 8, "city": "chengdu", "code": 2345678}, + {"id": 9, "city": "xian", "code": 2345679}, + {"id": 10, "city": "hefei", "code": 23456710} +] \ No newline at end of file diff --git a/regression-test/data/load_p0/stream_load/test_json_load.out b/regression-test/data/load_p0/stream_load/test_json_load.out index 6abf0cd771..0ad43627c4 100644 --- a/regression-test/data/load_p0/stream_load/test_json_load.out +++ b/regression-test/data/load_p0/stream_load/test_json_load.out @@ -186,3 +186,16 @@ 2 shanghai 2345672 200 changsha 3456789 +-- !select1 -- +1 beijing 2345671 +2 shanghai 2345672 +3 guangzhou 2345673 +4 shenzhen 2345674 +5 hangzhou 2345675 +6 nanjing 2345676 +7 wuhan 2345677 +8 chengdu 2345678 +9 xian 2345679 +10 hefei 23456710 +200 changsha 3456789 + diff --git a/regression-test/suites/load_p0/stream_load/test_json_load.groovy b/regression-test/suites/load_p0/stream_load/test_json_load.groovy index 35bb9ffa5d..4e3d2dbf81 100644 --- a/regression-test/suites/load_p0/stream_load/test_json_load.groovy +++ b/regression-test/suites/load_p0/stream_load/test_json_load.groovy @@ -546,6 +546,21 @@ suite("test_json_load", "p0") { try_sql("DROP TABLE IF EXISTS ${testTable}") } + // case20: import json with BOM file + try { + sql "DROP TABLE IF EXISTS ${testTable}" + + create_test_table1.call(testTable) + + load_json_data.call('test_json_load_case1_2', 'true', '', 'json', '', '', '', '', '', 'simple_json_bom.json') + + sql "sync" + qt_select1 "select * from ${testTable} order by id" + + } finally { + try_sql("DROP TABLE IF EXISTS ${testTable}") + } + // if 'enableHdfs' in regression-conf.groovy has been set to true, // the test will run these case as below. if (enableHdfs()) {