[Improve](simdjson reader) support UTF-8 unicode (with BOM) (#18585)
This commit is contained in:
@ -17,6 +17,8 @@
|
||||
|
||||
#include "vec/exec/format/json/new_json_reader.h"
|
||||
|
||||
#include <simdjson/error.h>
|
||||
|
||||
#include "common/compiler_util.h"
|
||||
#include "exprs/json_functions.h"
|
||||
#include "io/file_factory.h"
|
||||
@ -28,6 +30,7 @@
|
||||
#include "runtime/descriptors.h"
|
||||
#include "runtime/runtime_state.h"
|
||||
#include "util/defer_op.h"
|
||||
#include "util/string_util.h"
|
||||
#include "vec/core/block.h"
|
||||
#include "vec/exec/format/file_reader/new_plain_text_line_reader.h"
|
||||
#include "vec/exec/scan/vscanner.h"
|
||||
@ -1549,6 +1552,13 @@ Status NewJsonReader::_simdjson_parse_json_doc(size_t* size, bool* eof) {
|
||||
_simdjson_ondemand_padding_buffer.resize(*size + simdjson::SIMDJSON_PADDING);
|
||||
_padded_size = *size + simdjson::SIMDJSON_PADDING;
|
||||
}
|
||||
// trim BOM since simdjson does not handle UTF-8 Unicode (with BOM)
|
||||
if (*size >= 3 && static_cast<char>(json_str[0]) == '\xEF' &&
|
||||
static_cast<char>(json_str[1]) == '\xBB' && static_cast<char>(json_str[2]) == '\xBF') {
|
||||
// skip the first three BOM bytes
|
||||
json_str += 3;
|
||||
*size -= 3;
|
||||
}
|
||||
memcpy(&_simdjson_ondemand_padding_buffer.front(), json_str, *size);
|
||||
auto error =
|
||||
_ondemand_json_parser
|
||||
@ -1576,12 +1586,25 @@ Status NewJsonReader::_simdjson_parse_json_doc(size_t* size, bool* eof) {
|
||||
error, simdjson::error_message(error));
|
||||
return return_quality_error(error_msg, std::string((char*)json_str, *size));
|
||||
}
|
||||
try {
|
||||
// set json root
|
||||
// if it is an array at top level, then we should iterate the entire array in
|
||||
// ::_simdjson_handle_flat_array_complex_json
|
||||
if (_parsed_json_root.size() != 0 &&
|
||||
_original_json_doc.type() == simdjson::ondemand::json_type::object) {
|
||||
auto type_res = _original_json_doc.type();
|
||||
if (type_res.error() != simdjson::error_code::SUCCESS) {
|
||||
fmt::memory_buffer error_msg;
|
||||
fmt::format_to(error_msg, "Parse json data for JsonDoc failed. code: {}, error info: {}",
|
||||
type_res.error(), simdjson::error_message(type_res.error()));
|
||||
return return_quality_error(error_msg, std::string((char*)json_str, *size));
|
||||
}
|
||||
simdjson::ondemand::json_type type = type_res.value();
|
||||
if (type != simdjson::ondemand::json_type::object &&
|
||||
type != simdjson::ondemand::json_type::array) {
|
||||
fmt::memory_buffer error_msg;
|
||||
fmt::format_to(error_msg, "Not an json object or json array");
|
||||
return return_quality_error(error_msg, std::string((char*)json_str, *size));
|
||||
}
|
||||
if (_parsed_json_root.size() != 0 && type == simdjson::ondemand::json_type::object) {
|
||||
try {
|
||||
// set json root
|
||||
// if it is an array at top level, then we should iterate the entire array in
|
||||
// ::_simdjson_handle_flat_array_complex_json
|
||||
simdjson::ondemand::object object = _original_json_doc;
|
||||
Status st = JsonFunctions::extract_from_object(object, _parsed_json_root, &_json_value);
|
||||
if (!st.ok()) {
|
||||
@ -1589,13 +1612,14 @@ Status NewJsonReader::_simdjson_parse_json_doc(size_t* size, bool* eof) {
|
||||
fmt::format_to(error_msg, "{}", st.to_string());
|
||||
return return_quality_error(error_msg, std::string((char*)json_str, *size));
|
||||
}
|
||||
} else {
|
||||
_json_value = _original_json_doc;
|
||||
} catch (simdjson::simdjson_error& e) {
|
||||
fmt::memory_buffer error_msg;
|
||||
fmt::format_to(error_msg, "Encounter error while extract_from_object, error: {}",
|
||||
e.what());
|
||||
return return_quality_error(error_msg, std::string((char*)json_str, *size));
|
||||
}
|
||||
} catch (simdjson::simdjson_error& e) {
|
||||
fmt::memory_buffer error_msg;
|
||||
fmt::format_to(error_msg, "Encounter error while extract_from_object, error: {}", e.what());
|
||||
return return_quality_error(error_msg, std::string((char*)json_str, *size));
|
||||
} else {
|
||||
_json_value = _original_json_doc;
|
||||
}
|
||||
|
||||
if (_json_value.type() == simdjson::ondemand::json_type::array && !_strip_outer_array) {
|
||||
|
||||
@ -0,0 +1,12 @@
|
||||
[
|
||||
{"id": 1, "city": "beijing", "code": 2345671},
|
||||
{"id": 2, "city": "shanghai", "code": 2345672},
|
||||
{"id": 3, "city": "guangzhou", "code": 2345673},
|
||||
{"id": 4, "city": "shenzhen", "code": 2345674},
|
||||
{"id": 5, "city": "hangzhou", "code": 2345675},
|
||||
{"id": 6, "city": "nanjing", "code": 2345676},
|
||||
{"id": 7, "city": "wuhan", "code": 2345677},
|
||||
{"id": 8, "city": "chengdu", "code": 2345678},
|
||||
{"id": 9, "city": "xian", "code": 2345679},
|
||||
{"id": 10, "city": "hefei", "code": 23456710}
|
||||
]
|
||||
@ -186,3 +186,16 @@
|
||||
2 shanghai 2345672
|
||||
200 changsha 3456789
|
||||
|
||||
-- !select1 --
|
||||
1 beijing 2345671
|
||||
2 shanghai 2345672
|
||||
3 guangzhou 2345673
|
||||
4 shenzhen 2345674
|
||||
5 hangzhou 2345675
|
||||
6 nanjing 2345676
|
||||
7 wuhan 2345677
|
||||
8 chengdu 2345678
|
||||
9 xian 2345679
|
||||
10 hefei 23456710
|
||||
200 changsha 3456789
|
||||
|
||||
|
||||
@ -546,6 +546,21 @@ suite("test_json_load", "p0") {
|
||||
try_sql("DROP TABLE IF EXISTS ${testTable}")
|
||||
}
|
||||
|
||||
// case20: import a JSON file that starts with a UTF-8 BOM
|
||||
try {
|
||||
sql "DROP TABLE IF EXISTS ${testTable}"
|
||||
|
||||
create_test_table1.call(testTable)
|
||||
|
||||
load_json_data.call('test_json_load_case1_2', 'true', '', 'json', '', '', '', '', '', 'simple_json_bom.json')
|
||||
|
||||
sql "sync"
|
||||
qt_select1 "select * from ${testTable} order by id"
|
||||
|
||||
} finally {
|
||||
try_sql("DROP TABLE IF EXISTS ${testTable}")
|
||||
}
|
||||
|
||||
// if 'enableHdfs' in regression-conf.groovy has been set to true,
|
||||
// the test will run the cases below.
|
||||
if (enableHdfs()) {
|
||||
|
||||
Reference in New Issue
Block a user