[Improvement](JSONB) improve performance of JSONB initial JSON parsing using simdjson (#15219)

test data: https://data.gharchive.org/2020-11-13-18.json.gz, 2GB, 197696 lines
before: String 13s vs. JSONB 28s
after: String 13s vs. JSONB 16s

**NOTICE: simdjson needs to be patched, since its BOOL conflicts with the macro BOOL defined in ODBC's sqltypes.h**
This commit is contained in:
Kang
2022-12-29 09:29:09 +08:00
committed by GitHub
parent 1b1083eb52
commit 0f3c0b78e3
6 changed files with 369 additions and 7 deletions

View File

@ -18,6 +18,7 @@
#include <boost/token_functions.hpp>
#include <vector>
// #include "util/jsonb_parser_simd.h"
#include "util/string_parser.hpp"
#include "util/string_util.h"
#include "vec/columns/column.h"
@ -47,7 +48,7 @@ enum class JsonbParseErrorMode { FAIL = 0, RETURN_NULL, RETURN_VALUE, RETURN_INV
template <NullalbeMode nullable_mode, JsonbParseErrorMode parse_error_handle_mode>
class FunctionJsonbParseBase : public IFunction {
private:
JsonbParser default_value_parser;
JsonbParserSIMD default_value_parser;
bool has_const_default_value = false;
public:
@ -193,6 +194,10 @@ public:
size_t size = col_from.size();
col_to->reserve(size);
// parser can be reused for performance
JsonbParserSIMD parser;
JsonbErrType error = JsonbErrType::E_NONE;
for (size_t i = 0; i < input_rows_count; ++i) {
if (col_from.is_null_at(i)) {
null_map->get_data()[i] = 1;
@ -201,8 +206,6 @@ public:
}
const auto& val = col_from_string->get_data_at(i);
JsonbParser parser;
JsonbErrType error = JsonbErrType::E_NONE;
if (parser.parse(val.data, val.size)) {
// insert jsonb format data
col_to->insert_data(parser.getWriter().getOutput()->getBuffer(),