Files
doris/be/src/exec/json_scanner.h
worker24h fdcc223ad2 [Bug][Json] Refactor the JSON load logic to fix some bugs
1. Add `json_root` for nested JSON data.
2. Remove `_jmap` to make the logic more reasonable.
2020-07-30 10:36:34 +08:00

154 lines
5.5 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef BE_SRC_JSON_SCANNER_H_
#define BE_SRC_JSON_SCANNER_H_
#include <memory>
#include <vector>
#include <string>
#include <map>
#include <sstream>
#include <rapidjson/document.h>
#include <rapidjson/error/en.h>
#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>
#include "exec/base_scanner.h"
#include "common/status.h"
#include "gen_cpp/PlanNodes_types.h"
#include "gen_cpp/Types_types.h"
#include "util/slice.h"
#include "util/runtime_profile.h"
#include "runtime/mem_pool.h"
#include "runtime/tuple.h"
#include "runtime/descriptors.h"
#include "runtime/stream_load/load_stream_mgr.h"
#include "runtime/small_file_mgr.h"
namespace doris {
class Tuple;
class SlotDescriptor;
class RuntimeState;
class TupleDescriptor;
class MemTracker;
class JsonReader;
// Scanner for JSON data. Derives from BaseScanner and overrides its
// open() / get_next() / close() entry points.
class JsonScanner : public BaseScanner {
public:
    JsonScanner(RuntimeState* state,
                RuntimeProfile* profile,
                const TBrokerScanRangeParams& params,
                const std::vector<TBrokerRangeDesc>& ranges,
                const std::vector<TNetworkAddress>& broker_addresses,
                ScannerCounter* counter);
    ~JsonScanner();

    // Opens the scanner and initializes the information it needs.
    Status open() override;

    // Fetches the next tuple.
    // NOTE(review): presumably `*eof` is set once there is nothing left to
    // read -- confirm against the implementation.
    Status get_next(Tuple* tuple, MemPool* tuple_pool, bool* eof) override;

    // Closes the scanner.
    void close() override;

private:
    Status open_next_reader();

    // Both vectors are stored as references, not copies.
    // NOTE(review): assuming the constructor binds them to its `ranges` /
    // `broker_addresses` arguments, those vectors must outlive this scanner.
    const std::vector<TBrokerRangeDesc>& _ranges;
    const std::vector<TNetworkAddress>& _broker_addresses;

    std::string _jsonpath;
    std::string _jsonpath_file;

    // Holds the current StreamLoadPipe.
    std::shared_ptr<StreamLoadPipe> _stream_load_pipe;

    // Reader for the current file.
    // NOTE(review): raw pointer; ownership is not visible from this header.
    JsonReader* _cur_file_reader;

    // NOTE(review): presumably the index into _ranges of the next range to open.
    int _next_range;

    // Whether the current file has been read to the end.
    bool _cur_file_eof;

    // NOTE(review): presumably true once every range has been consumed.
    bool _scanner_eof;
};
// Pairs a pointer to a rapidjson value with a const value iterator.
// NOTE(review): presumably used to walk the elements of a JSON array one at
// a time -- the constructor and get_next() are defined outside this header.
class JsonDataInternal {
public:
    JsonDataInternal(rapidjson::Value* v);
    ~JsonDataInternal() {}

    // NOTE(review): presumably returns the current element and advances the
    // internal iterator; the end-of-data behaviour is not visible here.
    rapidjson::Value::ConstValueIterator get_next();

    // True when the stored value pointer is null.
    bool is_null() const { return nullptr == _json_values; }

private:
    rapidjson::Value* _json_values;
    rapidjson::Value::ConstValueIterator _iterator;
};
struct JsonPath;
// Reader to parse the json.
// For most of its methods which return type is Status,
// return Status::OK() if process succeed or encounter data quality error.
// return other error Status if encounter other errors.
class JsonReader {
public:
JsonReader(RuntimeState* state, ScannerCounter* counter, RuntimeProfile* profile, FileReader* file_reader,
bool strip_outer_array);
~JsonReader();
Status init(const std::string& jsonpath, const std::string& json_root); // must call before use
Status read(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool* eof);
private:
Status (JsonReader::*_handle_json_callback)(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool* eof);
Status _handle_simple_json(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool* eof);
Status _handle_flat_array_complex_json(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool* eof);
Status _handle_nested_complex_json(Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool* eof);
void _fill_slot(Tuple* tuple, SlotDescriptor* slot_desc, MemPool* mem_pool, const uint8_t* value, int32_t len);
Status _parse_json_doc(bool* eof);
void _set_tuple_value(rapidjson::Value& objectValue, Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs, MemPool* tuple_pool, bool *valid);
void _write_data_to_tuple(rapidjson::Value::ConstValueIterator value, SlotDescriptor* desc, Tuple* tuple, MemPool* tuple_pool, bool* valid);
bool _write_values_by_jsonpath(rapidjson::Value& objectValue, MemPool* tuple_pool, Tuple* tuple, const std::vector<SlotDescriptor*>& slot_descs);
std::string _print_json_value(const rapidjson::Value& value);
std::string _print_jsonpath(const std::vector<JsonPath>& path);
void _close();
Status _generate_json_paths(const std::string& jsonpath, std::vector<std::vector<JsonPath>>* vect);
private:
int _next_line;
int _total_lines;
RuntimeState* _state;
ScannerCounter* _counter;
RuntimeProfile* _profile;
FileReader*_file_reader;
bool _closed;
bool _strip_outer_array;
RuntimeProfile::Counter* _bytes_read_counter;
RuntimeProfile::Counter* _read_timer;
std::vector<std::vector<JsonPath>> _parsed_jsonpaths;
std::vector<JsonPath> _parsed_json_root;
rapidjson::Document _origin_json_doc; // origin json document object from parsed json string
rapidjson::Value *_json_doc; // _json_doc equals _final_json_doc iff not set `json_root`
};
} // end namespace doris
#endif