// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "exec/parquet_writer.h"
|
|
|
|
#include <arrow/array.h>
|
|
#include <arrow/status.h>
|
|
#include <time.h>
|
|
|
|
#include "common/logging.h"
|
|
#include "exec/file_writer.h"
|
|
#include "gen_cpp/PaloBrokerService_types.h"
|
|
#include "gen_cpp/TPaloBrokerService.h"
|
|
#include "runtime/broker_mgr.h"
|
|
#include "runtime/client_cache.h"
|
|
#include "runtime/descriptors.h"
|
|
#include "runtime/exec_env.h"
|
|
#include "runtime/mem_pool.h"
|
|
#include "util/thrift_util.h"
|
|
#include "util/types.h"
|
|
|
|
namespace doris {

/// ParquetOutputStream
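// Adapts a Doris FileWriter to the arrow::io::OutputStream interface expected
// by parquet-cpp, tracking the current position (for Tell) and the total bytes
// written (exposed via get_written_len).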
ParquetOutputStream::ParquetOutputStream(FileWriter* file_writer)
        : _file_writer(file_writer), _cur_pos(0), _written_len(0) {
    set_mode(arrow::io::FileMode::WRITE);
}

ParquetOutputStream::~ParquetOutputStream() {
    arrow::Status st = Close();
    if (!st.ok()) {
        LOG(WARNING) << "close parquet file error: " << st.ToString();
    }
}

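// Forward the buffer to the underlying FileWriter and advance the position
// counters. Writes issued after Close() are ignored and reported as success.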
arrow::Status ParquetOutputStream::Write(const void* data, int64_t nbytes) {
    if (_is_closed) {
        return arrow::Status::OK();
    }
    size_t written_len = 0;
    Status st = _file_writer->write(static_cast<const uint8_t*>(data), nbytes, &written_len);
    if (!st.ok()) {
        return arrow::Status::IOError(st.get_error_msg());
    }
    _cur_pos += written_len;
    _written_len += written_len;
    return arrow::Status::OK();
}

arrow::Result<int64_t> ParquetOutputStream::Tell() const {
    return _cur_pos;
}

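// Close is idempotent: the first call closes the underlying FileWriter and
// latches _is_closed, so subsequent calls (including the destructor's) are no-ops.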
arrow::Status ParquetOutputStream::Close() {
    if (_is_closed) {
        return arrow::Status::OK();
    }
    Status st = _file_writer->close();
    if (!st.ok()) {
        LOG(WARNING) << "close parquet output stream failed: " << st.get_error_msg();
        return arrow::Status::IOError(st.get_error_msg());
    }
    _is_closed = true;
    return arrow::Status::OK();
}

int64_t ParquetOutputStream::get_written_len() {
    return _written_len;
}

void ParquetOutputStream::set_written_len(int64_t written_len) {
    _written_len = written_len;
}

/// ParquetWriterWrapper
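// Note that parse_schema() and init_parquet_writer() both return a Status that
// the constructor discards, so a construction failure only surfaces on the
// first write.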
ParquetWriterWrapper::ParquetWriterWrapper(FileWriter* file_writer,
                                           const std::vector<ExprContext*>& output_expr_ctxs,
                                           const std::map<std::string, std::string>& properties,
                                           const std::vector<std::vector<std::string>>& schema,
                                           bool output_object_data)
        : _output_expr_ctxs(output_expr_ctxs),
          _str_schema(schema),
          _cur_writed_rows(0),
          _rg_writer(nullptr),
          _output_object_data(output_object_data) {
    _outstream = std::make_shared<ParquetOutputStream>(file_writer);
    parse_properties(properties);
    parse_schema(schema);
    init_parquet_writer();
}

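// Translate user-supplied string properties into parquet::WriterProperties.
// Recognized keys: "compression" (snappy/gzip/brotli/zstd/lz4/lzo/bz2, anything
// else falls back to UNCOMPRESSED), "disable_dictionary" (true/false), and
// "version" (v1 selects PARQUET_1_0, anything else PARQUET_2_0). Unknown keys
// are silently ignored.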
void ParquetWriterWrapper::parse_properties(
        const std::map<std::string, std::string>& properties_map) {
    parquet::WriterProperties::Builder builder;
    for (auto it = properties_map.begin(); it != properties_map.end(); it++) {
        std::string property_name = it->first;
        std::string property_value = it->second;
        if (property_name == "compression") {
            // UNCOMPRESSED, SNAPPY, GZIP, BROTLI, ZSTD, LZ4, LZO, BZ2
            if (property_value == "snappy") {
                builder.compression(parquet::Compression::SNAPPY);
            } else if (property_value == "gzip") {
                builder.compression(parquet::Compression::GZIP);
            } else if (property_value == "brotli") {
                builder.compression(parquet::Compression::BROTLI);
            } else if (property_value == "zstd") {
                builder.compression(parquet::Compression::ZSTD);
            } else if (property_value == "lz4") {
                builder.compression(parquet::Compression::LZ4);
            } else if (property_value == "lzo") {
                builder.compression(parquet::Compression::LZO);
            } else if (property_value == "bz2") {
                builder.compression(parquet::Compression::BZ2);
            } else {
                builder.compression(parquet::Compression::UNCOMPRESSED);
            }
        } else if (property_name == "disable_dictionary") {
            // "disable_dictionary=true" disables dictionary encoding.
            if (property_value == "true") {
                builder.disable_dictionary();
            } else {
                builder.enable_dictionary();
            }
        } else if (property_name == "version") {
            if (property_value == "v1") {
                builder.version(parquet::ParquetVersion::PARQUET_1_0);
            } else {
                builder.version(parquet::ParquetVersion::PARQUET_2_0);
            }
        }
    }
    _properties = builder.build();
}

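// Build the Parquet GroupNode schema from string triples. Each column is
// described as {repetition, data_type, column_name}, e.g. (illustrative names):
//   {{"required", "int32", "user_id"},
//    {"optional", "byte_array", "user_name"}}
// Unrecognized repetition or type strings map to UNDEFINED.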
Status ParquetWriterWrapper::parse_schema(const std::vector<std::vector<std::string>>& schema) {
    parquet::schema::NodeVector fields;
    for (auto column = schema.begin(); column != schema.end(); column++) {
        std::string repetition_type = (*column)[0];
        parquet::Repetition::type parquet_repetition_type = parquet::Repetition::REQUIRED;
        if (repetition_type.find("required") != std::string::npos) {
            parquet_repetition_type = parquet::Repetition::REQUIRED;
        } else if (repetition_type.find("repeated") != std::string::npos) {
            parquet_repetition_type = parquet::Repetition::REPEATED;
        } else if (repetition_type.find("optional") != std::string::npos) {
            parquet_repetition_type = parquet::Repetition::OPTIONAL;
        } else {
            parquet_repetition_type = parquet::Repetition::UNDEFINED;
        }

        std::string data_type = (*column)[1];
        parquet::Type::type parquet_data_type = parquet::Type::BYTE_ARRAY;
        if (data_type == "boolean") {
            parquet_data_type = parquet::Type::BOOLEAN;
        } else if (data_type.find("int32") != std::string::npos) {
            parquet_data_type = parquet::Type::INT32;
        } else if (data_type.find("int64") != std::string::npos) {
            parquet_data_type = parquet::Type::INT64;
        } else if (data_type.find("int96") != std::string::npos) {
            parquet_data_type = parquet::Type::INT96;
        } else if (data_type.find("float") != std::string::npos) {
            parquet_data_type = parquet::Type::FLOAT;
        } else if (data_type.find("double") != std::string::npos) {
            parquet_data_type = parquet::Type::DOUBLE;
        } else if (data_type.find("fixed_len_byte_array") != std::string::npos) {
            // Must be tested before "byte_array": substring matching would
            // otherwise classify fixed_len_byte_array columns as BYTE_ARRAY.
            parquet_data_type = parquet::Type::FIXED_LEN_BYTE_ARRAY;
        } else if (data_type.find("byte_array") != std::string::npos) {
            parquet_data_type = parquet::Type::BYTE_ARRAY;
        } else {
            parquet_data_type = parquet::Type::UNDEFINED;
        }

        std::string column_name = (*column)[2];
        fields.push_back(parquet::schema::PrimitiveNode::Make(column_name, parquet_repetition_type,
                                                              parquet::LogicalType::None(),
                                                              parquet_data_type));
    }
    // Build the group node once, after all fields have been collected.
    _schema = std::static_pointer_cast<parquet::schema::GroupNode>(
            parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields));
    return Status::OK();
}

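// Append every row of the batch to the current buffered row group; rotation to
// a new row group happens inside get_rg_writer() once _max_row_per_group is
// exceeded.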
Status ParquetWriterWrapper::write(const RowBatch& row_batch) {
    int num_rows = row_batch.num_rows();
    for (int i = 0; i < num_rows; ++i) {
        TupleRow* row = row_batch.get_row(i);
        RETURN_IF_ERROR(_write_one_row(row));
        _cur_writed_rows++;
    }
    return Status::OK();
}

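// In the Arrow/Parquet C++ API, ParquetFileWriter::Open reports failure by
// throwing parquet::ParquetException rather than returning nullptr, so the
// nullptr check below is only a defensive guard.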
Status ParquetWriterWrapper::init_parquet_writer() {
    _writer = parquet::ParquetFileWriter::Open(_outstream, _schema, _properties);
    if (_writer == nullptr) {
        return Status::InternalError("Failed to create file writer");
    }
    return Status::OK();
}

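// Return the current buffered row-group writer, starting a fresh row group
// once more than _max_row_per_group rows (a limit presumably defined in
// parquet_writer.h) have been buffered.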
parquet::RowGroupWriter* ParquetWriterWrapper::get_rg_writer() {
    if (_rg_writer == nullptr) {
        _rg_writer = _writer->AppendBufferedRowGroup();
    }
    if (_cur_writed_rows > _max_row_per_group) {
        _rg_writer->Close();
        _rg_writer = _writer->AppendBufferedRowGroup();
        _cur_writed_rows = 0;
    }
    return _rg_writer;
}

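// Write one row, dispatching each output expression to the column writer that
// matches its declared schema type. NULL values are emitted as type defaults
// (false, 0, 0.0, empty byte array) because no definition levels are passed to
// WriteBatch here.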
Status ParquetWriterWrapper::_write_one_row(TupleRow* row) {
    int num_columns = _output_expr_ctxs.size();
    if (num_columns != static_cast<int>(_str_schema.size())) {
        return Status::InternalError("project field size is not equal to schema column size");
    }
    try {
        for (int index = 0; index < num_columns; ++index) {
            void* item = _output_expr_ctxs[index]->get_value(row);
            switch (_output_expr_ctxs[index]->root()->type().type) {
            case TYPE_BOOLEAN: {
                if (_str_schema[index][1] != "boolean") {
                    std::stringstream ss;
                    ss << "project field type is boolean, but the definition type of column "
                       << _str_schema[index][2] << " is " << _str_schema[index][1];
                    return Status::InvalidArgument(ss.str());
                }
                parquet::RowGroupWriter* rgWriter = get_rg_writer();
                parquet::BoolWriter* col_writer =
                        static_cast<parquet::BoolWriter*>(rgWriter->column(index));
                if (item != nullptr) {
                    col_writer->WriteBatch(1, nullptr, nullptr, static_cast<bool*>(item));
                } else {
                    bool default_bool = false;
                    col_writer->WriteBatch(1, nullptr, nullptr, &default_bool);
                }
                break;
            }
            case TYPE_TINYINT:
            case TYPE_SMALLINT:
            case TYPE_INT: {
                if (_str_schema[index][1] != "int32") {
                    std::stringstream ss;
                    ss << "project field type is tiny int/small int/int, should use int32, but the "
                          "definition type of column "
                       << _str_schema[index][2] << " is " << _str_schema[index][1];
                    return Status::InvalidArgument(ss.str());
                }

                parquet::RowGroupWriter* rgWriter = get_rg_writer();
                parquet::Int32Writer* col_writer =
                        static_cast<parquet::Int32Writer*>(rgWriter->column(index));
                if (item != nullptr) {
                    // Widen narrow integer slots explicitly: reading a 1-byte
                    // tinyint or 2-byte smallint slot through an int32_t*
                    // would pick up adjacent bytes.
                    int32_t value = 0;
                    switch (_output_expr_ctxs[index]->root()->type().type) {
                    case TYPE_TINYINT:
                        value = *static_cast<int8_t*>(item);
                        break;
                    case TYPE_SMALLINT:
                        value = *static_cast<int16_t*>(item);
                        break;
                    default:
                        value = *static_cast<int32_t*>(item);
                        break;
                    }
                    col_writer->WriteBatch(1, nullptr, nullptr, &value);
                } else {
                    int32_t default_int32 = 0;
                    col_writer->WriteBatch(1, nullptr, nullptr, &default_int32);
                }
                break;
            }
            case TYPE_BIGINT: {
                if (_str_schema[index][1] != "int64") {
                    std::stringstream ss;
                    ss << "project field type is big int, should use int64, but the definition "
                          "type of column "
                       << _str_schema[index][2] << " is " << _str_schema[index][1];
                    return Status::InvalidArgument(ss.str());
                }
                parquet::RowGroupWriter* rgWriter = get_rg_writer();
                parquet::Int64Writer* col_writer =
                        static_cast<parquet::Int64Writer*>(rgWriter->column(index));
                if (item != nullptr) {
                    col_writer->WriteBatch(1, nullptr, nullptr, static_cast<int64_t*>(item));
                } else {
                    int64_t default_int64 = 0;
                    col_writer->WriteBatch(1, nullptr, nullptr, &default_int64);
                }
                break;
            }
            case TYPE_LARGEINT: {
                // TODO: int128 is not supported yet. Still write a default
                // value, because the buffered row-group writer requires every
                // column to hold a value before the row group can be flushed
                // to disk.
                parquet::RowGroupWriter* rgWriter = get_rg_writer();
                parquet::Int64Writer* col_writer =
                        static_cast<parquet::Int64Writer*>(rgWriter->column(index));
                int64_t default_int64 = 0;
                col_writer->WriteBatch(1, nullptr, nullptr, &default_int64);
                return Status::InvalidArgument("do not support large int type.");
            }
            case TYPE_FLOAT: {
                if (_str_schema[index][1] != "float") {
                    std::stringstream ss;
                    ss << "project field type is float, but the definition type of column "
                       << _str_schema[index][2] << " is " << _str_schema[index][1];
                    return Status::InvalidArgument(ss.str());
                }
                parquet::RowGroupWriter* rgWriter = get_rg_writer();
                parquet::FloatWriter* col_writer =
                        static_cast<parquet::FloatWriter*>(rgWriter->column(index));
                if (item != nullptr) {
                    col_writer->WriteBatch(1, nullptr, nullptr, static_cast<float_t*>(item));
                } else {
                    float_t default_float = 0.0;
                    col_writer->WriteBatch(1, nullptr, nullptr, &default_float);
                }
                break;
            }
            case TYPE_DOUBLE: {
                if (_str_schema[index][1] != "double") {
                    std::stringstream ss;
                    ss << "project field type is double, but the definition type of column "
                       << _str_schema[index][2] << " is " << _str_schema[index][1];
                    return Status::InvalidArgument(ss.str());
                }
                parquet::RowGroupWriter* rgWriter = get_rg_writer();
                parquet::DoubleWriter* col_writer =
                        static_cast<parquet::DoubleWriter*>(rgWriter->column(index));
                if (item != nullptr) {
                    col_writer->WriteBatch(1, nullptr, nullptr, static_cast<double_t*>(item));
                } else {
                    double_t default_double = 0.0;
                    col_writer->WriteBatch(1, nullptr, nullptr, &default_double);
                }
                break;
            }
            case TYPE_DATETIME:
            case TYPE_DATE: {
                if (_str_schema[index][1] != "int64") {
                    std::stringstream ss;
                    ss << "project field type is date/datetime, should use int64, but the "
                          "definition type of column "
                       << _str_schema[index][2] << " is " << _str_schema[index][1];
                    return Status::InvalidArgument(ss.str());
                }
                parquet::RowGroupWriter* rgWriter = get_rg_writer();
                parquet::Int64Writer* col_writer =
                        static_cast<parquet::Int64Writer*>(rgWriter->column(index));
                if (item != nullptr) {
                    const DateTimeValue* time_val = static_cast<const DateTimeValue*>(item);
                    int64_t timestamp = time_val->to_olap_datetime();
                    col_writer->WriteBatch(1, nullptr, nullptr, &timestamp);
                } else {
                    int64_t default_int64 = 0;
                    col_writer->WriteBatch(1, nullptr, nullptr, &default_int64);
                }
                break;
            }

            case TYPE_HLL:
            case TYPE_OBJECT: {
                if (_output_object_data) {
                    if (_str_schema[index][1] != "byte_array") {
                        std::stringstream ss;
                        ss << "project field type is hll/bitmap, should use byte_array, but the "
                              "definition type of column "
                           << _str_schema[index][2] << " is " << _str_schema[index][1];
                        return Status::InvalidArgument(ss.str());
                    }
                    parquet::RowGroupWriter* rgWriter = get_rg_writer();
                    parquet::ByteArrayWriter* col_writer =
                            static_cast<parquet::ByteArrayWriter*>(rgWriter->column(index));
                    if (item != nullptr) {
                        const StringValue* string_val = static_cast<const StringValue*>(item);
                        parquet::ByteArray value;
                        value.ptr = reinterpret_cast<const uint8_t*>(string_val->ptr);
                        value.len = string_val->len;
                        col_writer->WriteBatch(1, nullptr, nullptr, &value);
                    } else {
                        parquet::ByteArray value;
                        col_writer->WriteBatch(1, nullptr, nullptr, &value);
                    }
                } else {
                    std::stringstream ss;
                    ss << "unsupported data type: " << _output_expr_ctxs[index]->root()->type().type
                       << ", hll/bitmap columns can only be written when output_object_data is "
                          "enabled";
                    return Status::InvalidArgument(ss.str());
                }
                break;
            }
            case TYPE_CHAR:
            case TYPE_VARCHAR:
            case TYPE_STRING: {
                if (_str_schema[index][1] != "byte_array") {
                    std::stringstream ss;
                    ss << "project field type is char/varchar/string, should use byte_array, but "
                          "the definition type of column "
                       << _str_schema[index][2] << " is " << _str_schema[index][1];
                    return Status::InvalidArgument(ss.str());
                }
                parquet::RowGroupWriter* rgWriter = get_rg_writer();
                parquet::ByteArrayWriter* col_writer =
                        static_cast<parquet::ByteArrayWriter*>(rgWriter->column(index));
                if (item != nullptr) {
                    const StringValue* string_val = static_cast<const StringValue*>(item);
                    parquet::ByteArray value;
                    value.ptr = reinterpret_cast<const uint8_t*>(string_val->ptr);
                    value.len = string_val->len;
                    col_writer->WriteBatch(1, nullptr, nullptr, &value);
                } else {
                    parquet::ByteArray value;
                    col_writer->WriteBatch(1, nullptr, nullptr, &value);
                }
                break;
            }
            case TYPE_DECIMALV2: {
                if (_str_schema[index][1] != "byte_array") {
                    std::stringstream ss;
                    ss << "project field type is decimal v2, should use byte_array, but the "
                          "definition type of column "
                       << _str_schema[index][2] << " is " << _str_schema[index][1];
                    return Status::InvalidArgument(ss.str());
                }
                parquet::RowGroupWriter* rgWriter = get_rg_writer();
                parquet::ByteArrayWriter* col_writer =
                        static_cast<parquet::ByteArrayWriter*>(rgWriter->column(index));
                if (item != nullptr) {
                    // Render the decimal into a stack buffer and hand the
                    // formatted bytes to parquet as a BYTE_ARRAY.
                    const DecimalV2Value decimal_val(
                            reinterpret_cast<const PackedInt128*>(item)->value);
                    char decimal_buffer[MAX_DECIMAL_WIDTH];
                    int output_scale = _output_expr_ctxs[index]->root()->output_scale();
                    parquet::ByteArray value;
                    value.ptr = reinterpret_cast<const uint8_t*>(decimal_buffer);
                    value.len = decimal_val.to_buffer(decimal_buffer, output_scale);
                    col_writer->WriteBatch(1, nullptr, nullptr, &value);
                } else {
                    parquet::ByteArray value;
                    col_writer->WriteBatch(1, nullptr, nullptr, &value);
                }
                break;
            }
            default: {
                std::stringstream ss;
                ss << "unsupported data type: " << _output_expr_ctxs[index]->root()->type().type;
                return Status::InvalidArgument(ss.str());
            }
            }
        }
    } catch (const std::exception& e) {
        LOG(WARNING) << "Parquet write error: " << e.what();
        return Status::InternalError(e.what());
    }
    return Status::OK();
}

int64_t ParquetWriterWrapper::written_len() {
    return _outstream->get_written_len();
}

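// Tear down in dependency order: the buffered row group first, then the
// parquet file writer (which writes the footer), then the output stream.
// parquet-cpp reports errors via exceptions, hence the try/catch.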
void ParquetWriterWrapper::close() {
    try {
        if (_rg_writer != nullptr) {
            _rg_writer->Close();
            _rg_writer = nullptr;
        }
        _writer->Close();
        arrow::Status st = _outstream->Close();
        if (!st.ok()) {
            LOG(WARNING) << "close parquet file error: " << st.ToString();
        }
    } catch (const std::exception& e) {
        _rg_writer = nullptr;
        LOG(WARNING) << "Parquet writer close error: " << e.what();
    }
}

ParquetWriterWrapper::~ParquetWriterWrapper() {}

} // namespace doris