[feature-wip] (parquet-reader) add parquet reader impl template (#11285)
This commit is contained in:
@ -221,7 +221,15 @@ set(VEC_FILES
|
||||
exec/file_arrow_scanner.cpp
|
||||
exec/file_scanner.cpp
|
||||
exec/file_scan_node.cpp
|
||||
exec/file_text_scanner.cpp)
|
||||
exec/file_text_scanner.cpp
|
||||
exec/file_hdfs_scanner.cpp
|
||||
exec/format/parquet/vparquet_column_chunk_reader.cpp
|
||||
exec/format/parquet/vparquet_group_reader.cpp
|
||||
exec/format/parquet/vparquet_page_index.cpp
|
||||
exec/format/parquet/vparquet_reader.cpp
|
||||
exec/format/parquet/vparquet_file_metadata.cpp
|
||||
exec/format/parquet/vparquet_page_reader.cpp
|
||||
exec/format/parquet/schema_desc.cpp)
|
||||
|
||||
add_library(Vec STATIC
|
||||
${VEC_FILES}
|
||||
|
||||
20
be/src/vec/exec/file_hdfs_scanner.cpp
Normal file
20
be/src/vec/exec/file_hdfs_scanner.cpp
Normal file
@ -0,0 +1,20 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "file_hdfs_scanner.h"
|
||||
|
||||
// Placeholder translation unit: nothing is defined in this namespace yet.
namespace doris::vectorized {} // namespace doris::vectorized
|
||||
26
be/src/vec/exec/file_hdfs_scanner.h
Normal file
26
be/src/vec/exec/file_hdfs_scanner.h
Normal file
@ -0,0 +1,26 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace doris::vectorized {

// Root type of the HDFS file scanner hierarchy.
// Currently an empty skeleton with no members.
class HdfsFileScanner {};

// HDFS scanner specialization for Parquet files.
// Currently an empty skeleton that only establishes the inheritance
// relationship with HdfsFileScanner.
class ParquetFileHdfsScanner : public HdfsFileScanner {};

} // namespace doris::vectorized
|
||||
87
be/src/vec/exec/format/parquet/parquet_thrift_util.h
Normal file
87
be/src/vec/exec/format/parquet/parquet_thrift_util.h
Normal file
@ -0,0 +1,87 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
#include <common/status.h>

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <vector>

#include "common/logging.h"
#include "gen_cpp/parquet_types.h"
#include "io/file_reader.h"
#include "util/coding.h"
#include "util/thrift_util.h"
#include "vparquet_file_metadata.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
constexpr uint8_t PARQUET_VERSION_NUMBER[4] = {'P', 'A', 'R', '1'};
|
||||
constexpr int64_t PARQUET_FOOTER_READ_SIZE = 64 * 1024;
|
||||
constexpr uint32_t PARQUET_FOOTER_SIZE = 8;
|
||||
|
||||
Status parse_thrift_footer(FileReader* file, std::shared_ptr<FileMetaData>& file_metadata) {
|
||||
// try with buffer on stack
|
||||
uint8_t buff[PARQUET_FOOTER_READ_SIZE];
|
||||
int64_t file_size = file->size();
|
||||
// read footer bytes
|
||||
uint64_t footer_read_size = std::min(file_size, PARQUET_FOOTER_READ_SIZE);
|
||||
|
||||
int64_t bytes_read = 0;
|
||||
RETURN_IF_ERROR(
|
||||
file->readat(file_size - footer_read_size, footer_read_size, &bytes_read, buff));
|
||||
|
||||
// validate magic
|
||||
uint8_t* magic_ptr = buff + footer_read_size - sizeof(PARQUET_VERSION_NUMBER);
|
||||
if (memcmp(magic_ptr, PARQUET_VERSION_NUMBER, sizeof(PARQUET_VERSION_NUMBER)) != 0) {
|
||||
return Status::Corruption("Invalid magic number in parquet file");
|
||||
}
|
||||
|
||||
// get metadata_size
|
||||
uint8_t* footer_buff = buff + footer_read_size - PARQUET_FOOTER_SIZE;
|
||||
uint32_t metadata_size = decode_fixed32_le(footer_buff);
|
||||
if (metadata_size > file_size - PARQUET_FOOTER_SIZE) {
|
||||
Status::Corruption("Parquet file size is ", file_size,
|
||||
" bytes, smaller than the size reported by footer's (", metadata_size,
|
||||
"bytes)");
|
||||
}
|
||||
tparquet::FileMetaData t_metadata;
|
||||
// deserialize footer
|
||||
RETURN_IF_ERROR(
|
||||
deserialize_thrift_msg(footer_buff - metadata_size, &metadata_size, true, &t_metadata));
|
||||
file_metadata.reset(new FileMetaData(t_metadata));
|
||||
RETURN_IF_ERROR(file_metadata->init_schema());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Status parse_page_header() {
|
||||
// uint8_t* page_buf;
|
||||
//
|
||||
// }
|
||||
|
||||
// Status parse_page_index() {
|
||||
//
|
||||
// }
|
||||
|
||||
// void deserialize_column_index(int64_t start_offset, tparquet::ColumnIndex) {
|
||||
//
|
||||
// }
|
||||
//
|
||||
// void deserialize_offset_index(int64_t start_offset, tparquet::OffsetIndex) {
|
||||
//
|
||||
// }
|
||||
|
||||
} // namespace doris::vectorized
|
||||
33
be/src/vec/exec/format/parquet/schema_desc.cpp
Normal file
33
be/src/vec/exec/format/parquet/schema_desc.cpp
Normal file
@ -0,0 +1,33 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "schema_desc.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
SchemaDescriptor::~SchemaDescriptor() {
|
||||
// fields.clear();
|
||||
}
|
||||
|
||||
std::string SchemaDescriptor::debug_string() const {
|
||||
return std::string();
|
||||
}
|
||||
|
||||
std::string FieldSchema::debug_string() const {
|
||||
return std::string();
|
||||
}
|
||||
} // namespace doris::vectorized
|
||||
46
be/src/vec/exec/format/parquet/schema_desc.h
Normal file
46
be/src/vec/exec/format/parquet/schema_desc.h
Normal file
@ -0,0 +1,46 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common/status.h"
|
||||
|
||||
namespace doris::vectorized {

// Describes one field of a Parquet schema. At the moment it only carries the
// maximum definition and repetition levels of the field.
class FieldSchema {
public:
    // Maximum definition level recorded for this field.
    int16_t max_def_level() const { return _max_def_level; }
    // Maximum repetition level recorded for this field.
    int16_t max_rep_level() const { return _max_rep_level; }
    // Human-readable description of the field (defined in schema_desc.cpp).
    std::string debug_string() const;

private:
    // Zero-initialized so the getters never read an indeterminate value
    // on a default-constructed object.
    int16_t _max_def_level = 0;
    int16_t _max_rep_level = 0;
    // std::vector<FieldSchema> children;
};

// Container for the fields of a Parquet file schema. Currently a skeleton:
// the field list itself is not stored yet.
class SchemaDescriptor {
public:
    SchemaDescriptor() = default;
    ~SchemaDescriptor();

    // Human-readable description of the schema (defined in schema_desc.cpp).
    std::string debug_string() const;

private:
    // std::vector<FieldSchema> fields;
};

} // namespace doris::vectorized
|
||||
@ -0,0 +1,36 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
#include "vparquet_column_chunk_reader.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
Status ColumnChunkReader::init() {
|
||||
return Status();
|
||||
}
|
||||
|
||||
Status ColumnChunkReader::read_min_max_stat() {
|
||||
return Status();
|
||||
}
|
||||
|
||||
Status ColumnChunkReader::decode_dict_page() {
|
||||
return Status();
|
||||
}
|
||||
|
||||
Status ColumnChunkReader::decode_nested_page() {
|
||||
return Status();
|
||||
}
|
||||
} // namespace doris::vectorized
|
||||
@ -0,0 +1,33 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
#include <common/status.h>
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
class ColumnChunkReader {
|
||||
public:
|
||||
Status init();
|
||||
Status read_min_max_stat();
|
||||
Status decode_dict_page();
|
||||
Status decode_nested_page();
|
||||
|
||||
private:
|
||||
};
|
||||
|
||||
} // namespace doris::vectorized
|
||||
52
be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp
Normal file
52
be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp
Normal file
@ -0,0 +1,52 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vparquet_file_metadata.h"
|
||||
|
||||
#include <sstream>
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
FileMetaData::FileMetaData(tparquet::FileMetaData& metadata) : _metadata(metadata) {
|
||||
_num_rows = metadata.num_rows;
|
||||
_num_groups = metadata.row_groups.size();
|
||||
if (_num_groups != 0) {
|
||||
_num_columns = metadata.row_groups[0].columns.size();
|
||||
}
|
||||
if (metadata.schema[0].num_children <= 0) {
|
||||
}
|
||||
}
|
||||
|
||||
Status FileMetaData::init_schema() {
|
||||
return Status();
|
||||
}
|
||||
|
||||
const tparquet::FileMetaData& FileMetaData::to_thrift_metadata() {
|
||||
return _metadata;
|
||||
}
|
||||
|
||||
std::string FileMetaData::debug_string() const {
|
||||
std::stringstream out;
|
||||
out << "Parquet Metadata(";
|
||||
out << "; version=" << _metadata.version;
|
||||
out << "; num row groups=" << _num_groups;
|
||||
out << "; num rows=" << _num_rows;
|
||||
out << ")";
|
||||
return out.str();
|
||||
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
45
be/src/vec/exec/format/parquet/vparquet_file_metadata.h
Normal file
45
be/src/vec/exec/format/parquet/vparquet_file_metadata.h
Normal file
@ -0,0 +1,45 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
#include "common/status.h"
|
||||
#include "gen_cpp/parquet_types.h"
|
||||
#include "schema_desc.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
class FileMetaData {
|
||||
public:
|
||||
FileMetaData(tparquet::FileMetaData& metadata);
|
||||
~FileMetaData() = default;
|
||||
Status init_schema();
|
||||
const tparquet::FileMetaData& to_thrift_metadata();
|
||||
int32_t num_row_groups() const { return _num_groups; }
|
||||
int32_t num_columns() const { return _num_columns; };
|
||||
int32_t num_rows() const { return _num_rows; };
|
||||
SchemaDescriptor schema() const { return _schema; };
|
||||
std::string debug_string() const;
|
||||
|
||||
private:
|
||||
tparquet::FileMetaData _metadata;
|
||||
int32_t _num_groups = 0;
|
||||
int32_t _num_columns = 0;
|
||||
int64_t _num_rows = 0;
|
||||
SchemaDescriptor _schema;
|
||||
};
|
||||
|
||||
} // namespace doris::vectorized
|
||||
20
be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
Normal file
20
be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
Normal file
@ -0,0 +1,20 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vparquet_group_reader.h"
|
||||
|
||||
// Placeholder translation unit: nothing is defined in this namespace yet.
namespace doris::vectorized {} // namespace doris::vectorized
|
||||
24
be/src/vec/exec/format/parquet/vparquet_group_reader.h
Normal file
24
be/src/vec/exec/format/parquet/vparquet_group_reader.h
Normal file
@ -0,0 +1,24 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
#pragma once
|
||||
#include <common/status.h>
|
||||
|
||||
namespace doris::vectorized {

// Skeleton of the reader for one Parquet row group.
// Currently an empty type with no members.
class RowGroupReader {};

} // namespace doris::vectorized
|
||||
29
be/src/vec/exec/format/parquet/vparquet_page_index.cpp
Normal file
29
be/src/vec/exec/format/parquet/vparquet_page_index.cpp
Normal file
@ -0,0 +1,29 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vparquet_page_index.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
Status PageIndex::get_row_range_for_page() {
|
||||
return Status();
|
||||
}
|
||||
|
||||
Status PageIndex::collect_skipped_page_range() {
|
||||
return Status();
|
||||
}
|
||||
} // namespace doris::vectorized
|
||||
35
be/src/vec/exec/format/parquet/vparquet_page_index.h
Normal file
35
be/src/vec/exec/format/parquet/vparquet_page_index.h
Normal file
@ -0,0 +1,35 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
#include <common/status.h>
|
||||
#include <gen_cpp/parquet_types.h>
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
class PageIndex {
|
||||
public:
|
||||
Status get_row_range_for_page();
|
||||
Status collect_skipped_page_range();
|
||||
|
||||
//private:
|
||||
// // row range define
|
||||
// tparquet::ColumnIndex _column_index;
|
||||
// tparquet::OffsetIndex _offset_index;
|
||||
};
|
||||
|
||||
} // namespace doris::vectorized
|
||||
33
be/src/vec/exec/format/parquet/vparquet_page_reader.cpp
Normal file
33
be/src/vec/exec/format/parquet/vparquet_page_reader.cpp
Normal file
@ -0,0 +1,33 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vparquet_page_reader.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
Status PageReader::read_page_header() {
|
||||
return Status();
|
||||
}
|
||||
|
||||
Status PageReader::read_page_data() {
|
||||
return Status();
|
||||
}
|
||||
|
||||
Status PageReader::init() {
|
||||
return Status();
|
||||
}
|
||||
} // namespace doris::vectorized
|
||||
34
be/src/vec/exec/format/parquet/vparquet_page_reader.h
Normal file
34
be/src/vec/exec/format/parquet/vparquet_page_reader.h
Normal file
@ -0,0 +1,34 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
#include <common/status.h>
|
||||
#include <gen_cpp/parquet_types.h>
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
class PageReader {
|
||||
public:
|
||||
Status init();
|
||||
Status read_page_header();
|
||||
Status read_page_data();
|
||||
|
||||
//private:
|
||||
// tparquet::PageHeader* _page_header;
|
||||
};
|
||||
|
||||
} // namespace doris::vectorized
|
||||
61
be/src/vec/exec/format/parquet/vparquet_reader.cpp
Normal file
61
be/src/vec/exec/format/parquet/vparquet_reader.cpp
Normal file
@ -0,0 +1,61 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vparquet_reader.h"
|
||||
|
||||
#include "parquet_thrift_util.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
doris::vectorized::ParquetReader::ParquetReader(doris::FileReader* file_reader, int64_t batch_size,
|
||||
int32_t num_of_columns_from_file,
|
||||
int64_t range_start_offset, int64_t range_size) {
|
||||
// : _batch_size(batch_size), _num_of_columns_from_file(num_of_columns_from_file) {
|
||||
_file_reader = file_reader;
|
||||
_total_groups = 0;
|
||||
// _current_group = 0;
|
||||
// _statistics = std::make_shared<Statistics>();
|
||||
}
|
||||
|
||||
doris::vectorized::ParquetReader::~ParquetReader() {
|
||||
// _batch.clear();
|
||||
}
|
||||
|
||||
Status ParquetReader::init_reader(const TupleDescriptor* tuple_desc,
|
||||
const std::vector<SlotDescriptor*>& tuple_slot_descs,
|
||||
const std::vector<ExprContext*>& conjunct_ctxs,
|
||||
const std::string& timezone) {
|
||||
_file_reader->open();
|
||||
RETURN_IF_ERROR(parse_thrift_footer(_file_reader, _file_metadata));
|
||||
auto metadata = _file_metadata->to_thrift_metadata();
|
||||
|
||||
_total_groups = metadata.row_groups.size();
|
||||
if (_total_groups == 0) {
|
||||
return Status::EndOfFile("Empty Parquet File");
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
int64_t ParquetReader::_get_row_group_start_offset(const tparquet::RowGroup& row_group) {
|
||||
if (row_group.__isset.file_offset) {
|
||||
return row_group.file_offset;
|
||||
}
|
||||
const tparquet::ColumnMetaData& first_column = row_group.columns[0].meta_data;
|
||||
return first_column.data_page_offset;
|
||||
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
87
be/src/vec/exec/format/parquet/vparquet_reader.h
Normal file
87
be/src/vec/exec/format/parquet/vparquet_reader.h
Normal file
@ -0,0 +1,87 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <parquet/api/reader.h>
|
||||
#include <parquet/api/writer.h>
|
||||
#include <parquet/exception.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "common/status.h"
|
||||
#include "exprs/expr_context.h"
|
||||
#include "gen_cpp/PaloBrokerService_types.h"
|
||||
#include "gen_cpp/PlanNodes_types.h"
|
||||
#include "gen_cpp/Types_types.h"
|
||||
#include "gen_cpp/parquet_types.h"
|
||||
#include "io/file_reader.h"
|
||||
#include "vec/core/block.h"
|
||||
#include "vparquet_file_metadata.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
// struct Statistics {
|
||||
// int32_t filtered_row_groups = 0;
|
||||
// int32_t total_groups = 0;
|
||||
// int64_t filtered_rows = 0;
|
||||
// int64_t total_rows = 0;
|
||||
// int64_t filtered_total_bytes = 0;
|
||||
// int64_t total_bytes = 0;
|
||||
// };
|
||||
|
||||
class ParquetReader {
|
||||
public:
|
||||
ParquetReader(FileReader* file_reader, int64_t batch_size, int32_t num_of_columns_from_file,
|
||||
int64_t range_start_offset, int64_t range_size);
|
||||
~ParquetReader();
|
||||
virtual Status init_reader(const TupleDescriptor* tuple_desc,
|
||||
const std::vector<SlotDescriptor*>& tuple_slot_descs,
|
||||
const std::vector<ExprContext*>& conjunct_ctxs,
|
||||
const std::string& timezone) = 0;
|
||||
virtual Status next_batch(bool* eof) = 0;
|
||||
// std::shared_ptr<Statistics>& statistics() { return _statistics; }
|
||||
void close() {};
|
||||
int64_t size(int64_t* size) { return _file_reader->size(); }
|
||||
|
||||
private:
|
||||
int64_t _get_row_group_start_offset(const tparquet::RowGroup& row_group);
|
||||
|
||||
private:
|
||||
FileReader* _file_reader;
|
||||
std::shared_ptr<FileMetaData> _file_metadata;
|
||||
// const int64_t _batch_size;
|
||||
// const int32_t _num_of_columns_from_file;
|
||||
int _total_groups; // num of groups(stripes) of a parquet(orc) file
|
||||
// int _current_group; // current group(stripe)
|
||||
// std::map<std::string, int> _map_column; // column-name <---> column-index
|
||||
// std::vector<int> _include_column_ids; // columns that need to get from file
|
||||
// std::shared_ptr<Statistics> _statistics;
|
||||
|
||||
// parquet file reader object
|
||||
// std::vector<Block*> _batch;
|
||||
// std::string _timezone;
|
||||
// int64_t _range_start_offset;
|
||||
// int64_t _range_size;
|
||||
|
||||
private:
|
||||
std::atomic<bool> _closed = false;
|
||||
};
|
||||
|
||||
} // namespace doris::vectorized
|
||||
@ -59,6 +59,7 @@ set(EXEC_TEST_FILES
|
||||
exec/s3_reader_test.cpp
|
||||
exec/multi_bytes_separator_test.cpp
|
||||
exec/hdfs_file_reader_test.cpp
|
||||
vec/exec/parquet/parquet_thrift_test.cpp
|
||||
# exec/new_olap_scan_node_test.cpp
|
||||
# exec/pre_aggregation_node_test.cpp
|
||||
# exec/partitioned_hash_table_test.cpp
|
||||
|
||||
66
be/test/vec/exec/parquet/parquet_thrift_test.cpp
Normal file
66
be/test/vec/exec/parquet/parquet_thrift_test.cpp
Normal file
@ -0,0 +1,66 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include <glog/logging.h>
|
||||
#include <gtest/gtest.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "io/buffered_reader.h"
|
||||
#include "io/file_reader.h"
|
||||
#include "io/local_file_reader.h"
|
||||
#include "util/runtime_profile.h"
|
||||
#include "vec/exec/format/parquet/parquet_thrift_util.h"
|
||||
#include "vec/exec/format/parquet/vparquet_file_metadata.h"
|
||||
|
||||
namespace doris {
|
||||
namespace vectorized {
|
||||
|
||||
class ParquetThriftReaderTest : public testing::Test {
|
||||
public:
|
||||
ParquetThriftReaderTest() {}
|
||||
};
|
||||
|
||||
TEST_F(ParquetThriftReaderTest, normal) {
|
||||
LocalFileReader reader("./be/test/exec/test_data/parquet_scanner/localfile.parquet", 0);
|
||||
|
||||
auto st = reader.open();
|
||||
EXPECT_TRUE(st.ok());
|
||||
|
||||
std::shared_ptr<FileMetaData> metaData;
|
||||
parse_thrift_footer(&reader, metaData);
|
||||
tparquet::FileMetaData t_metadata = metaData->to_thrift_metadata();
|
||||
LOG(WARNING) << "num row groups: " << metaData->num_row_groups();
|
||||
LOG(WARNING) << "num columns: " << metaData->num_columns();
|
||||
LOG(WARNING) << "=====================================";
|
||||
for (auto value : t_metadata.row_groups) {
|
||||
LOG(WARNING) << "row group num_rows: " << value.num_rows;
|
||||
}
|
||||
LOG(WARNING) << "=====================================";
|
||||
for (auto value : t_metadata.schema) {
|
||||
LOG(WARNING) << "schema column name: " << value.name;
|
||||
LOG(WARNING) << "schema column type: " << value.type;
|
||||
LOG(WARNING) << "schema column repetition_type: " << value.repetition_type;
|
||||
LOG(WARNING) << "schema column num children: " << value.num_children;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace vectorized
|
||||
|
||||
} // namespace doris
|
||||
Reference in New Issue
Block a user