Files
doris/thirdparty/patches/apache-arrow-7.0.0.patch
camby 015f8ab78d [enhancement](thirdparty) support create stripe reader by column names (#14184)
The ORC NextStripeReader currently only supports reading columns by index, but column indices are hard to determine for complex types.
We patch the ORC adapter to support reading columns by column name.
Co-authored-by: cambyzju <zhuxiaoli01@baidu.com>
2022-11-11 15:10:20 +08:00

123 lines
4.5 KiB
Diff

diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
index 03243e751..1eb9b2c81 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -47,9 +47,6 @@
#include "arrow/util/visibility.h"
#include "orc/Exceptions.hh"
-// alias to not interfere with nested orc namespace
-namespace liborc = orc;
-
#define ORC_THROW_NOT_OK(s) \
do { \
Status _s = (s); \
@@ -198,6 +195,8 @@ class ORCFileReader::Impl {
return Init();
}
+ // Returns the raw pointer from reader_.get(); ownership stays with reader_.
+ liborc::Reader* GetRawORCReader() { return reader_.get(); }
Status Init() {
int64_t nstripes = reader_->getNumberOfStripes();
stripes_.resize(nstripes);
@@ -504,6 +503,32 @@ class ORCFileReader::Impl {
return Status::OK();
}
+ Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
+ int64_t batch_size, const std::vector<std::string>& include_names) {
+ if (current_row_ >= NumberOfRows()) {  // no rows left: hand back a null reader
+ return nullptr;
+ }
+ // Options for the next stripe's reader; select by name only if include_names is non-empty.
+ liborc::RowReaderOptions opts;
+ if (!include_names.empty()) {
+ RETURN_NOT_OK(SelectNames(&opts, include_names));
+ }
+ StripeInformation stripe_info({0, 0, 0, 0});
+ RETURN_NOT_OK(SelectStripeWithRowNumber(&opts, current_row_, &stripe_info));
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ std::unique_ptr<liborc::RowReader> row_reader;
+ // current_row_ moves to first_row_of_stripe + num_rows only after the seek returns.
+ ORC_BEGIN_CATCH_NOT_OK
+ row_reader = reader_->createRowReader(opts);
+ row_reader->seekToRow(current_row_);
+ current_row_ = stripe_info.first_row_of_stripe + stripe_info.num_rows;
+ ORC_END_CATCH_NOT_OK
+ // row_reader is moved into the returned OrcStripeReader.
+ return std::make_shared<OrcStripeReader>(std::move(row_reader), schema, batch_size,
+ pool_);
+ }
+
Status NextStripeReader(int64_t batch_size, std::shared_ptr<RecordBatchReader>* out) {
return NextStripeReader(batch_size, {}, out);
}
@@ -531,6 +556,8 @@ Result<std::unique_ptr<ORCFileReader>> ORCFileReader::Open(
return std::move(result);
}
+liborc::Reader* ORCFileReader::GetRawORCReader() { return impl_->GetRawORCReader(); }  // forwards to impl_
+
Result<std::shared_ptr<const KeyValueMetadata>> ORCFileReader::ReadMetadata() {
return impl_->ReadMetadata();
}
@@ -653,6 +680,11 @@ Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader(
return reader;
}
+Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader(
+ int64_t batch_size, const std::vector<std::string>& include_names) {
+ return impl_->NextStripeReader(batch_size, include_names);  // forwards to impl_'s name-based overload
+}
+
int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }
int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h
index 223efa515..04e6b0612 100644
--- a/cpp/src/arrow/adapters/orc/adapter.h
+++ b/cpp/src/arrow/adapters/orc/adapter.h
@@ -30,6 +30,10 @@
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
+#include "orc/Reader.hh"
+
+// alias to not interfere with nested orc namespace
+namespace liborc = orc;
namespace arrow {
namespace adapters {
@@ -51,6 +55,9 @@ class ARROW_EXPORT ORCFileReader {
static Status Open(const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool,
std::unique_ptr<ORCFileReader>* reader);
+ /// \brief Get the underlying liborc::Reader as a raw, non-owning pointer.
+ liborc::Reader* GetRawORCReader();
+
/// \brief Creates a new ORC reader
///
/// \param[in] file the data source
@@ -240,6 +247,19 @@ class ARROW_EXPORT ORCFileReader {
Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
int64_t batch_size, const std::vector<int>& include_indices);
+ /// \brief Get a stripe level record batch iterator with specified row count
+ /// in each record batch. NextStripeReader serves as a fine grain
+ /// alternative to ReadStripe which may cause OOM issue by loading
+ /// the whole stripes into memory.
+ ///
+ /// \param[in] batch_size the number of rows in each record batch produced by the
+ /// returned stripe reader
+ ///
+ /// \param[in] include_names the selected field names to read
+ /// \return the returned stripe reader
+ Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
+ int64_t batch_size, const std::vector<std::string>& include_names);
+
/// \brief The number of stripes in the file
int64_t NumberOfStripes();