diff --git a/thirdparty/patches/apache-arrow-7.0.0.patch b/thirdparty/patches/apache-arrow-7.0.0.patch index 9cb1812b35..072ea9b567 100644 --- a/thirdparty/patches/apache-arrow-7.0.0.patch +++ b/thirdparty/patches/apache-arrow-7.0.0.patch @@ -1,50 +1,82 @@ diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc -index 03243e7..cbb3ed9 100644 +index 03243e751..1eb9b2c81 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -47,9 +47,6 @@ #include "arrow/util/visibility.h" #include "orc/Exceptions.hh" - + -// alias to not interfere with nested orc namespace -namespace liborc = orc; - #define ORC_THROW_NOT_OK(s) \ do { \ Status _s = (s); \ -@@ -198,6 +195,11 @@ class ORCFileReader::Impl { +@@ -198,6 +195,8 @@ class ORCFileReader::Impl { return Init(); } - -+ virtual liborc::Reader* GetRawORCReader() { -+ return reader_.get(); -+ } -+ + ++ virtual liborc::Reader* GetRawORCReader() { return reader_.get(); } + Status Init() { int64_t nstripes = reader_->getNumberOfStripes(); stripes_.resize(nstripes); -@@ -504,6 +506,7 @@ class ORCFileReader::Impl { +@@ -504,6 +503,32 @@ class ORCFileReader::Impl { return Status::OK(); } - + ++ Result> NextStripeReader( ++ int64_t batch_size, const std::vector& include_names) { ++ if (current_row_ >= NumberOfRows()) { ++ return nullptr; ++ } ++ ++ liborc::RowReaderOptions opts; ++ if (!include_names.empty()) { ++ RETURN_NOT_OK(SelectNames(&opts, include_names)); ++ } ++ StripeInformation stripe_info({0, 0, 0, 0}); ++ RETURN_NOT_OK(SelectStripeWithRowNumber(&opts, current_row_, &stripe_info)); ++ std::shared_ptr schema; ++ RETURN_NOT_OK(ReadSchema(opts, &schema)); ++ std::unique_ptr row_reader; ++ ++ ORC_BEGIN_CATCH_NOT_OK ++ row_reader = reader_->createRowReader(opts); ++ row_reader->seekToRow(current_row_); ++ current_row_ = stripe_info.first_row_of_stripe + stripe_info.num_rows; ++ ORC_END_CATCH_NOT_OK ++ ++ return std::make_shared(std::move(row_reader), schema, batch_size, ++ pool_); ++ } + Status NextStripeReader(int64_t batch_size, std::shared_ptr* out) { return NextStripeReader(batch_size, {}, out); } -@@ -531,6 +534,10 @@ Result> ORCFileReader::Open( +@@ -531,6 +556,8 @@ Result> ORCFileReader::Open( return std::move(result); } - -+liborc::Reader* ORCFileReader::GetRawORCReader() { -+ return impl_->GetRawORCReader(); -+} + ++liborc::Reader* ORCFileReader::GetRawORCReader() { return impl_->GetRawORCReader(); } + Result> ORCFileReader::ReadMetadata() { return impl_->ReadMetadata(); } +@@ -653,6 +680,11 @@ Result> ORCFileReader::NextStripeReader( + return reader; + } + ++Result> ORCFileReader::NextStripeReader( ++ int64_t batch_size, const std::vector& include_names) { ++ return impl_->NextStripeReader(batch_size, include_names); ++} ++ + int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); } + + int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); } diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h -index 223efa5..a0d112a 100644 +index 223efa515..04e6b0612 100644 --- a/cpp/src/arrow/adapters/orc/adapter.h +++ b/cpp/src/arrow/adapters/orc/adapter.h @@ -30,6 +30,10 @@ @@ -55,16 +87,36 @@ index 223efa5..a0d112a 100644 + +// alias to not interfere with nested orc namespace +namespace liborc = orc; - + namespace arrow { namespace adapters { -@@ -50,6 +54,9 @@ class ARROW_EXPORT ORCFileReader { - ARROW_DEPRECATED("Deprecated in 6.0.0. Use Result-returning overload instead.") +@@ -51,6 +55,9 @@ class ARROW_EXPORT ORCFileReader { static Status Open(const std::shared_ptr& file, MemoryPool* pool, std::unique_ptr* reader); -+ + + /// \brief Get ORC reader from inside. + liborc::Reader* GetRawORCReader(); - ++ /// \brief Creates a new ORC reader - /// \ No newline at end of file + /// + /// \param[in] file the data source +@@ -240,6 +247,19 @@ class ARROW_EXPORT ORCFileReader { + Result> NextStripeReader( + int64_t batch_size, const std::vector& include_indices); + ++ /// \brief Get a stripe level record batch iterator with specified row count ++ /// in each record batch. NextStripeReader serves as a fine grain ++ /// alternative to ReadStripe which may cause OOM issue by loading ++ /// the whole stripes into memory. ++ /// ++ /// \param[in] batch_size Get a stripe level record batch iterator with specified row ++ /// count in each record batch. ++ /// ++ /// \param[in] include_names the selected field names to read ++ /// \return the returned stripe reader ++ Result> NextStripeReader( ++ int64_t batch_size, const std::vector& include_names); ++ + /// \brief The number of stripes in the file + int64_t NumberOfStripes(); +