Resubmit #23290 (brpc is not upgraded, because bthread local has an error). Upgraded: protobuf 3.15.0 -> 21.11, glog 0.4.0 -> 0.6.0, lz4 1.9.3 -> 1.9.4, curl 7.79.0 -> 8.2.1, zstd 1.5.2 -> 1.5.5, arrow 7.0.0 -> 13.0.0, abseil 20220623.1 -> 20230125.3, orc 1.7.2 -> 1.9.0, jemalloc for arrow 5.2.1 -> 5.3.0, xsimd 7.0.0 -> 13.0.0, opentelemetry-proto 0.19.0 -> 1.0.0, opentelemetry 1.8.3 -> 1.10.0. New: c-ares 1.19.1, grpc 1.54.3.
121 lines
4.4 KiB
Diff
121 lines
4.4 KiB
Diff
diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
|
|
index 2466e7433..46b4402d4 100644
|
|
--- a/cpp/src/arrow/adapters/orc/adapter.cc
|
|
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
|
|
@@ -47,9 +47,6 @@
|
|
#include "arrow/util/visibility.h"
|
|
#include "orc/Exceptions.hh"
|
|
|
|
-// alias to not interfere with nested orc namespace
|
|
-namespace liborc = orc;
|
|
-
|
|
#define ORC_THROW_NOT_OK(s) \
|
|
do { \
|
|
Status _s = (s); \
|
|
@@ -202,6 +199,8 @@ class ORCFileReader::Impl {
|
|
return Init();
|
|
}
|
|
|
|
+ virtual liborc::Reader* GetRawORCReader() { return reader_.get(); }
|
|
+
|
|
Status Init() {
|
|
int64_t nstripes = reader_->getNumberOfStripes();
|
|
stripes_.resize(nstripes);
|
|
@@ -479,6 +478,31 @@ class ORCFileReader::Impl {
|
|
return Status::OK();
|
|
}
|
|
|
|
+ Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
|
|
+ int64_t batch_size, const std::vector<std::string>& include_names) {
|
|
+ if (current_row_ >= NumberOfRows()) {
|
|
+ return nullptr;
|
|
+ }
|
|
+
|
|
+ liborc::RowReaderOptions opts = default_row_reader_options();
|
|
+ if (!include_names.empty()) {
|
|
+ RETURN_NOT_OK(SelectNames(&opts, include_names));
|
|
+ }
|
|
+ StripeInformation stripe_info({0, 0, 0, 0});
|
|
+ RETURN_NOT_OK(SelectStripeWithRowNumber(&opts, current_row_, &stripe_info));
|
|
+ ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts));
|
|
+ std::unique_ptr<liborc::RowReader> row_reader;
|
|
+
|
|
+ ORC_BEGIN_CATCH_NOT_OK
|
|
+ row_reader = reader_->createRowReader(opts);
|
|
+ row_reader->seekToRow(current_row_);
|
|
+ current_row_ = stripe_info.first_row_id + stripe_info.num_rows;
|
|
+ ORC_END_CATCH_NOT_OK
|
|
+
|
|
+ return std::make_shared<OrcStripeReader>(std::move(row_reader), schema, batch_size,
|
|
+ pool_);
|
|
+ }
|
|
+
|
|
Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
|
|
int64_t batch_size, const std::vector<int>& include_indices) {
|
|
if (current_row_ >= NumberOfRows()) {
|
|
@@ -544,6 +568,8 @@ Result<std::unique_ptr<ORCFileReader>> ORCFileReader::Open(
|
|
return std::move(result);
|
|
}
|
|
|
|
+liborc::Reader* ORCFileReader::GetRawORCReader() { return impl_->GetRawORCReader(); }
|
|
+
|
|
Result<std::shared_ptr<const KeyValueMetadata>> ORCFileReader::ReadMetadata() {
|
|
return impl_->ReadMetadata();
|
|
}
|
|
@@ -605,6 +631,11 @@ Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader(
|
|
return impl_->NextStripeReader(batch_size, include_indices);
|
|
}
|
|
|
|
+Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader(
|
|
+ int64_t batch_size, const std::vector<std::string>& include_names) {
|
|
+ return impl_->NextStripeReader(batch_size, include_names);
|
|
+}
|
|
+
|
|
int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }
|
|
|
|
int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
|
|
diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h
|
|
index 013be7860..7fd06bcb8 100644
|
|
--- a/cpp/src/arrow/adapters/orc/adapter.h
|
|
+++ b/cpp/src/arrow/adapters/orc/adapter.h
|
|
@@ -30,6 +30,10 @@
|
|
#include "arrow/type_fwd.h"
|
|
#include "arrow/util/macros.h"
|
|
#include "arrow/util/visibility.h"
|
|
+#include "orc/Reader.hh"
|
|
+
|
|
+// alias to not interfere with nested orc namespace
|
|
+namespace liborc = orc;
|
|
|
|
namespace arrow {
|
|
namespace adapters {
|
|
@@ -53,6 +57,9 @@ class ARROW_EXPORT ORCFileReader {
|
|
public:
|
|
~ORCFileReader();
|
|
|
|
+ /// \brief Get a raw pointer to the liborc::Reader held by this file reader.
|
|
+ liborc::Reader* GetRawORCReader();
|
|
+
|
|
/// \brief Creates a new ORC reader
|
|
///
|
|
/// \param[in] file the data source
|
|
@@ -174,6 +181,19 @@ class ARROW_EXPORT ORCFileReader {
|
|
Result<std::shared_ptr<RecordBatchReader>> GetRecordBatchReader(
|
|
int64_t batch_size, const std::vector<std::string>& include_names);
|
|
|
|
+ /// \brief Get a stripe-level record batch iterator with the specified row count
|
|
+ /// in each record batch. NextStripeReader serves as a fine-grained
|
|
+ /// alternative to ReadStripe, which may cause OOM issues by loading
|
|
+ /// whole stripes into memory.
|
|
+ ///
|
|
+ /// \param[in] batch_size the row count of each record batch produced by the
|
|
+ /// returned stripe reader
|
|
+ ///
|
|
+ /// \param[in] include_names the selected field names to read
|
|
+ /// \return the returned stripe reader
|
|
+ Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
|
|
+ int64_t batch_size, const std::vector<std::string>& include_names);
|
|
+
|
|
/// \brief The number of stripes in the file
|
|
int64_t NumberOfStripes();
|