From b9f1278495ad3e8cdb9e678cac0b06f829dd9da1 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Fri, 20 Dec 2024 11:52:54 -0500 Subject: [PATCH] GH-45092: [C++][Parquet] Add GetReadRanges function to FileReader --- cpp/src/parquet/file_reader.cc | 16 ++++++++++++++++ cpp/src/parquet/file_reader.h | 26 ++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 1c9b2323de500..96ff364f170da 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -29,6 +29,7 @@ #include "arrow/io/caching.h" #include "arrow/io/file.h" #include "arrow/io/memory.h" +#include "arrow/io/util_internal.h" #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/future.h" @@ -400,6 +401,21 @@ class SerializedFile : public ParquetFileReader::Contents { PARQUET_THROW_NOT_OK(cached_source_->Cache(ranges)); } + ::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges( + const std::vector<int>& row_groups, const std::vector<int>& column_indices, + int64_t hole_size_limit, int64_t range_size_limit) { + std::vector<::arrow::io::ReadRange> ranges; + for (int row : row_groups) { + for (int col : column_indices) { + ranges.push_back( + ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, col)); + } + } + + return ::arrow::io::internal::CoalesceReadRanges(std::move(ranges), hole_size_limit, + range_size_limit); + } + ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups, const std::vector<int>& column_indices) const { if (!cached_source_) { diff --git a/cpp/src/parquet/file_reader.h b/cpp/src/parquet/file_reader.h index b59b59f95c2d8..1c85c10a686e1 100644 --- a/cpp/src/parquet/file_reader.h +++ b/cpp/src/parquet/file_reader.h @@ -201,6 +201,32 @@ class PARQUET_EXPORT ParquetFileReader { const ::arrow::io::IOContext& ctx, const ::arrow::io::CacheOptions& options); + // Retrieve the list of byte ranges that would need to be read to retrieve + // the data for the specified row groups 
and column indices. + // + // A reader can optionally call this if they wish to handle their own + // caching and management of file reads (or offload them to other readers). + // Unlike PreBuffer, this method will not perform any actual caching or + // reads, instead just using the file metadata to determine the byte ranges + // that would need to be read if you were to consume the entirety of the column + // chunks for the provided columns in the specified row groups. + // + // If row_groups or column_indices are empty, then the result of this will be empty. + // + // hole_size_limit represents the maximum distance, in bytes, between two + // consecutive ranges; beyond this value, ranges will not be combined. The default + // value is 1MB. + // + // range_size_limit is the maximum size in bytes of a combined range; if combining + // two consecutive ranges would produce a range larger than this, they are not combined. + // The default value is 64MB. This *must* be larger than hole_size_limit. + // + // This will not take into account page indexes or any other predicate push down + // benefits that may be available. + ::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges( + const std::vector<int>& row_groups, const std::vector<int>& column_indices, + int64_t hole_size_limit = 1024 * 1024, int64_t range_size_limit = 64 * 1024 * 1024); + /// Wait for the specified row groups and column indices to be pre-buffered. /// /// After the returned Future completes, reading the specified row