From 16e56fa181f58526f68285cd1b5953a4df55d2d9 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 25 Sep 2023 11:59:12 +0200
Subject: [PATCH 1/3] [Python][Dataset] Change default of pre_buffer to True
 for reading Parquet files

---
 python/pyarrow/_dataset_parquet.pyx  |  4 ++--
 python/pyarrow/tests/test_dataset.py | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx
index cf5c44c1c96..cce6777a108 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -666,7 +666,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
         Disabled by default.
     buffer_size : int, default 8192
         Size of buffered stream, if enabled. Default is 8KB.
-    pre_buffer : bool, default False
+    pre_buffer : bool, default True
         If enabled, pre-buffer the raw Parquet data instead of issuing one
         read per column chunk. This can improve performance on high-latency
         filesystems.
@@ -688,7 +688,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
 
     def __init__(self, *, bint use_buffered_stream=False,
                  buffer_size=8192,
-                 bint pre_buffer=False,
+                 bint pre_buffer=True,
                  thrift_string_size_limit=None,
                  thrift_container_size_limit=None):
         self.init(shared_ptr[CFragmentScanOptions](
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 39c3c43daea..671405d1ee6 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -784,28 +784,28 @@ def test_parquet_scan_options():
     opts2 = ds.ParquetFragmentScanOptions(buffer_size=4096)
     opts3 = ds.ParquetFragmentScanOptions(
         buffer_size=2**13, use_buffered_stream=True)
-    opts4 = ds.ParquetFragmentScanOptions(buffer_size=2**13, pre_buffer=True)
+    opts4 = ds.ParquetFragmentScanOptions(buffer_size=2**13, pre_buffer=False)
     opts5 = ds.ParquetFragmentScanOptions(
         thrift_string_size_limit=123456,
         thrift_container_size_limit=987654,)
 
     assert opts1.use_buffered_stream is False
     assert opts1.buffer_size == 2**13
-    assert opts1.pre_buffer is False
+    assert opts1.pre_buffer is True
     assert opts1.thrift_string_size_limit == 100_000_000  # default in C++
     assert opts1.thrift_container_size_limit == 1_000_000  # default in C++
 
     assert opts2.use_buffered_stream is False
     assert opts2.buffer_size == 2**12
-    assert opts2.pre_buffer is False
+    assert opts2.pre_buffer is True
 
     assert opts3.use_buffered_stream is True
     assert opts3.buffer_size == 2**13
-    assert opts3.pre_buffer is False
+    assert opts3.pre_buffer is True
 
     assert opts4.use_buffered_stream is False
     assert opts4.buffer_size == 2**13
-    assert opts4.pre_buffer is True
+    assert opts4.pre_buffer is False
 
     assert opts5.thrift_string_size_limit == 123456
     assert opts5.thrift_container_size_limit == 987654

From 0c21100243eea8ddb0dcac97cac1990351c686b7 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 5 Oct 2023 15:20:43 +0200
Subject: [PATCH 2/3] update defaults on C++ as well

---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 7 ++++---
 cpp/src/parquet/properties.h                      | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 8585b1ccf11..7a94b1f3a1c 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -2354,10 +2354,11 @@ void TestGetRecordBatchReader(
 
 TEST(TestArrowReadWrite, GetRecordBatchReader) { TestGetRecordBatchReader(); }
 
-// Same as the test above, but using coalesced reads.
-TEST(TestArrowReadWrite, CoalescedReads) {
+// Same as the test above, but using non-coalesced reads.
+TEST(TestArrowReadWrite, NoneCoalescedReads) {
   ArrowReaderProperties arrow_properties = default_arrow_reader_properties();
-  arrow_properties.set_pre_buffer(true);
+  arrow_properties.set_pre_buffer(false);
+  arrow_properties.set_cache_options(::arrow::io::CacheOptions::Defaults());
   TestGetRecordBatchReader(arrow_properties);
 }
 
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index bdc5b15332d..fe56fe2f894 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -831,8 +831,8 @@ class PARQUET_EXPORT ArrowReaderProperties {
       : use_threads_(use_threads),
         read_dict_indices_(),
         batch_size_(kArrowDefaultBatchSize),
-        pre_buffer_(false),
-        cache_options_(::arrow::io::CacheOptions::Defaults()),
+        pre_buffer_(true),
+        cache_options_(::arrow::io::CacheOptions::LazyDefaults()),
         coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO) {}
 
   /// \brief Set whether to use the IO thread pool to parse columns in parallel.

From f6e56e100e0334b69c15e8aa2a3080aea65aa4ca Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 5 Oct 2023 15:25:58 +0200
Subject: [PATCH 3/3] update docstring

---
 python/pyarrow/_dataset_parquet.pyx | 5 ++++-
 python/pyarrow/parquet/core.py      | 5 +++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx
index cce6777a108..9d851425640 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -669,7 +669,10 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
     pre_buffer : bool, default True
         If enabled, pre-buffer the raw Parquet data instead of issuing one
         read per column chunk. This can improve performance on high-latency
-        filesystems.
+        filesystems (e.g. S3, GCS) by coalescing and issuing file reads in
+        parallel using a background I/O thread pool.
+        Set to False if you want to prioritize minimal memory usage
+        over maximum speed.
     thrift_string_size_limit : int, default None
         If not None, override the maximum total string size allocated when
         decoding Thrift structures. The default limit should be
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index e0cdfee62ef..a3e5ef76c99 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -1742,11 +1742,12 @@ class ParquetDataset:
         different partitioning schemes, etc.
     pre_buffer : bool, default True
         Coalesce and issue file reads in parallel to improve performance on
-        high-latency filesystems (e.g. S3). If True, Arrow will use a
+        high-latency filesystems (e.g. S3, GCS). If True, Arrow will use a
         background I/O thread pool. This option is only supported for
         use_legacy_dataset=False. If using a filesystem layer that itself
         performs readahead (e.g. fsspec's S3FS), disable readahead for best
-        results.
+        results. Set to False if you want to prioritize minimal memory usage
+        over maximum speed.
     coerce_int96_timestamp_unit : str, default None
         Cast timestamps that are stored in INT96 format to a particular
         resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96