From 16e56fa181f58526f68285cd1b5953a4df55d2d9 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 25 Sep 2023 11:59:12 +0200
Subject: [PATCH 1/3] [Python][Dataset] Change default of pre_buffer to True
 for reading Parquet files

---
 python/pyarrow/_dataset_parquet.pyx  |  4 ++--
 python/pyarrow/tests/test_dataset.py | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx
index cf5c44c1c96..cce6777a108 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -666,7 +666,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
         Disabled by default.
     buffer_size : int, default 8192
         Size of buffered stream, if enabled. Default is 8KB.
-    pre_buffer : bool, default False
+    pre_buffer : bool, default True
         If enabled, pre-buffer the raw Parquet data instead of issuing one
         read per column chunk. This can improve performance on high-latency
         filesystems.
@@ -688,7 +688,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
 
     def __init__(self, *, bint use_buffered_stream=False,
                  buffer_size=8192,
-                 bint pre_buffer=False,
+                 bint pre_buffer=True,
                  thrift_string_size_limit=None,
                  thrift_container_size_limit=None):
         self.init(shared_ptr[CFragmentScanOptions](
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 39c3c43daea..671405d1ee6 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -784,28 +784,28 @@ def test_parquet_scan_options():
     opts2 = ds.ParquetFragmentScanOptions(buffer_size=4096)
     opts3 = ds.ParquetFragmentScanOptions(
         buffer_size=2**13, use_buffered_stream=True)
-    opts4 = ds.ParquetFragmentScanOptions(buffer_size=2**13, pre_buffer=True)
+    opts4 = ds.ParquetFragmentScanOptions(buffer_size=2**13, pre_buffer=False)
     opts5 = ds.ParquetFragmentScanOptions(
         thrift_string_size_limit=123456,
         thrift_container_size_limit=987654,)
 
     assert opts1.use_buffered_stream is False
     assert opts1.buffer_size == 2**13
-    assert opts1.pre_buffer is False
+    assert opts1.pre_buffer is True
     assert opts1.thrift_string_size_limit == 100_000_000  # default in C++
     assert opts1.thrift_container_size_limit == 1_000_000  # default in C++
 
     assert opts2.use_buffered_stream is False
     assert opts2.buffer_size == 2**12
-    assert opts2.pre_buffer is False
+    assert opts2.pre_buffer is True
 
     assert opts3.use_buffered_stream is True
     assert opts3.buffer_size == 2**13
-    assert opts3.pre_buffer is False
+    assert opts3.pre_buffer is True
 
     assert opts4.use_buffered_stream is False
     assert opts4.buffer_size == 2**13
-    assert opts4.pre_buffer is True
+    assert opts4.pre_buffer is False
 
     assert opts5.thrift_string_size_limit == 123456
     assert opts5.thrift_container_size_limit == 987654

From 0c21100243eea8ddb0dcac97cac1990351c686b7 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 5 Oct 2023 15:20:43 +0200
Subject: [PATCH 2/3] update defaults on C++ as well

---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 7 ++++---
 cpp/src/parquet/properties.h                      | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 8585b1ccf11..7a94b1f3a1c 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -2354,10 +2354,11 @@ void TestGetRecordBatchReader(
 
 TEST(TestArrowReadWrite, GetRecordBatchReader) { TestGetRecordBatchReader(); }
 
-// Same as the test above, but using coalesced reads.
-TEST(TestArrowReadWrite, CoalescedReads) {
+// Same as the test above, but using non-coalesced reads.
+TEST(TestArrowReadWrite, NoneCoalescedReads) {
   ArrowReaderProperties arrow_properties = default_arrow_reader_properties();
-  arrow_properties.set_pre_buffer(true);
+  arrow_properties.set_pre_buffer(false);
+  arrow_properties.set_cache_options(::arrow::io::CacheOptions::Defaults());
   TestGetRecordBatchReader(arrow_properties);
 }
 
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index bdc5b15332d..fe56fe2f894 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -831,8 +831,8 @@ class PARQUET_EXPORT ArrowReaderProperties {
       : use_threads_(use_threads),
         read_dict_indices_(),
         batch_size_(kArrowDefaultBatchSize),
-        pre_buffer_(false),
-        cache_options_(::arrow::io::CacheOptions::Defaults()),
+        pre_buffer_(true),
+        cache_options_(::arrow::io::CacheOptions::LazyDefaults()),
         coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO) {}
 
   /// \brief Set whether to use the IO thread pool to parse columns in parallel.

From f6e56e100e0334b69c15e8aa2a3080aea65aa4ca Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 5 Oct 2023 15:25:58 +0200
Subject: [PATCH 3/3] update docstring

---
 python/pyarrow/_dataset_parquet.pyx | 5 ++++-
 python/pyarrow/parquet/core.py      | 5 +++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx
index cce6777a108..9d851425640 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -669,7 +669,10 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
     pre_buffer : bool, default True
         If enabled, pre-buffer the raw Parquet data instead of issuing one
         read per column chunk. This can improve performance on high-latency
-        filesystems.
+        filesystems (e.g. S3, GCS) by coalescing and issuing file reads in
+        parallel using a background I/O thread pool.
+        Set to False if you want to prioritize minimal memory usage
+        over maximum speed.
     thrift_string_size_limit : int, default None
         If not None, override the maximum total string size allocated when
         decoding Thrift structures. The default limit should be
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index e0cdfee62ef..a3e5ef76c99 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -1742,11 +1742,12 @@ class ParquetDataset:
         different partitioning schemes, etc.
     pre_buffer : bool, default True
         Coalesce and issue file reads in parallel to improve performance on
-        high-latency filesystems (e.g. S3). If True, Arrow will use a
+        high-latency filesystems (e.g. S3, GCS). If True, Arrow will use a
         background I/O thread pool. This option is only supported for
         use_legacy_dataset=False. If using a filesystem layer that itself
         performs readahead (e.g. fsspec's S3FS), disable readahead for best
-        results.
+        results. Set to False if you want to prioritize minimal memory usage
+        over maximum speed.
     coerce_int96_timestamp_unit : str, default None
         Cast timestamps that are stored in INT96 format to a particular
         resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96