From 0466f0dffb96130c030f766d3082ebcd45156764 Mon Sep 17 00:00:00 2001
From: Ziheng Wang <zihengw@stanford.edu>
Date: Thu, 4 Aug 2022 13:46:55 -0700
Subject: [PATCH 01/15] Expose batch_readahead and fragment_readahead scanner options

---
 cpp/src/arrow/dataset/scanner.cc             | 27 +++++++++-----
 cpp/src/arrow/dataset/scanner.h              | 20 +++++++++--
 python/pyarrow/_dataset.pyx                  | 38 +++++++++++++++++---
 python/pyarrow/includes/libarrow_dataset.pxd |  2 ++
 4 files changed, 71 insertions(+), 16 deletions(-)

diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc
index 0ef1d4577cd..badd18bf318 100644
--- a/cpp/src/arrow/dataset/scanner.cc
+++ b/cpp/src/arrow/dataset/scanner.cc
@@ -826,15 +826,6 @@ Status ScannerBuilder::UseThreads(bool use_threads) {
   return Status::OK();
 }
 
-Status ScannerBuilder::FragmentReadahead(int fragment_readahead) {
-  if (fragment_readahead <= 0) {
-    return Status::Invalid("FragmentReadahead must be greater than 0, got ",
-                           fragment_readahead);
-  }
-  scan_options_->fragment_readahead = fragment_readahead;
-  return Status::OK();
-}
-
 Status ScannerBuilder::BatchSize(int64_t batch_size) {
   if (batch_size <= 0) {
     return Status::Invalid("BatchSize must be greater than 0, got ", batch_size);
@@ -843,6 +834,24 @@ Status ScannerBuilder::BatchSize(int64_t batch_size) {
   return Status::OK();
 }
 
+Status ScannerBuilder::BatchReadahead(int32_t batch_readahead) {
+  if (batch_readahead < 0) {  // zero is accepted; only negative values are rejected
+    return Status::Invalid("BatchReadahead must be greater than or equal to 0, got ",
+                           batch_readahead);
+  }
+  scan_options_->batch_readahead = batch_readahead;
+  return Status::OK();
+}
+
+Status ScannerBuilder::FragmentReadahead(int32_t fragment_readahead) {
+  if (fragment_readahead < 0) {  // zero is accepted; only negative values are rejected
+    return Status::Invalid("FragmentReadahead must be greater than or equal to 0, got ",
+                           fragment_readahead);
+  }
+  scan_options_->fragment_readahead = fragment_readahead;
+  return Status::OK();
+}
+
 Status ScannerBuilder::Pool(MemoryPool* pool) {
   scan_options_->pool = pool;
   return Status::OK();
diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h
index 7098bad8f45..d25891d26c9 100644
--- a/cpp/src/arrow/dataset/scanner.h
+++ b/cpp/src/arrow/dataset/scanner.h
@@ -373,9 +373,6 @@ class ARROW_DS_EXPORT ScannerBuilder {
   ///        ThreadPool found in ScanOptions;
   Status UseThreads(bool use_threads = true);
 
-  /// \brief Limit how many fragments the scanner will read at once
-  Status FragmentReadahead(int fragment_readahead);
-
   /// \brief Set the maximum number of rows per RecordBatch.
   ///
   /// \param[in] batch_size the maximum number of rows.
@@ -384,6 +381,23 @@ class ARROW_DS_EXPORT ScannerBuilder {
   /// This option provides a control limiting the memory owned by any RecordBatch.
   Status BatchSize(int64_t batch_size);
 
+  /// \brief Set the number of batches to read aheadw ithin a file.
+  ///
+  /// \param[in] batch_readahead How many batches to read ahead within a file,
+  ///  might not work for all formats. refer to comments above.
+  /// \returns An error if this number is less than 0.
+  ///
+  /// This option provides a control on RAM vs IO tradeoff.
+  Status BatchReadahead(int32_t batch_readahead);
+
+  /// \brief Set the number of How many files to read ahead
+  ///
+  /// \param[in] fragment_readahead How many files to read ahead
+  /// \returns An error if this number is less than 0.
+  ///
+  /// This option provides a control on RAM vs IO tradeoff.
+  Status FragmentReadahead(int32_t fragment_readahead);
+
   /// \brief Set the pool from which materialized and scanned arrays will be allocated.
   Status Pool(MemoryPool* pool);
 
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 68833a5350e..4b4213d753a 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -2168,11 +2168,14 @@ cdef class TaggedRecordBatchIterator(_Weakrefable):
 
 
 _DEFAULT_BATCH_SIZE = 2**17
-
+_DEFAULT_BATCH_READAHEAD = 16
+_DEFAULT_FRAGMENT_READAHEAD = 4
 
 cdef void _populate_builder(const shared_ptr[CScannerBuilder]& ptr,
                             object columns=None, Expression filter=None,
                             int batch_size=_DEFAULT_BATCH_SIZE,
+                            int batch_readahead=_DEFAULT_BATCH_READAHEAD,
+                            int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD,
                             bint use_threads=True, MemoryPool memory_pool=None,
                             FragmentScanOptions fragment_scan_options=None)\
         except *:
@@ -2207,6 +2210,8 @@ cdef void _populate_builder(const shared_ptr[CScannerBuilder]& ptr,
             )
 
     check_status(builder.BatchSize(batch_size))
+    check_status(builder.BatchReadahead(batch_readahead))
+    check_status(builder.FragmentReadahead(fragment_readahead))
     check_status(builder.UseThreads(use_threads))
     if memory_pool:
         check_status(builder.Pool(maybe_unbox_memory_pool(memory_pool)))
@@ -2254,6 +2259,13 @@ cdef class Scanner(_Weakrefable):
         The maximum row count for scanned record batches. If scanned
         record batches are overflowing memory then this method can be
         called to reduce their size.
+    batch_readahead : int, default 16
+        The number of batches to read ahead in a file. This might not work
+        for all file formats like CSV. Increasing this number will increase
+        RAM usage but also improve IO utilization.
+    fragment_readahead : int, default 4
+        The number of files to read ahead. Increasing this number will increase
+        RAM usage but also improve IO utilization.
     use_threads : bool, default True
         If enabled, then maximum parallelism will be used determined by
         the number of available CPU cores.
@@ -2291,6 +2303,8 @@ cdef class Scanner(_Weakrefable):
                      MemoryPool memory_pool=None,
                      object columns=None, Expression filter=None,
                      int batch_size=_DEFAULT_BATCH_SIZE,
+                     int batch_readahead=_DEFAULT_BATCH_READAHEAD,
+                     int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD,
                      FragmentScanOptions fragment_scan_options=None):
         """
         Create Scanner from Dataset,
@@ -2328,6 +2342,13 @@ cdef class Scanner(_Weakrefable):
             The maximum row count for scanned record batches. If scanned
             record batches are overflowing memory then this method can be
             called to reduce their size.
+        batch_readahead : int, default 16
+            The number of batches to read ahead in a file. This might not work
+            for all file formats like CSV. Increasing this number will increase
+            RAM usage but also improve IO utilization.
+        fragment_readahead : int, default 4
+            The number of files to read ahead. Increasing this number will increase
+            RAM usage but also improve IO utilization.
         use_threads : bool, default True
             If enabled, then maximum parallelism will be used determined by
             the number of available CPU cores.
@@ -2354,7 +2375,8 @@ cdef class Scanner(_Weakrefable):
 
         builder = make_shared[CScannerBuilder](dataset.unwrap(), options)
         _populate_builder(builder, columns=columns, filter=filter,
-                          batch_size=batch_size, use_threads=use_threads,
+                          batch_size=batch_size, batch_readahead=batch_readahead,
+                          fragment_readahead=fragment_readahead, use_threads=use_threads,
                           memory_pool=memory_pool,
                           fragment_scan_options=fragment_scan_options)
 
@@ -2367,6 +2389,7 @@ cdef class Scanner(_Weakrefable):
                       MemoryPool memory_pool=None,
                       object columns=None, Expression filter=None,
                       int batch_size=_DEFAULT_BATCH_SIZE,
+                      int batch_readahead=_DEFAULT_BATCH_READAHEAD,
                       FragmentScanOptions fragment_scan_options=None):
         """
         Create Scanner from Fragment,
@@ -2406,6 +2429,10 @@ cdef class Scanner(_Weakrefable):
             The maximum row count for scanned record batches. If scanned
             record batches are overflowing memory then this method can be
             called to reduce their size.
+        batch_readahead : int, default 16
+            The number of batches to read ahead in a file. This might not work
+            for all file formats like CSV. Increasing this number will increase
+            RAM usage but also improve IO utilization.
         use_threads : bool, default True
             If enabled, then maximum parallelism will be used determined by
             the number of available CPU cores.
@@ -2435,7 +2462,9 @@ cdef class Scanner(_Weakrefable):
         builder = make_shared[CScannerBuilder](pyarrow_unwrap_schema(schema),
                                                fragment.unwrap(), options)
         _populate_builder(builder, columns=columns, filter=filter,
-                          batch_size=batch_size, use_threads=use_threads,
+                          batch_size=batch_size, batch_readahead=batch_readahead,
+                          fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD,
+                          use_threads=use_threads,
                           memory_pool=memory_pool,
                           fragment_scan_options=fragment_scan_options)
 
@@ -2508,7 +2537,8 @@ cdef class Scanner(_Weakrefable):
                           FutureWarning)
 
         _populate_builder(builder, columns=columns, filter=filter,
-                          batch_size=batch_size, use_threads=use_threads,
+                          batch_size=batch_size, batch_readahead=_DEFAULT_BATCH_READAHEAD,
+                          fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, use_threads=use_threads,
                           memory_pool=memory_pool,
                           fragment_scan_options=fragment_scan_options)
         scanner = GetResultValue(builder.get().Finish())
diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd
index bd8fbd1b56a..d418830bc2f 100644
--- a/python/pyarrow/includes/libarrow_dataset.pxd
+++ b/python/pyarrow/includes/libarrow_dataset.pxd
@@ -122,6 +122,8 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
         CStatus UseThreads(c_bool use_threads)
         CStatus Pool(CMemoryPool* pool)
         CStatus BatchSize(int64_t batch_size)
+        CStatus BatchReadahead(int32_t batch_readahead)
+        CStatus FragmentReadahead(int32_t fragment_readahead)
         CStatus FragmentScanOptions(
             shared_ptr[CFragmentScanOptions] fragment_scan_options)
         CResult[shared_ptr[CScanner]] Finish()

From 0e6368983a128928d197a74520e266f46d47233e Mon Sep 17 00:00:00 2001
From: Ziheng Wang <zihengw@stanford.edu>
Date: Thu, 4 Aug 2022 14:20:21 -0700
Subject: [PATCH 02/15] fix typo

---
 cpp/src/arrow/dataset/scanner.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h
index d25891d26c9..6f6541719ba 100644
--- a/cpp/src/arrow/dataset/scanner.h
+++ b/cpp/src/arrow/dataset/scanner.h
@@ -381,7 +381,7 @@ class ARROW_DS_EXPORT ScannerBuilder {
   /// This option provides a control limiting the memory owned by any RecordBatch.
   Status BatchSize(int64_t batch_size);
 
-  /// \brief Set the number of batches to read aheadw ithin a file.
+  /// \brief Set the number of batches to read ahead within a file.
   ///
   /// \param[in] batch_readahead How many batches to read ahead within a file,
   ///  might not work for all formats. refer to comments above.

From 3103dc6346fc41da5ebd415014ec7d0533be236a Mon Sep 17 00:00:00 2001
From: Ziheng Wang <zihengw@stanford.edu>
Date: Thu, 4 Aug 2022 17:00:50 -0700
Subject: [PATCH 03/15] updated docs

---
 cpp/src/arrow/dataset/scanner.h | 8 ++++----
 python/pyarrow/_dataset.pyx     | 5 ++---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h
index 6f6541719ba..e9b66e70a6d 100644
--- a/cpp/src/arrow/dataset/scanner.h
+++ b/cpp/src/arrow/dataset/scanner.h
@@ -384,15 +384,15 @@ class ARROW_DS_EXPORT ScannerBuilder {
   /// \brief Set the number of batches to read ahead within a file.
   ///
   /// \param[in] batch_readahead How many batches to read ahead within a file,
-  ///  might not work for all formats. refer to comments above.
+  ///  might not work for all formats. 
   /// \returns An error if this number is less than 0.
   ///
-  /// This option provides a control on RAM vs IO tradeoff.
+  /// This option provides a control on RAM vs I/O tradeoff.
   Status BatchReadahead(int32_t batch_readahead);
 
-  /// \brief Set the number of How many files to read ahead
+  /// \brief Set the number of fragments to read ahead
   ///
-  /// \param[in] fragment_readahead How many files to read ahead
+  /// \param[in] fragment_readahead How many fragments to read ahead
   /// \returns An error if this number is less than 0.
   ///
   /// This option provides a control on RAM vs IO tradeoff.
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 4b4213d753a..caec6fb7882 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -2260,9 +2260,8 @@ cdef class Scanner(_Weakrefable):
         record batches are overflowing memory then this method can be
         called to reduce their size.
     batch_readahead : int, default 16
-        The number of batches to read ahead in a file. This might not work
-        for all file formats like CSV. Increasing this number will increase
-        RAM usage but also improve IO utilization.
+        The number of batches to read ahead in a file. Increasing this number 
+        will increase RAM usage but also improve IO utilization.
     fragment_readahead : int, default 4
         The number of files to read ahead. Increasing this number will increase
         RAM usage but also improve IO utilization.

From 0277eafe0cb29d818a42ac16fd6bf3ef2186f5fe Mon Sep 17 00:00:00 2001
From: Ziheng Wang <zihengw@stanford.edu>
Date: Tue, 9 Aug 2022 20:48:10 -0700
Subject: [PATCH 04/15] addressed suggestions

---
 cpp/src/arrow/dataset/scanner.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h
index e9b66e70a6d..56a7168c82b 100644
--- a/cpp/src/arrow/dataset/scanner.h
+++ b/cpp/src/arrow/dataset/scanner.h
@@ -82,7 +82,7 @@ struct ARROW_DS_EXPORT ScanOptions {
   /// Maximum row count for scanned batches.
   int64_t batch_size = kDefaultBatchSize;
 
-  /// How many batches to read ahead within a file
+  /// How many batches to read ahead within a fragment.
   ///
   /// Set to 0 to disable batch readahead
   ///
@@ -381,13 +381,15 @@ class ARROW_DS_EXPORT ScannerBuilder {
   /// This option provides a control limiting the memory owned by any RecordBatch.
   Status BatchSize(int64_t batch_size);
 
-  /// \brief Set the number of batches to read ahead within a file.
+  /// \brief Set the number of batches to read ahead within a fragment.
   ///
-  /// \param[in] batch_readahead How many batches to read ahead within a file,
-  ///  might not work for all formats. 
+  /// \param[in] batch_readahead How many batches to read ahead within a fragment,
+  ///  might not work for all formats.
   /// \returns An error if this number is less than 0.
   ///
   /// This option provides a control on RAM vs I/O tradeoff.
+  /// It might not be support by all file formats, in which case it will
+  /// simply be ignored.
   Status BatchReadahead(int32_t batch_readahead);
 
   /// \brief Set the number of fragments to read ahead

From c2f2cf6e79526fda7a34ee1a28e844e6039dba7c Mon Sep 17 00:00:00 2001
From: Ziheng Wang <zihengw@stanford.edu>
Date: Fri, 19 Aug 2022 16:06:20 -0700
Subject: [PATCH 05/15] Update cpp/src/arrow/dataset/scanner.h

Co-authored-by: Weston Pace <weston.pace@gmail.com>
---
 cpp/src/arrow/dataset/scanner.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h
index 56a7168c82b..01c62d08c0c 100644
--- a/cpp/src/arrow/dataset/scanner.h
+++ b/cpp/src/arrow/dataset/scanner.h
@@ -383,12 +383,11 @@ class ARROW_DS_EXPORT ScannerBuilder {
 
   /// \brief Set the number of batches to read ahead within a fragment.
   ///
-  /// \param[in] batch_readahead How many batches to read ahead within a fragment,
-  ///  might not work for all formats.
-  /// \returns An error if this number is less than 0.
+  /// \param[in] batch_readahead How many batches to read ahead within a fragment
+  /// \returns an error if this number is less than 0.
   ///
-  /// This option provides a control on RAM vs I/O tradeoff.
-  /// It might not be support by all file formats, in which case it will
+  /// This option provides a control on the RAM vs I/O tradeoff.
+  /// It might not be supported by all file formats, in which case it will
   /// simply be ignored.
   Status BatchReadahead(int32_t batch_readahead);
 

From 606807197ebc21c6c964a72aefb6fca921751e09 Mon Sep 17 00:00:00 2001
From: Ziheng Wang <zihengw@stanford.edu>
Date: Fri, 19 Aug 2022 16:06:27 -0700
Subject: [PATCH 06/15] Update cpp/src/arrow/dataset/scanner.h

Co-authored-by: Weston Pace <weston.pace@gmail.com>
---
 cpp/src/arrow/dataset/scanner.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h
index 01c62d08c0c..48177884822 100644
--- a/cpp/src/arrow/dataset/scanner.h
+++ b/cpp/src/arrow/dataset/scanner.h
@@ -396,7 +396,7 @@ class ARROW_DS_EXPORT ScannerBuilder {
   /// \param[in] fragment_readahead How many fragments to read ahead
   /// \returns An error if this number is less than 0.
   ///
-  /// This option provides a control on RAM vs IO tradeoff.
+  /// This option provides a control on the RAM vs IO tradeoff.
   Status FragmentReadahead(int32_t fragment_readahead);
 
   /// \brief Set the pool from which materialized and scanned arrays will be allocated.

From 185f2529198d854ee95ede94c6b3637875483bc8 Mon Sep 17 00:00:00 2001
From: Ziheng Wang <zihengw@stanford.edu>
Date: Fri, 19 Aug 2022 16:06:32 -0700
Subject: [PATCH 07/15] Update python/pyarrow/_dataset.pyx

Co-authored-by: Weston Pace <weston.pace@gmail.com>
---
 python/pyarrow/_dataset.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index caec6fb7882..bfef73a7c4e 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -2343,8 +2343,8 @@ cdef class Scanner(_Weakrefable):
             called to reduce their size.
         batch_readahead : int, default 16
             The number of batches to read ahead in a file. This might not work
-            for all file formats like CSV. Increasing this number will increase
-            RAM usage but also improve IO utilization.
+            for all file formats. Increasing this number will increase
+            RAM usage but could also improve IO utilization.
         fragment_readahead : int, default 4
             The number of files to read ahead. Increasing this number will increase
             RAM usage but also improve IO utilization.

From 16ee6d7dc3cc121a982a2753d182155aa53d2d98 Mon Sep 17 00:00:00 2001
From: Ziheng Wang <zihengw@stanford.edu>
Date: Fri, 19 Aug 2022 16:06:38 -0700
Subject: [PATCH 08/15] Update python/pyarrow/_dataset.pyx

Co-authored-by: Weston Pace <weston.pace@gmail.com>
---
 python/pyarrow/_dataset.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index bfef73a7c4e..e89acac2f44 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -2430,8 +2430,8 @@ cdef class Scanner(_Weakrefable):
             called to reduce their size.
         batch_readahead : int, default 16
             The number of batches to read ahead in a file. This might not work
-            for all file formats like CSV. Increasing this number will increase
-            RAM usage but also improve IO utilization.
+            for all file formats. Increasing this number will increase
+            RAM usage but could also improve IO utilization.
         use_threads : bool, default True
             If enabled, then maximum parallelism will be used determined by
             the number of available CPU cores.

From b266b95567027810a6d1f661ce5a210bcb68a0f0 Mon Sep 17 00:00:00 2001
From: Ziheng Wang <zihengw@stanford.edu>
Date: Thu, 25 Aug 2022 14:18:37 -0700
Subject: [PATCH 09/15] Update python/pyarrow/_dataset.pyx

Co-authored-by: Weston Pace <weston.pace@gmail.com>
---
 python/pyarrow/_dataset.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index e89acac2f44..dd51060fcc6 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -2347,7 +2347,7 @@ cdef class Scanner(_Weakrefable):
             RAM usage but could also improve IO utilization.
         fragment_readahead : int, default 4
             The number of files to read ahead. Increasing this number will increase
-            RAM usage but also improve IO utilization.
+            RAM usage but could also improve IO utilization.
         use_threads : bool, default True
             If enabled, then maximum parallelism will be used determined by
             the number of available CPU cores.

From 6a1eeb1521883ae8f55bb75fdf16c70b793c58af Mon Sep 17 00:00:00 2001
From: Ziheng Wang <zihengw@stanford.edu>
Date: Thu, 25 Aug 2022 14:18:43 -0700
Subject: [PATCH 10/15] Update python/pyarrow/_dataset.pyx

Co-authored-by: Weston Pace <weston.pace@gmail.com>
---
 python/pyarrow/_dataset.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index dd51060fcc6..b9e333c3d28 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -2261,10 +2261,10 @@ cdef class Scanner(_Weakrefable):
         called to reduce their size.
     batch_readahead : int, default 16
         The number of batches to read ahead in a file. Increasing this number 
-        will increase RAM usage but also improve IO utilization.
+        will increase RAM usage but could also improve IO utilization.
     fragment_readahead : int, default 4
         The number of files to read ahead. Increasing this number will increase
-        RAM usage but also improve IO utilization.
+        RAM usage but could also improve IO utilization.
     use_threads : bool, default True
         If enabled, then maximum parallelism will be used determined by
         the number of available CPU cores.

From 7a8b85eb2618aea176f63bc55824147399ff447b Mon Sep 17 00:00:00 2001
From: Ziheng Wang <zihengw@stanford.edu>
Date: Thu, 25 Aug 2022 14:18:51 -0700
Subject: [PATCH 11/15] Update cpp/src/arrow/dataset/scanner.h

Co-authored-by: Weston Pace <weston.pace@gmail.com>
---
 cpp/src/arrow/dataset/scanner.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h
index 48177884822..1e2f88fd6e0 100644
--- a/cpp/src/arrow/dataset/scanner.h
+++ b/cpp/src/arrow/dataset/scanner.h
@@ -394,7 +394,7 @@ class ARROW_DS_EXPORT ScannerBuilder {
   /// \brief Set the number of fragments to read ahead
   ///
   /// \param[in] fragment_readahead How many fragments to read ahead
-  /// \returns An error if this number is less than 0.
+  /// \returns an error if this number is less than 0.
   ///
   /// This option provides a control on the RAM vs IO tradeoff.
   Status FragmentReadahead(int32_t fragment_readahead);

From 5d05d6374bbb469baec4fc7cd8dc008514e3da9d Mon Sep 17 00:00:00 2001
From: Ziheng Wang <zihengw@stanford.edu>
Date: Wed, 31 Aug 2022 18:59:31 -0700
Subject: [PATCH 12/15] added a very simple test

---
 python/pyarrow/tests/test_dataset.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index b900e694a91..34594fa8d14 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -423,6 +423,16 @@ def test_dataset(dataset, dataset_reader):
                              False, False, True, True]
 
 
+@pytest.mark.parquet
+def test_scanner_options(dataset, dataset_reader):
+    scanner = dataset_reader.scanner(
+        dataset, memory_pool=pa.default_memory_pool())
+    assert isinstance(scanner, ds.Scanner)
+    for batch in scanner.to_batches(fragment_readahead=16, batch_readahead=8):
+        assert batch.num_columns == 1
+        assert batch.schema == scanner.projected_schema
+
+
 @pytest.mark.parquet
 def test_scanner(dataset, dataset_reader):
     scanner = dataset_reader.scanner(

From 66e3fe2988dd674132dd1e48115c8f3ae40358a6 Mon Sep 17 00:00:00 2001
From: Ziheng Wang <zihengw@stanford.edu>
Date: Wed, 31 Aug 2022 19:00:06 -0700
Subject: [PATCH 13/15] Update cpp/src/arrow/dataset/scanner.h

Co-authored-by: Antoine Pitrou <pitrou@free.fr>
---
 cpp/src/arrow/dataset/scanner.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h
index 1e2f88fd6e0..646cc0de72e 100644
--- a/cpp/src/arrow/dataset/scanner.h
+++ b/cpp/src/arrow/dataset/scanner.h
@@ -396,7 +396,7 @@ class ARROW_DS_EXPORT ScannerBuilder {
   /// \param[in] fragment_readahead How many fragments to read ahead
   /// \returns an error if this number is less than 0.
   ///
-  /// This option provides a control on the RAM vs IO tradeoff.
+  /// This option provides a control on the RAM vs I/O tradeoff.
   Status FragmentReadahead(int32_t fragment_readahead);
 
   /// \brief Set the pool from which materialized and scanned arrays will be allocated.

From 4ce34732ecefac3bb203e6817e81e4edf9fec9be Mon Sep 17 00:00:00 2001
From: Ziheng Wang <zihengw@stanford.edu>
Date: Wed, 31 Aug 2022 20:07:19 -0700
Subject: [PATCH 14/15] Simplify test_scanner_options to call dataset.to_batches directly

---
 python/pyarrow/tests/test_dataset.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 34594fa8d14..e2b1a137ef2 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -424,13 +424,10 @@ def test_dataset(dataset, dataset_reader):
 
 
 @pytest.mark.parquet
-def test_scanner_options(dataset, dataset_reader):
-    scanner = dataset_reader.scanner(
-        dataset, memory_pool=pa.default_memory_pool())
-    assert isinstance(scanner, ds.Scanner)
-    for batch in scanner.to_batches(fragment_readahead=16, batch_readahead=8):
-        assert batch.num_columns == 1
-        assert batch.schema == scanner.projected_schema
+def test_scanner_options(dataset):
+    scanner = dataset.to_batches(fragment_readahead=16, batch_readahead=8)
+    batch = next(scanner)
+    assert batch.num_columns == 1
 
 
 @pytest.mark.parquet

From e85e9e3a771fa9bd4aa58da163b5566c35863ce1 Mon Sep 17 00:00:00 2001
From: Ziheng Wang <zihengw@stanford.edu>
Date: Wed, 31 Aug 2022 20:46:45 -0700
Subject: [PATCH 15/15] Fix expected column count in test_scanner_options

---
 python/pyarrow/tests/test_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index e2b1a137ef2..851b25a2642 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -427,7 +427,7 @@ def test_dataset(dataset, dataset_reader):
 def test_scanner_options(dataset):
     scanner = dataset.to_batches(fragment_readahead=16, batch_readahead=8)
     batch = next(scanner)
-    assert batch.num_columns == 1
+    assert batch.num_columns == 7
 
 
 @pytest.mark.parquet