From 0466f0dffb96130c030f766d3082ebcd45156764 Mon Sep 17 00:00:00 2001 From: Ziheng Wang Date: Thu, 4 Aug 2022 13:46:55 -0700 Subject: [PATCH 01/15] implemented exposing flags --- cpp/src/arrow/dataset/scanner.cc | 27 +++++++++----- cpp/src/arrow/dataset/scanner.h | 20 +++++++++-- python/pyarrow/_dataset.pyx | 38 +++++++++++++++++--- python/pyarrow/includes/libarrow_dataset.pxd | 2 ++ 4 files changed, 71 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc index 0ef1d4577cd..badd18bf318 100644 --- a/cpp/src/arrow/dataset/scanner.cc +++ b/cpp/src/arrow/dataset/scanner.cc @@ -826,15 +826,6 @@ Status ScannerBuilder::UseThreads(bool use_threads) { return Status::OK(); } -Status ScannerBuilder::FragmentReadahead(int fragment_readahead) { - if (fragment_readahead <= 0) { - return Status::Invalid("FragmentReadahead must be greater than 0, got ", - fragment_readahead); - } - scan_options_->fragment_readahead = fragment_readahead; - return Status::OK(); -} - Status ScannerBuilder::BatchSize(int64_t batch_size) { if (batch_size <= 0) { return Status::Invalid("BatchSize must be greater than 0, got ", batch_size); @@ -843,6 +834,24 @@ Status ScannerBuilder::BatchSize(int64_t batch_size) { return Status::OK(); } +Status ScannerBuilder::BatchReadahead(int32_t batch_readahead) { + if (batch_readahead < 0) { + return Status::Invalid("BatchReadahead must be greater than or equal 0, got ", + batch_readahead); + } + scan_options_->batch_readahead = batch_readahead; + return Status::OK(); +} + +Status ScannerBuilder::FragmentReadahead(int32_t fragment_readahead) { + if (fragment_readahead < 0) { + return Status::Invalid("FragmentReadahead must be greater than or equal 0, got ", + fragment_readahead); + } + scan_options_->fragment_readahead = fragment_readahead; + return Status::OK(); +} + Status ScannerBuilder::Pool(MemoryPool* pool) { scan_options_->pool = pool; return Status::OK(); diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h index 7098bad8f45..d25891d26c9 100644 --- a/cpp/src/arrow/dataset/scanner.h +++ b/cpp/src/arrow/dataset/scanner.h @@ -373,9 +373,6 @@ class ARROW_DS_EXPORT ScannerBuilder { /// ThreadPool found in ScanOptions; Status UseThreads(bool use_threads = true); - /// \brief Limit how many fragments the scanner will read at once - Status FragmentReadahead(int fragment_readahead); - /// \brief Set the maximum number of rows per RecordBatch. /// /// \param[in] batch_size the maximum number of rows. @@ -384,6 +381,23 @@ class ARROW_DS_EXPORT ScannerBuilder { /// This option provides a control limiting the memory owned by any RecordBatch. Status BatchSize(int64_t batch_size); + /// \brief Set the number of batches to read aheadw ithin a file. + /// + /// \param[in] batch_readahead How many batches to read ahead within a file, + /// might not work for all formats. refer to comments above. + /// \returns An error if this number is less than 0. + /// + /// This option provides a control on RAM vs IO tradeoff. + Status BatchReadahead(int32_t batch_readahead); + + /// \brief Set the number of How many files to read ahead + /// + /// \param[in] fragment_readahead How many files to read ahead + /// \returns An error if this number is less than 0. + /// + /// This option provides a control on RAM vs IO tradeoff. + Status FragmentReadahead(int32_t fragment_readahead); + /// \brief Set the pool from which materialized and scanned arrays will be allocated. Status Pool(MemoryPool* pool); diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 68833a5350e..4b4213d753a 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -2168,11 +2168,14 @@ cdef class TaggedRecordBatchIterator(_Weakrefable): _DEFAULT_BATCH_SIZE = 2**17 - +_DEFAULT_BATCH_READAHEAD = 16 +_DEFAULT_FRAGMENT_READAHEAD = 4 cdef void _populate_builder(const shared_ptr[CScannerBuilder]& ptr, object columns=None, Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, bint use_threads=True, MemoryPool memory_pool=None, FragmentScanOptions fragment_scan_options=None)\ except *: @@ -2207,6 +2210,8 @@ cdef void _populate_builder(const shared_ptr[CScannerBuilder]& ptr, ) check_status(builder.BatchSize(batch_size)) + check_status(builder.BatchReadahead(batch_readahead)) + check_status(builder.FragmentReadahead(fragment_readahead)) check_status(builder.UseThreads(use_threads)) if memory_pool: check_status(builder.Pool(maybe_unbox_memory_pool(memory_pool))) @@ -2254,6 +2259,13 @@ cdef class Scanner(_Weakrefable): The maximum row count for scanned record batches. If scanned record batches are overflowing memory then this method can be called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats like CSV. Increasing this number will increase + RAM usage but also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but also improve IO utilization. use_threads : bool, default True If enabled, then maximum parallelism will be used determined by the number of available CPU cores. @@ -2291,6 +2303,8 @@ cdef class Scanner(_Weakrefable): MemoryPool memory_pool=None, object columns=None, Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, + int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, FragmentScanOptions fragment_scan_options=None): """ Create Scanner from Dataset, @@ -2328,6 +2342,13 @@ cdef class Scanner(_Weakrefable): The maximum row count for scanned record batches. If scanned record batches are overflowing memory then this method can be called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats like CSV. Increasing this number will increase + RAM usage but also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but also improve IO utilization. use_threads : bool, default True If enabled, then maximum parallelism will be used determined by the number of available CPU cores. @@ -2354,7 +2375,8 @@ cdef class Scanner(_Weakrefable): builder = make_shared[CScannerBuilder](dataset.unwrap(), options) _populate_builder(builder, columns=columns, filter=filter, - batch_size=batch_size, use_threads=use_threads, + batch_size=batch_size, batch_readahead=batch_readahead, + fragment_readahead=fragment_readahead, use_threads=use_threads, memory_pool=memory_pool, fragment_scan_options=fragment_scan_options) @@ -2367,6 +2389,7 @@ cdef class Scanner(_Weakrefable): MemoryPool memory_pool=None, object columns=None, Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE, + int batch_readahead=_DEFAULT_BATCH_READAHEAD, FragmentScanOptions fragment_scan_options=None): """ Create Scanner from Fragment, @@ -2406,6 +2429,10 @@ cdef class Scanner(_Weakrefable): The maximum row count for scanned record batches. If scanned record batches are overflowing memory then this method can be called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats like CSV. Increasing this number will increase + RAM usage but also improve IO utilization. use_threads : bool, default True If enabled, then maximum parallelism will be used determined by the number of available CPU cores. @@ -2435,7 +2462,9 @@ cdef class Scanner(_Weakrefable): builder = make_shared[CScannerBuilder](pyarrow_unwrap_schema(schema), fragment.unwrap(), options) _populate_builder(builder, columns=columns, filter=filter, - batch_size=batch_size, use_threads=use_threads, + batch_size=batch_size, batch_readahead=batch_readahead, + fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, + use_threads=use_threads, memory_pool=memory_pool, fragment_scan_options=fragment_scan_options) @@ -2508,7 +2537,8 @@ cdef class Scanner(_Weakrefable): FutureWarning) _populate_builder(builder, columns=columns, filter=filter, - batch_size=batch_size, use_threads=use_threads, + batch_size=batch_size, batch_readahead=_DEFAULT_BATCH_READAHEAD, + fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD, use_threads=use_threads, memory_pool=memory_pool, fragment_scan_options=fragment_scan_options) scanner = GetResultValue(builder.get().Finish()) diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index bd8fbd1b56a..d418830bc2f 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -122,6 +122,8 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: CStatus UseThreads(c_bool use_threads) CStatus Pool(CMemoryPool* pool) CStatus BatchSize(int64_t batch_size) + CStatus BatchReadahead(int32_t batch_readahead) + CStatus FragmentReadahead(int32_t fragment_readahead) CStatus FragmentScanOptions( shared_ptr[CFragmentScanOptions] fragment_scan_options) CResult[shared_ptr[CScanner]] Finish() From 0e6368983a128928d197a74520e266f46d47233e Mon Sep 17 00:00:00 2001 From: Ziheng Wang Date: Thu, 4 Aug 2022 14:20:21 -0700 Subject: [PATCH 02/15] fix typo --- cpp/src/arrow/dataset/scanner.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h index d25891d26c9..6f6541719ba 100644 --- a/cpp/src/arrow/dataset/scanner.h +++ b/cpp/src/arrow/dataset/scanner.h @@ -381,7 +381,7 @@ class ARROW_DS_EXPORT ScannerBuilder { /// This option provides a control limiting the memory owned by any RecordBatch. Status BatchSize(int64_t batch_size); - /// \brief Set the number of batches to read aheadw ithin a file. + /// \brief Set the number of batches to read ahead within a file. /// /// \param[in] batch_readahead How many batches to read ahead within a file, /// might not work for all formats. refer to comments above. From 3103dc6346fc41da5ebd415014ec7d0533be236a Mon Sep 17 00:00:00 2001 From: Ziheng Wang Date: Thu, 4 Aug 2022 17:00:50 -0700 Subject: [PATCH 03/15] updated docs --- cpp/src/arrow/dataset/scanner.h | 8 ++++---- python/pyarrow/_dataset.pyx | 5 ++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h index 6f6541719ba..e9b66e70a6d 100644 --- a/cpp/src/arrow/dataset/scanner.h +++ b/cpp/src/arrow/dataset/scanner.h @@ -384,15 +384,15 @@ class ARROW_DS_EXPORT ScannerBuilder { /// \brief Set the number of batches to read ahead within a file. /// /// \param[in] batch_readahead How many batches to read ahead within a file, - /// might not work for all formats. refer to comments above. + /// might not work for all formats. /// \returns An error if this number is less than 0. /// - /// This option provides a control on RAM vs IO tradeoff. + /// This option provides a control on RAM vs I/O tradeoff. Status BatchReadahead(int32_t batch_readahead); - /// \brief Set the number of How many files to read ahead + /// \brief Set the number of fragments to read ahead /// - /// \param[in] fragment_readahead How many files to read ahead + /// \param[in] fragment_readahead How many fragments to read ahead /// \returns An error if this number is less than 0. /// /// This option provides a control on RAM vs IO tradeoff. diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 4b4213d753a..caec6fb7882 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -2260,9 +2260,8 @@ cdef class Scanner(_Weakrefable): record batches are overflowing memory then this method can be called to reduce their size. batch_readahead : int, default 16 - The number of batches to read ahead in a file. This might not work - for all file formats like CSV. Increasing this number will increase - RAM usage but also improve IO utilization. + The number of batches to read ahead in a file. Increasing this number + will increase RAM usage but also improve IO utilization. fragment_readahead : int, default 4 The number of files to read ahead. Increasing this number will increase RAM usage but also improve IO utilization. From 0277eafe0cb29d818a42ac16fd6bf3ef2186f5fe Mon Sep 17 00:00:00 2001 From: Ziheng Wang Date: Tue, 9 Aug 2022 20:48:10 -0700 Subject: [PATCH 04/15] addressed suggestions --- cpp/src/arrow/dataset/scanner.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h index e9b66e70a6d..56a7168c82b 100644 --- a/cpp/src/arrow/dataset/scanner.h +++ b/cpp/src/arrow/dataset/scanner.h @@ -82,7 +82,7 @@ struct ARROW_DS_EXPORT ScanOptions { /// Maximum row count for scanned batches. int64_t batch_size = kDefaultBatchSize; - /// How many batches to read ahead within a file + /// How many batches to read ahead within a fragment. /// /// Set to 0 to disable batch readahead /// @@ -381,13 +381,15 @@ class ARROW_DS_EXPORT ScannerBuilder { /// This option provides a control limiting the memory owned by any RecordBatch. Status BatchSize(int64_t batch_size); - /// \brief Set the number of batches to read ahead within a file. + /// \brief Set the number of batches to read ahead within a fragment. /// - /// \param[in] batch_readahead How many batches to read ahead within a file, - /// might not work for all formats. + /// \param[in] batch_readahead How many batches to read ahead within a fragment, + /// might not work for all formats. /// \returns An error if this number is less than 0. /// /// This option provides a control on RAM vs I/O tradeoff. + /// It might not be support by all file formats, in which case it will + /// simply be ignored. Status BatchReadahead(int32_t batch_readahead); /// \brief Set the number of fragments to read ahead From c2f2cf6e79526fda7a34ee1a28e844e6039dba7c Mon Sep 17 00:00:00 2001 From: Ziheng Wang Date: Fri, 19 Aug 2022 16:06:20 -0700 Subject: [PATCH 05/15] Update cpp/src/arrow/dataset/scanner.h Co-authored-by: Weston Pace --- cpp/src/arrow/dataset/scanner.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h index 56a7168c82b..01c62d08c0c 100644 --- a/cpp/src/arrow/dataset/scanner.h +++ b/cpp/src/arrow/dataset/scanner.h @@ -383,12 +383,11 @@ class ARROW_DS_EXPORT ScannerBuilder { /// \brief Set the number of batches to read ahead within a fragment. /// - /// \param[in] batch_readahead How many batches to read ahead within a fragment, - /// might not work for all formats. - /// \returns An error if this number is less than 0. + /// \param[in] batch_readahead How many batches to read ahead within a fragment + /// \returns an error if this number is less than 0. /// - /// This option provides a control on RAM vs I/O tradeoff. - /// It might not be support by all file formats, in which case it will + /// This option provides a control on the RAM vs I/O tradeoff. + /// It might not be supported by all file formats, in which case it will /// simply be ignored. Status BatchReadahead(int32_t batch_readahead); From 606807197ebc21c6c964a72aefb6fca921751e09 Mon Sep 17 00:00:00 2001 From: Ziheng Wang Date: Fri, 19 Aug 2022 16:06:27 -0700 Subject: [PATCH 06/15] Update cpp/src/arrow/dataset/scanner.h Co-authored-by: Weston Pace --- cpp/src/arrow/dataset/scanner.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h index 01c62d08c0c..48177884822 100644 --- a/cpp/src/arrow/dataset/scanner.h +++ b/cpp/src/arrow/dataset/scanner.h @@ -396,7 +396,7 @@ class ARROW_DS_EXPORT ScannerBuilder { /// \param[in] fragment_readahead How many fragments to read ahead /// \returns An error if this number is less than 0. /// - /// This option provides a control on RAM vs IO tradeoff. + /// This option provides a control on the RAM vs IO tradeoff. Status FragmentReadahead(int32_t fragment_readahead); /// \brief Set the pool from which materialized and scanned arrays will be allocated. From 185f2529198d854ee95ede94c6b3637875483bc8 Mon Sep 17 00:00:00 2001 From: Ziheng Wang Date: Fri, 19 Aug 2022 16:06:32 -0700 Subject: [PATCH 07/15] Update python/pyarrow/_dataset.pyx Co-authored-by: Weston Pace --- python/pyarrow/_dataset.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index caec6fb7882..bfef73a7c4e 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -2343,8 +2343,8 @@ cdef class Scanner(_Weakrefable): called to reduce their size. batch_readahead : int, default 16 The number of batches to read ahead in a file. This might not work - for all file formats like CSV. Increasing this number will increase - RAM usage but also improve IO utilization. + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. fragment_readahead : int, default 4 The number of files to read ahead. Increasing this number will increase RAM usage but also improve IO utilization. From 16ee6d7dc3cc121a982a2753d182155aa53d2d98 Mon Sep 17 00:00:00 2001 From: Ziheng Wang Date: Fri, 19 Aug 2022 16:06:38 -0700 Subject: [PATCH 08/15] Update python/pyarrow/_dataset.pyx Co-authored-by: Weston Pace --- python/pyarrow/_dataset.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index bfef73a7c4e..e89acac2f44 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -2430,8 +2430,8 @@ cdef class Scanner(_Weakrefable): called to reduce their size. batch_readahead : int, default 16 The number of batches to read ahead in a file. This might not work - for all file formats like CSV. Increasing this number will increase - RAM usage but also improve IO utilization. + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. use_threads : bool, default True If enabled, then maximum parallelism will be used determined by the number of available CPU cores. From b266b95567027810a6d1f661ce5a210bcb68a0f0 Mon Sep 17 00:00:00 2001 From: Ziheng Wang Date: Thu, 25 Aug 2022 14:18:37 -0700 Subject: [PATCH 09/15] Update python/pyarrow/_dataset.pyx Co-authored-by: Weston Pace --- python/pyarrow/_dataset.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index e89acac2f44..dd51060fcc6 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -2347,7 +2347,7 @@ cdef class Scanner(_Weakrefable): RAM usage but could also improve IO utilization. fragment_readahead : int, default 4 The number of files to read ahead. Increasing this number will increase - RAM usage but also improve IO utilization. + RAM usage but could also improve IO utilization. use_threads : bool, default True If enabled, then maximum parallelism will be used determined by the number of available CPU cores. From 6a1eeb1521883ae8f55bb75fdf16c70b793c58af Mon Sep 17 00:00:00 2001 From: Ziheng Wang Date: Thu, 25 Aug 2022 14:18:43 -0700 Subject: [PATCH 10/15] Update python/pyarrow/_dataset.pyx Co-authored-by: Weston Pace --- python/pyarrow/_dataset.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index dd51060fcc6..b9e333c3d28 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -2261,10 +2261,10 @@ cdef class Scanner(_Weakrefable): called to reduce their size. batch_readahead : int, default 16 The number of batches to read ahead in a file. Increasing this number - will increase RAM usage but also improve IO utilization. + will increase RAM usage but could also improve IO utilization. fragment_readahead : int, default 4 The number of files to read ahead. Increasing this number will increase - RAM usage but also improve IO utilization. + RAM usage but could also improve IO utilization. use_threads : bool, default True If enabled, then maximum parallelism will be used determined by the number of available CPU cores. From 7a8b85eb2618aea176f63bc55824147399ff447b Mon Sep 17 00:00:00 2001 From: Ziheng Wang Date: Thu, 25 Aug 2022 14:18:51 -0700 Subject: [PATCH 11/15] Update cpp/src/arrow/dataset/scanner.h Co-authored-by: Weston Pace --- cpp/src/arrow/dataset/scanner.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h index 48177884822..1e2f88fd6e0 100644 --- a/cpp/src/arrow/dataset/scanner.h +++ b/cpp/src/arrow/dataset/scanner.h @@ -394,7 +394,7 @@ class ARROW_DS_EXPORT ScannerBuilder { /// \brief Set the number of fragments to read ahead /// /// \param[in] fragment_readahead How many fragments to read ahead - /// \returns An error if this number is less than 0. + /// \returns an error if this number is less than 0. /// /// This option provides a control on the RAM vs IO tradeoff. Status FragmentReadahead(int32_t fragment_readahead); From 5d05d6374bbb469baec4fc7cd8dc008514e3da9d Mon Sep 17 00:00:00 2001 From: Ziheng Wang Date: Wed, 31 Aug 2022 18:59:31 -0700 Subject: [PATCH 12/15] added a very simple test --- python/pyarrow/tests/test_dataset.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index b900e694a91..34594fa8d14 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -423,6 +423,16 @@ def test_dataset(dataset, dataset_reader): False, False, True, True] +@pytest.mark.parquet +def test_scanner_options(dataset, dataset_reader): + scanner = dataset_reader.scanner( + dataset, memory_pool=pa.default_memory_pool()) + assert isinstance(scanner, ds.Scanner) + for batch in scanner.to_batches(fragment_readahead=16, batch_readahead=8): + assert batch.num_columns == 1 + assert batch.schema == scanner.projected_schema + + @pytest.mark.parquet def test_scanner(dataset, dataset_reader): scanner = dataset_reader.scanner( From 66e3fe2988dd674132dd1e48115c8f3ae40358a6 Mon Sep 17 00:00:00 2001 From: Ziheng Wang Date: Wed, 31 Aug 2022 19:00:06 -0700 Subject: [PATCH 13/15] Update cpp/src/arrow/dataset/scanner.h Co-authored-by: Antoine Pitrou --- cpp/src/arrow/dataset/scanner.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h index 1e2f88fd6e0..646cc0de72e 100644 --- a/cpp/src/arrow/dataset/scanner.h +++ b/cpp/src/arrow/dataset/scanner.h @@ -396,7 +396,7 @@ class ARROW_DS_EXPORT ScannerBuilder { /// \param[in] fragment_readahead How many fragments to read ahead /// \returns an error if this number is less than 0. /// - /// This option provides a control on the RAM vs IO tradeoff. + /// This option provides a control on the RAM vs I/O tradeoff. Status FragmentReadahead(int32_t fragment_readahead); /// \brief Set the pool from which materialized and scanned arrays will be allocated. From 4ce34732ecefac3bb203e6817e81e4edf9fec9be Mon Sep 17 00:00:00 2001 From: Ziheng Wang Date: Wed, 31 Aug 2022 20:07:19 -0700 Subject: [PATCH 14/15] try again --- python/pyarrow/tests/test_dataset.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 34594fa8d14..e2b1a137ef2 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -424,13 +424,10 @@ def test_dataset(dataset, dataset_reader): @pytest.mark.parquet -def test_scanner_options(dataset, dataset_reader): - scanner = dataset_reader.scanner( - dataset, memory_pool=pa.default_memory_pool()) - assert isinstance(scanner, ds.Scanner) - for batch in scanner.to_batches(fragment_readahead=16, batch_readahead=8): - assert batch.num_columns == 1 - assert batch.schema == scanner.projected_schema +def test_scanner_options(dataset): + scanner = dataset.to_batches(fragment_readahead=16, batch_readahead=8) + batch = next(scanner) + assert batch.num_columns == 1 @pytest.mark.parquet From e85e9e3a771fa9bd4aa58da163b5566c35863ce1 Mon Sep 17 00:00:00 2001 From: Ziheng Wang Date: Wed, 31 Aug 2022 20:46:45 -0700 Subject: [PATCH 15/15] now it works --- python/pyarrow/tests/test_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index e2b1a137ef2..851b25a2642 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -427,7 +427,7 @@ def test_dataset(dataset, dataset_reader): def test_scanner_options(dataset): scanner = dataset.to_batches(fragment_readahead=16, batch_readahead=8) batch = next(scanner) - assert batch.num_columns == 1 + assert batch.num_columns == 7 @pytest.mark.parquet