From 3f9508a368fb80f37d6739bc569bed840bb9494e Mon Sep 17 00:00:00 2001
From: David Li
Date: Fri, 12 Mar 2021 10:46:13 -0500
Subject: [PATCH 1/4] ARROW-9749: [C++][Dataset] Add ConvertOptions for CSV
datasets
---
cpp/src/arrow/dataset/dataset.h | 13 +++++++++++++
cpp/src/arrow/dataset/file_csv.cc | 19 ++++++++++++++-----
cpp/src/arrow/dataset/file_csv.h | 12 +++++++++++-
cpp/src/arrow/dataset/file_csv_test.cc | 22 ++++++++++++++++++++++
cpp/src/arrow/dataset/scanner.h | 3 +++
5 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h
index afdbe328d5f..30b7449f31a 100644
--- a/cpp/src/arrow/dataset/dataset.h
+++ b/cpp/src/arrow/dataset/dataset.h
@@ -89,6 +89,19 @@ class ARROW_DS_EXPORT Fragment : public std::enable_shared_from_this<Fragment> {
std::shared_ptr<Schema> physical_schema_;
};
+/// \brief Per-scan options for fragment(s) in a dataset.
+///
+/// These options are not intrinsic to the format or fragment itself, but do affect
+/// the results of a scan. These are options which make sense to change between
+/// repeated reads of the same dataset, such as format-specific conversion options
+/// (that do not affect the schema).
+class ARROW_DS_EXPORT FragmentScanOptions {
+ public:
+ virtual std::string type_name() const = 0;
+ virtual std::string ToString() const { return type_name(); }
+ virtual ~FragmentScanOptions() = default;
+};
+
/// \brief A trivial Fragment that yields ScanTask out of a fixed set of
/// RecordBatch.
class ARROW_DS_EXPORT InMemoryFragment : public Fragment {
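The FragmentScanOptions hook above is format-agnostic: any format can declare its own subclass and consumers dispatch on type_name() before downcasting. As a rough illustration only (this hypothetical "json" subclass is not part of this patch series), another format would follow the same pattern as the CSV support added below:

    // Hypothetical sketch, not included in this patch: a format-specific
    // options subclass mirroring CsvFragmentScanOptions.
    class JsonFragmentScanOptions : public FragmentScanOptions {
     public:
      std::string type_name() const override { return "json"; }
      // Per-scan knobs that do not affect the schema.
      bool newlines_in_values = false;
    };

The format implementation then compares fragment_scan_options->type_name() against its own type name and uses internal::checked_pointer_cast to recover the concrete options, exactly as file_csv.cc does below.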
diff --git a/cpp/src/arrow/dataset/file_csv.cc b/cpp/src/arrow/dataset/file_csv.cc
index 0c023b87dcd..7cc4a8375f0 100644
--- a/cpp/src/arrow/dataset/file_csv.cc
+++ b/cpp/src/arrow/dataset/file_csv.cc
@@ -76,12 +76,19 @@ Result<std::unordered_set<std::string>> GetColumnNames(
static inline Result<csv::ConvertOptions> GetConvertOptions(
const CsvFileFormat& format, const std::shared_ptr<ScanOptions>& scan_options,
-    const Buffer& first_block, MemoryPool* pool) {
+    const std::shared_ptr<ScanContext>& scan_context, const Buffer& first_block,
+ MemoryPool* pool) {
ARROW_ASSIGN_OR_RAISE(
auto column_names,
GetColumnNames(format.parse_options, util::string_view{first_block}, pool));
auto convert_options = csv::ConvertOptions::Defaults();
+ if (scan_context && scan_context->fragment_scan_options &&
+ scan_context->fragment_scan_options->type_name() == kCsvTypeName) {
+    auto csv_scan_options = internal::checked_pointer_cast<CsvFragmentScanOptions>(
+ scan_context->fragment_scan_options);
+ convert_options = csv_scan_options->convert_options;
+ }
for (FieldRef ref : scan_options->MaterializedFields()) {
ARROW_ASSIGN_OR_RAISE(auto field, ref.GetOne(*scan_options->dataset_schema));
@@ -104,6 +111,7 @@ static inline csv::ReadOptions GetReadOptions(const CsvFileFormat& format) {
static inline Result<std::shared_ptr<csv::StreamingReader>> OpenReader(
const FileSource& source, const CsvFileFormat& format,
const std::shared_ptr<ScanOptions>& scan_options = nullptr,
+    const std::shared_ptr<ScanContext>& scan_context = nullptr,
MemoryPool* pool = default_memory_pool()) {
ARROW_ASSIGN_OR_RAISE(auto input, source.Open());
@@ -115,8 +123,9 @@ static inline Result<std::shared_ptr<csv::StreamingReader>> OpenReader(
auto convert_options = csv::ConvertOptions::Defaults();
if (scan_options != nullptr) {
- ARROW_ASSIGN_OR_RAISE(convert_options,
- GetConvertOptions(format, scan_options, *first_block, pool));
+ ARROW_ASSIGN_OR_RAISE(
+ convert_options,
+ GetConvertOptions(format, scan_options, scan_context, *first_block, pool));
}
auto maybe_reader =
@@ -141,8 +150,8 @@ class CsvScanTask : public ScanTask {
source_(fragment->source()) {}
Result<RecordBatchIterator> Execute() override {
- ARROW_ASSIGN_OR_RAISE(auto reader,
- OpenReader(source_, *format_, options(), context()->pool));
+ ARROW_ASSIGN_OR_RAISE(auto reader, OpenReader(source_, *format_, options(), context(),
+ context()->pool));
return IteratorFromReader(std::move(reader));
}
diff --git a/cpp/src/arrow/dataset/file_csv.h b/cpp/src/arrow/dataset/file_csv.h
index 1e83189ce04..b7a9388a324 100644
--- a/cpp/src/arrow/dataset/file_csv.h
+++ b/cpp/src/arrow/dataset/file_csv.h
@@ -21,6 +21,7 @@
#include
#include "arrow/csv/options.h"
+#include "arrow/dataset/dataset.h"
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
@@ -29,13 +30,15 @@
namespace arrow {
namespace dataset {
+constexpr char kCsvTypeName[] = "csv";
+
/// \brief A FileFormat implementation that reads from and writes to Csv files
class ARROW_DS_EXPORT CsvFileFormat : public FileFormat {
public:
/// Options affecting the parsing of CSV files
csv::ParseOptions parse_options = csv::ParseOptions::Defaults();
- std::string type_name() const override { return "csv"; }
+ std::string type_name() const override { return kCsvTypeName; }
bool Equals(const FileFormat& other) const override;
@@ -58,5 +61,12 @@ class ARROW_DS_EXPORT CsvFileFormat : public FileFormat {
std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override { return NULLPTR; }
};
+class ARROW_DS_EXPORT CsvFragmentScanOptions : public FragmentScanOptions {
+ public:
+ std::string type_name() const override { return kCsvTypeName; }
+
+ csv::ConvertOptions convert_options = csv::ConvertOptions::Defaults();
+};
+
} // namespace dataset
} // namespace arrow
diff --git a/cpp/src/arrow/dataset/file_csv_test.cc b/cpp/src/arrow/dataset/file_csv_test.cc
index 5c27f81b094..aa9a2b186d0 100644
--- a/cpp/src/arrow/dataset/file_csv_test.cc
+++ b/cpp/src/arrow/dataset/file_csv_test.cc
@@ -81,6 +81,28 @@ N/A
ASSERT_EQ(row_count, 3);
}
+TEST_F(TestCsvFileFormat, CustomConvertOptions) {
+ auto source = GetFileSource(R"(str
+foo
+MYNULL
+N/A
+bar)");
+ SetSchema({field("str", utf8())});
+ ASSERT_OK_AND_ASSIGN(auto fragment, format_->MakeFragment(*source));
+  auto fragment_scan_options = std::make_shared<CsvFragmentScanOptions>();
+ fragment_scan_options->convert_options.null_values = {"MYNULL"};
+ fragment_scan_options->convert_options.strings_can_be_null = true;
+ ctx_->fragment_scan_options = fragment_scan_options;
+
+ int64_t null_count = 0;
+ for (auto maybe_batch : Batches(fragment.get())) {
+ ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch);
+ null_count += batch->GetColumnByName("str")->null_count();
+ }
+
+ ASSERT_EQ(null_count, 1);
+}
+
TEST_F(TestCsvFileFormat, ScanRecordBatchReaderWithVirtualColumn) {
auto source = GetFileSource(R"(f64
1.0
diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h
index d335fe4d4bd..4af84a4a929 100644
--- a/cpp/src/arrow/dataset/scanner.h
+++ b/cpp/src/arrow/dataset/scanner.h
@@ -46,6 +46,9 @@ struct ARROW_DS_EXPORT ScanContext {
/// Indicate if the Scanner should make use of a ThreadPool.
bool use_threads = false;
+ /// Fragment-specific scan options.
+  std::shared_ptr<FragmentScanOptions> fragment_scan_options;
+
/// Return a threaded or serial TaskGroup according to use_threads.
std::shared_ptr<internal::TaskGroup> TaskGroup() const;
};
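Taken together, the changes in this patch let a caller attach CSV conversion options to a scan through the ScanContext. A rough usage sketch against the API as of this patch (error handling and the enclosing Result-returning function are elided; `dataset` is assumed to be an existing std::shared_ptr<Dataset> backed by CSV files):

    auto csv_options = std::make_shared<CsvFragmentScanOptions>();
    csv_options->convert_options.null_values = {"MYNULL"};
    csv_options->convert_options.strings_can_be_null = true;

    auto context = std::make_shared<ScanContext>();
    context->fragment_scan_options = csv_options;   // picked up by CsvFileFormat

    ARROW_ASSIGN_OR_RAISE(auto builder, dataset->NewScan(context));
    ARROW_ASSIGN_OR_RAISE(auto scanner, builder->Finish());
    ARROW_ASSIGN_OR_RAISE(auto table, scanner->ToTable());

Formats whose type_name() does not match the supplied options simply fall back to their defaults.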
From 86224290b056d7fa321711c09dba7a9fe1e35b48 Mon Sep 17 00:00:00 2001
From: David Li
Date: Fri, 12 Mar 2021 11:12:38 -0500
Subject: [PATCH 2/4] ARROW-9749: [C++][Python][Dataset][R] Eliminate all
fields from ScanContext
---
cpp/src/arrow/dataset/dataset.cc | 5 +++
cpp/src/arrow/dataset/dataset.h | 1 +
cpp/src/arrow/dataset/file_base.cc | 8 ++--
cpp/src/arrow/dataset/file_csv.cc | 19 ++++------
cpp/src/arrow/dataset/file_csv_test.cc | 2 +-
cpp/src/arrow/dataset/file_ipc.cc | 2 +-
cpp/src/arrow/dataset/file_parquet.cc | 4 +-
cpp/src/arrow/dataset/scanner.cc | 39 ++++++++++++++------
cpp/src/arrow/dataset/scanner.h | 32 +++++++++-------
cpp/src/arrow/dataset/scanner_internal.h | 4 +-
cpp/src/arrow/dataset/scanner_test.cc | 4 +-
python/pyarrow/_dataset.pyx | 26 ++++++-------
python/pyarrow/includes/libarrow_dataset.pxd | 4 +-
r/src/dataset.cpp | 6 +--
14 files changed, 90 insertions(+), 66 deletions(-)
diff --git a/cpp/src/arrow/dataset/dataset.cc b/cpp/src/arrow/dataset/dataset.cc
index 051e446974d..436b891bd74 100644
--- a/cpp/src/arrow/dataset/dataset.cc
+++ b/cpp/src/arrow/dataset/dataset.cc
@@ -97,6 +97,11 @@ Dataset::Dataset(std::shared_ptr<Schema> schema, Expression partition_expression
: schema_(std::move(schema)),
partition_expression_(std::move(partition_expression)) {}
+Result<std::shared_ptr<ScannerBuilder>> Dataset::NewScan(
+    std::shared_ptr<ScanOptions> options) {
+  return std::make_shared<ScannerBuilder>(this->shared_from_this(), options);
+}
+
Result<std::shared_ptr<ScannerBuilder>> Dataset::NewScan(
    std::shared_ptr<ScanContext> context) {
  return std::make_shared<ScannerBuilder>(this->shared_from_this(), context);
diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h
index 30b7449f31a..b5af67e6239 100644
--- a/cpp/src/arrow/dataset/dataset.h
+++ b/cpp/src/arrow/dataset/dataset.h
@@ -131,6 +131,7 @@ class ARROW_DS_EXPORT InMemoryFragment : public Fragment {
class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this<Dataset> {
public:
/// \brief Begin to build a new Scan operation against this Dataset
+  Result<std::shared_ptr<ScannerBuilder>> NewScan(std::shared_ptr<ScanOptions> options);
Result<std::shared_ptr<ScannerBuilder>> NewScan(std::shared_ptr<ScanContext> context);
Result<std::shared_ptr<ScannerBuilder>> NewScan();
diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc
index 3ee23549130..ba6cd901305 100644
--- a/cpp/src/arrow/dataset/file_base.cc
+++ b/cpp/src/arrow/dataset/file_base.cc
@@ -339,7 +339,7 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio
std::shared_ptr<Scanner> scanner) {
RETURN_NOT_OK(ValidateBasenameTemplate(write_options.basename_template));
- auto task_group = scanner->context()->TaskGroup();
+ auto task_group = scanner->options()->TaskGroup();
// Things we'll un-lazy for the sake of simplicity, with the tradeoff they represent:
//
@@ -356,12 +356,12 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio
ARROW_ASSIGN_OR_RAISE(FragmentVector fragments, fragment_it.ToVector());
ScanTaskVector scan_tasks;
- // Avoid contention with multithreaded readers
auto context = std::make_shared<ScanContext>(*scanner->context());
- context->use_threads = false;
for (const auto& fragment : fragments) {
auto options = std::make_shared<ScanOptions>(*scanner->options());
+ // Avoid contention with multithreaded readers
+ options->use_threads = false;
ARROW_ASSIGN_OR_RAISE(auto scan_task_it,
Scanner(fragment, std::move(options), context).Scan());
for (auto maybe_scan_task : scan_task_it) {
@@ -434,7 +434,7 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio
}
RETURN_NOT_OK(task_group->Finish());
- task_group = scanner->context()->TaskGroup();
+ task_group = scanner->options()->TaskGroup();
for (const auto& part_queue : queues) {
task_group->Append([&] { return part_queue.second->writer()->Finish(); });
}
diff --git a/cpp/src/arrow/dataset/file_csv.cc b/cpp/src/arrow/dataset/file_csv.cc
index 7cc4a8375f0..d8d9439eba5 100644
--- a/cpp/src/arrow/dataset/file_csv.cc
+++ b/cpp/src/arrow/dataset/file_csv.cc
@@ -76,17 +76,16 @@ Result<std::unordered_set<std::string>> GetColumnNames(
static inline Result<csv::ConvertOptions> GetConvertOptions(
const CsvFileFormat& format, const std::shared_ptr<ScanOptions>& scan_options,
-    const std::shared_ptr<ScanContext>& scan_context, const Buffer& first_block,
- MemoryPool* pool) {
+ const Buffer& first_block, MemoryPool* pool) {
ARROW_ASSIGN_OR_RAISE(
auto column_names,
GetColumnNames(format.parse_options, util::string_view{first_block}, pool));
auto convert_options = csv::ConvertOptions::Defaults();
- if (scan_context && scan_context->fragment_scan_options &&
- scan_context->fragment_scan_options->type_name() == kCsvTypeName) {
+ if (scan_options && scan_options->fragment_scan_options &&
+ scan_options->fragment_scan_options->type_name() == kCsvTypeName) {
auto csv_scan_options = internal::checked_pointer_cast<CsvFragmentScanOptions>(
- scan_context->fragment_scan_options);
+ scan_options->fragment_scan_options);
convert_options = csv_scan_options->convert_options;
}
@@ -111,7 +110,6 @@ static inline csv::ReadOptions GetReadOptions(const CsvFileFormat& format) {
static inline Result<std::shared_ptr<csv::StreamingReader>> OpenReader(
const FileSource& source, const CsvFileFormat& format,
const std::shared_ptr<ScanOptions>& scan_options = nullptr,
-    const std::shared_ptr<ScanContext>& scan_context = nullptr,
MemoryPool* pool = default_memory_pool()) {
ARROW_ASSIGN_OR_RAISE(auto input, source.Open());
@@ -123,9 +121,8 @@ static inline Result<std::shared_ptr<csv::StreamingReader>> OpenReader(
auto convert_options = csv::ConvertOptions::Defaults();
if (scan_options != nullptr) {
- ARROW_ASSIGN_OR_RAISE(
- convert_options,
- GetConvertOptions(format, scan_options, scan_context, *first_block, pool));
+ ARROW_ASSIGN_OR_RAISE(convert_options,
+ GetConvertOptions(format, scan_options, *first_block, pool));
}
auto maybe_reader =
@@ -150,8 +147,8 @@ class CsvScanTask : public ScanTask {
source_(fragment->source()) {}
Result<RecordBatchIterator> Execute() override {
- ARROW_ASSIGN_OR_RAISE(auto reader, OpenReader(source_, *format_, options(), context(),
- context()->pool));
+ ARROW_ASSIGN_OR_RAISE(auto reader,
+ OpenReader(source_, *format_, options(), options()->pool));
return IteratorFromReader(std::move(reader));
}
diff --git a/cpp/src/arrow/dataset/file_csv_test.cc b/cpp/src/arrow/dataset/file_csv_test.cc
index aa9a2b186d0..19d792e6bef 100644
--- a/cpp/src/arrow/dataset/file_csv_test.cc
+++ b/cpp/src/arrow/dataset/file_csv_test.cc
@@ -92,7 +92,7 @@ bar)");
auto fragment_scan_options = std::make_shared<CsvFragmentScanOptions>();
fragment_scan_options->convert_options.null_values = {"MYNULL"};
fragment_scan_options->convert_options.strings_can_be_null = true;
- ctx_->fragment_scan_options = fragment_scan_options;
+ opts_->fragment_scan_options = fragment_scan_options;
int64_t null_count = 0;
for (auto maybe_batch : Batches(fragment.get())) {
diff --git a/cpp/src/arrow/dataset/file_ipc.cc b/cpp/src/arrow/dataset/file_ipc.cc
index a8a6425b345..441ea0cd4d4 100644
--- a/cpp/src/arrow/dataset/file_ipc.cc
+++ b/cpp/src/arrow/dataset/file_ipc.cc
@@ -108,7 +108,7 @@ class IpcScanTask : public ScanTask {
int i_;
};
- return Impl::Make(source_, options_->MaterializedFields(), context_->pool);
+ return Impl::Make(source_, options_->MaterializedFields(), options_->pool);
}
private:
diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index d642ed36dc0..672f90ca24f 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -291,7 +291,7 @@ Result<std::shared_ptr<Schema>> ParquetFileFormat::Inspect(
Result<std::unique_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReader(
const FileSource& source, ScanOptions* options, ScanContext* context) const {
- MemoryPool* pool = context ? context->pool : default_memory_pool();
+ MemoryPool* pool = options ? options->pool : default_memory_pool();
auto properties = MakeReaderProperties(*this, pool);
ARROW_ASSIGN_OR_RAISE(auto input, source.Open());
@@ -310,7 +310,7 @@ Result<std::unique_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReader
arrow_properties.set_batch_size(options->batch_size);
}
- if (context && !context->use_threads) {
+ if (options && !options->use_threads) {
arrow_properties.set_use_threads(reader_options.enable_parallel_column_conversion);
}
diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc
index 7ce9bdd9a29..3ae0bb51d00 100644
--- a/cpp/src/arrow/dataset/scanner.cc
+++ b/cpp/src/arrow/dataset/scanner.cc
@@ -47,6 +47,16 @@ std::vector<FieldRef> ScanOptions::MaterializedFields() const {
return fields;
}
+using arrow::internal::TaskGroup;
+
+std::shared_ptr<TaskGroup> ScanOptions::TaskGroup() const {
+ if (use_threads) {
+ auto* thread_pool = arrow::internal::GetCpuThreadPool();
+ return TaskGroup::MakeThreaded(thread_pool);
+ }
+ return TaskGroup::MakeSerial();
+}
+
Result<RecordBatchIterator> InMemoryScanTask::Execute() {
return MakeVectorIterator(record_batches_);
}
@@ -82,6 +92,16 @@ Result<ScanTaskIterator> ScanTaskIteratorFromRecordBatch(
return fragment->Scan(std::move(options), std::move(context));
}
+ScannerBuilder::ScannerBuilder(std::shared_ptr<Dataset> dataset,
+                               std::shared_ptr<ScanOptions> scan_options)
+ : dataset_(std::move(dataset)),
+ fragment_(nullptr),
+ scan_options_(std::move(scan_options)),
+      scan_context_(std::make_shared<ScanContext>()) {
+ scan_options_->dataset_schema = dataset_->schema();
+ DCHECK_OK(Filter(literal(true)));
+}
+
ScannerBuilder::ScannerBuilder(std::shared_ptr<Dataset> dataset,
                               std::shared_ptr<ScanContext> scan_context)
: dataset_(std::move(dataset)),
@@ -121,7 +141,7 @@ Status ScannerBuilder::Filter(const Expression& filter) {
}
Status ScannerBuilder::UseThreads(bool use_threads) {
- scan_context_->use_threads = use_threads;
+ scan_options_->use_threads = use_threads;
return Status::OK();
}
@@ -133,6 +153,11 @@ Status ScannerBuilder::BatchSize(int64_t batch_size) {
return Status::OK();
}
+Status ScannerBuilder::Pool(MemoryPool* pool) {
+ scan_options_->pool = pool;
+ return Status::OK();
+}
+
Result<std::shared_ptr<Scanner>> ScannerBuilder::Finish() {
if (!scan_options_->projection.IsBound()) {
RETURN_NOT_OK(Project(scan_options_->dataset_schema->field_names()));
@@ -144,16 +169,6 @@ Result<std::shared_ptr<Scanner>> ScannerBuilder::Finish() {
return std::make_shared<Scanner>(dataset_, scan_options_, scan_context_);
}
-using arrow::internal::TaskGroup;
-
-std::shared_ptr<TaskGroup> ScanContext::TaskGroup() const {
- if (use_threads) {
- auto* thread_pool = arrow::internal::GetCpuThreadPool();
- return TaskGroup::MakeThreaded(thread_pool);
- }
- return TaskGroup::MakeSerial();
-}
-
static inline RecordBatchVector FlattenRecordBatchVector(
std::vector<RecordBatchVector> nested_batches) {
RecordBatchVector flattened;
@@ -183,7 +198,7 @@ struct TableAssemblyState {
Result<std::shared_ptr<Table>> Scanner::ToTable() {
ARROW_ASSIGN_OR_RAISE(auto scan_task_it, Scan());
- auto task_group = scan_context_->TaskGroup();
+ auto task_group = scan_options_->TaskGroup();
/// Wraps the state in a shared_ptr to ensure that failing ScanTasks don't
/// invalidate concurrently running tasks when Finish() early returns
diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h
index 4af84a4a929..b946d45e3ca 100644
--- a/cpp/src/arrow/dataset/scanner.h
+++ b/cpp/src/arrow/dataset/scanner.h
@@ -39,19 +39,7 @@ namespace dataset {
constexpr int64_t kDefaultBatchSize = 1 << 20;
/// \brief Shared state for a Scan operation
-struct ARROW_DS_EXPORT ScanContext {
- /// A pool from which materialized and scanned arrays will be allocated.
- MemoryPool* pool = arrow::default_memory_pool();
-
- /// Indicate if the Scanner should make use of a ThreadPool.
- bool use_threads = false;
-
- /// Fragment-specific scan options.
-  std::shared_ptr<FragmentScanOptions> fragment_scan_options;
-
- /// Return a threaded or serial TaskGroup according to use_threads.
-  std::shared_ptr<internal::TaskGroup> TaskGroup() const;
-};
+struct ARROW_DS_EXPORT ScanContext {};
struct ARROW_DS_EXPORT ScanOptions {
// Filter and projection
@@ -78,6 +66,15 @@ struct ARROW_DS_EXPORT ScanOptions {
// Maximum row count for scanned batches.
int64_t batch_size = kDefaultBatchSize;
+ /// A pool from which materialized and scanned arrays will be allocated.
+ MemoryPool* pool = arrow::default_memory_pool();
+
+ /// Indicate if the Scanner should make use of a ThreadPool.
+ bool use_threads = false;
+
+ /// Fragment-specific scan options.
+  std::shared_ptr<FragmentScanOptions> fragment_scan_options;
+
// Return a vector of fields that requires materialization.
//
// This is usually the union of the fields referenced in the projection and the
@@ -93,6 +90,9 @@ struct ARROW_DS_EXPORT ScanOptions {
// This is used by Fragment implementations to apply the column
// sub-selection optimization.
std::vector<FieldRef> MaterializedFields() const;
+
+ /// Return a threaded or serial TaskGroup according to use_threads.
+  std::shared_ptr<internal::TaskGroup> TaskGroup() const;
};
/// \brief Read record batches from a range of a single data fragment. A
@@ -200,6 +200,9 @@ class ARROW_DS_EXPORT Scanner {
/// columns to materialize.
class ARROW_DS_EXPORT ScannerBuilder {
public:
+  ScannerBuilder(std::shared_ptr<Dataset> dataset,
+                 std::shared_ptr<ScanOptions> scan_options);
+
ScannerBuilder(std::shared_ptr<Dataset> dataset,
               std::shared_ptr<ScanContext> scan_context);
@@ -253,6 +256,9 @@ class ARROW_DS_EXPORT ScannerBuilder {
/// This option provides a control limiting the memory owned by any RecordBatch.
Status BatchSize(int64_t batch_size);
+ /// \brief Set the pool from which materialized and scanned arrays will be allocated.
+ Status Pool(MemoryPool* pool);
+
/// \brief Return the constructed now-immutable Scanner object
Result<std::shared_ptr<Scanner>> Finish();
diff --git a/cpp/src/arrow/dataset/scanner_internal.h b/cpp/src/arrow/dataset/scanner_internal.h
index 25c53fc8146..782af187aac 100644
--- a/cpp/src/arrow/dataset/scanner_internal.h
+++ b/cpp/src/arrow/dataset/scanner_internal.h
@@ -100,10 +100,10 @@ class FilterAndProjectScanTask : public ScanTask {
SimplifyWithGuarantee(options()->projection, partition_));
RecordBatchIterator filter_it =
- FilterRecordBatch(std::move(it), simplified_filter, context_->pool);
+ FilterRecordBatch(std::move(it), simplified_filter, options_->pool);
return ProjectRecordBatch(std::move(filter_it), simplified_projection,
- context_->pool);
+ options_->pool);
}
private:
diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc
index 0ceb5bc4434..a8ed6c3b2b0 100644
--- a/cpp/src/arrow/dataset/scanner_test.cc
+++ b/cpp/src/arrow/dataset/scanner_test.cc
@@ -140,13 +140,13 @@ TEST_F(TestScanner, ToTable) {
auto scanner = MakeScanner(batch);
std::shared_ptr actual;
- ctx_->use_threads = false;
+ options_->use_threads = false;
ASSERT_OK_AND_ASSIGN(actual, scanner.ToTable());
AssertTablesEqual(*expected, *actual);
// There is no guarantee on the ordering when using multiple threads, but
// since the RecordBatch is always the same it will pass.
- ctx_->use_threads = true;
+ options_->use_threads = true;
ASSERT_OK_AND_ASSIGN(actual, scanner.ToTable());
AssertTablesEqual(*expected, *actual);
}
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 1c4e5d302c5..d1afb228750 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -2163,16 +2163,11 @@ cdef class ScanTask(_Weakrefable):
yield pyarrow_wrap_batch(record_batch)
-cdef shared_ptr[CScanContext] _build_scan_context(bint use_threads=True,
- MemoryPool memory_pool=None):
+cdef shared_ptr[CScanContext] _build_scan_context():
cdef:
shared_ptr[CScanContext] context
context = make_shared[CScanContext]()
- context.get().pool = maybe_unbox_memory_pool(memory_pool)
- if use_threads is not None:
- context.get().use_threads = use_threads
-
return context
@@ -2181,7 +2176,9 @@ _DEFAULT_BATCH_SIZE = 2**20
cdef void _populate_builder(const shared_ptr[CScannerBuilder]& ptr,
list columns=None, Expression filter=None,
- int batch_size=_DEFAULT_BATCH_SIZE) except *:
+ int batch_size=_DEFAULT_BATCH_SIZE,
+ bint use_threads=True,
+ MemoryPool memory_pool=None) except *:
cdef:
CScannerBuilder *builder
builder = ptr.get()
@@ -2193,6 +2190,9 @@ cdef void _populate_builder(const shared_ptr[CScannerBuilder]& ptr,
check_status(builder.Project([tobytes(c) for c in columns]))
check_status(builder.BatchSize(batch_size))
+ check_status(builder.UseThreads(use_threads))
+ if memory_pool:
+ check_status(builder.Pool(maybe_unbox_memory_pool(memory_pool)))
cdef class Scanner(_Weakrefable):
@@ -2261,11 +2261,11 @@ cdef class Scanner(_Weakrefable):
shared_ptr[CScannerBuilder] builder
shared_ptr[CScanner] scanner
- context = _build_scan_context(use_threads=use_threads,
- memory_pool=memory_pool)
+ context = _build_scan_context()
builder = make_shared[CScannerBuilder](dataset.unwrap(), context)
_populate_builder(builder, columns=columns, filter=filter,
- batch_size=batch_size)
+ batch_size=batch_size, use_threads=use_threads,
+ memory_pool=memory_pool)
scanner = GetResultValue(builder.get().Finish())
return Scanner.wrap(scanner)
@@ -2280,15 +2280,15 @@ cdef class Scanner(_Weakrefable):
shared_ptr[CScannerBuilder] builder
shared_ptr[CScanner] scanner
- context = _build_scan_context(use_threads=use_threads,
- memory_pool=memory_pool)
+ context = _build_scan_context()
schema = schema or fragment.physical_schema
builder = make_shared[CScannerBuilder](pyarrow_unwrap_schema(schema),
fragment.unwrap(), context)
_populate_builder(builder, columns=columns, filter=filter,
- batch_size=batch_size)
+ batch_size=batch_size, use_threads=use_threads,
+ memory_pool=memory_pool)
scanner = GetResultValue(builder.get().Finish())
return Scanner.wrap(scanner)
diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd
index bbe545cf794..76504c32435 100644
--- a/python/pyarrow/includes/libarrow_dataset.pxd
+++ b/python/pyarrow/includes/libarrow_dataset.pxd
@@ -60,8 +60,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
shared_ptr[CScanOptions] Make(shared_ptr[CSchema] schema)
cdef cppclass CScanContext "arrow::dataset::ScanContext":
- c_bool use_threads
- CMemoryPool * pool
+ pass
ctypedef CIterator[shared_ptr[CScanTask]] CScanTaskIterator \
"arrow::dataset::ScanTaskIterator"
@@ -106,6 +105,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
CStatus Project(const vector[c_string]& columns)
CStatus Filter(CExpression filter)
CStatus UseThreads(c_bool use_threads)
+ CStatus Pool(CMemoryPool* pool)
CStatus BatchSize(int64_t batch_size)
CResult[shared_ptr[CScanner]] Finish()
shared_ptr[CSchema] schema() const
diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp
index 001cd9da0f6..83c7cbb844c 100644
--- a/r/src/dataset.cpp
+++ b/r/src/dataset.cpp
@@ -66,9 +66,9 @@ const char* r6_class_name::get(
// [[dataset::export]]
std::shared_ptr<ds::ScannerBuilder> dataset___Dataset__NewScan(
    const std::shared_ptr<ds::Dataset>& ds) {
-  auto context = std::make_shared<ds::ScanContext>();
-  context->pool = gc_memory_pool();
-  return ValueOrStop(ds->NewScan(std::move(context)));
+  auto options = std::make_shared<ds::ScanOptions>();
+ options->pool = gc_memory_pool();
+ return ValueOrStop(ds->NewScan(std::move(options)));
}
// [[dataset::export]]
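After this patch the same scan is configured entirely through ScanOptions, and ScanContext is left as an empty placeholder (removed outright in the next patch). A rough sketch of the updated call site, under the same assumptions as the sketch above:

    auto csv_options = std::make_shared<CsvFragmentScanOptions>();
    csv_options->convert_options.null_values = {"MYNULL"};

    auto options = std::make_shared<ScanOptions>();
    options->use_threads = true;                    // formerly ScanContext::use_threads
    options->pool = arrow::default_memory_pool();   // formerly ScanContext::pool
    options->fragment_scan_options = csv_options;   // moved over from ScanContext

    ARROW_ASSIGN_OR_RAISE(auto builder, dataset->NewScan(options));
    ARROW_ASSIGN_OR_RAISE(auto scanner, builder->Finish());
    ARROW_ASSIGN_OR_RAISE(auto table, scanner->ToTable());

The bindings follow the same shape: the Python ScannerBuilder now forwards use_threads and memory_pool via UseThreads() and Pool(), and the R binding sets pool directly on ScanOptions.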
From 149cb232613e8095791560f86b00f5f142b4567b Mon Sep 17 00:00:00 2001
From: David Li
Date: Fri, 12 Mar 2021 14:00:30 -0500
Subject: [PATCH 3/4] ARROW-9749: [C++][Python][Dataset] Remove ScanContext
---
cpp/src/arrow/dataset/dataset.cc | 14 ++----
cpp/src/arrow/dataset/dataset.h | 7 +--
cpp/src/arrow/dataset/file_base.cc | 9 ++--
cpp/src/arrow/dataset/file_base.h | 5 +--
cpp/src/arrow/dataset/file_csv.cc | 8 ++--
cpp/src/arrow/dataset/file_csv.h | 2 +-
cpp/src/arrow/dataset/file_csv_test.cc | 8 ++--
cpp/src/arrow/dataset/file_ipc.cc | 22 +++-------
cpp/src/arrow/dataset/file_ipc.h | 2 +-
cpp/src/arrow/dataset/file_ipc_test.cc | 5 +--
cpp/src/arrow/dataset/file_parquet.cc | 12 +++--
cpp/src/arrow/dataset/file_parquet.h | 6 +--
cpp/src/arrow/dataset/file_parquet_test.cc | 9 ++--
cpp/src/arrow/dataset/scanner.cc | 31 +++++--------
cpp/src/arrow/dataset/scanner.h | 46 ++++++--------------
cpp/src/arrow/dataset/scanner_internal.h | 10 ++---
cpp/src/arrow/dataset/scanner_test.cc | 10 ++---
cpp/src/arrow/dataset/test_util.h | 15 +++----
cpp/src/arrow/dataset/type_fwd.h | 1 -
cpp/src/jni/dataset/jni_wrapper.cc | 6 +--
python/pyarrow/_dataset.pyx | 19 ++------
python/pyarrow/includes/libarrow_dataset.pxd | 18 +++-----
22 files changed, 91 insertions(+), 174 deletions(-)
diff --git a/cpp/src/arrow/dataset/dataset.cc b/cpp/src/arrow/dataset/dataset.cc
index 436b891bd74..df155784924 100644
--- a/cpp/src/arrow/dataset/dataset.cc
+++ b/cpp/src/arrow/dataset/dataset.cc
@@ -69,8 +69,7 @@ InMemoryFragment::InMemoryFragment(RecordBatchVector record_batches,
: InMemoryFragment(record_batches.empty() ? schema({}) : record_batches[0]->schema(),
std::move(record_batches), std::move(partition_expression)) {}
-Result<ScanTaskIterator> InMemoryFragment::Scan(std::shared_ptr<ScanOptions> options,
-                                                std::shared_ptr<ScanContext> context) {
+Result<ScanTaskIterator> InMemoryFragment::Scan(std::shared_ptr<ScanOptions> options) {
// Make an explicit copy of record_batches_ to ensure Scan can be called
// multiple times.
auto batches_it = MakeVectorIterator(record_batches_);
@@ -86,8 +85,8 @@ Result<ScanTaskIterator> InMemoryFragment::Scan(std::shared_ptr<ScanOptions> opt
batches.push_back(batch->Slice(batch_size * i, batch_size));
}
-  return ::arrow::internal::make_unique<InMemoryScanTask>(
-      std::move(batches), std::move(options), std::move(context), self);
+  return ::arrow::internal::make_unique<InMemoryScanTask>(std::move(batches),
+ std::move(options), self);
};
return MakeMapIterator(fn, std::move(batches_it));
@@ -102,13 +101,8 @@ Result<std::shared_ptr<ScannerBuilder>> Dataset::NewScan(
return std::make_shared<ScannerBuilder>(this->shared_from_this(), options);
}
-Result<std::shared_ptr<ScannerBuilder>> Dataset::NewScan(
-    std::shared_ptr<ScanContext> context) {
-  return std::make_shared<ScannerBuilder>(this->shared_from_this(), context);
-}
-
Result<std::shared_ptr<ScannerBuilder>> Dataset::NewScan() {
-  return NewScan(std::make_shared<ScanContext>());
+  return NewScan(std::make_shared<ScanOptions>());
}
Result<FragmentIterator> Dataset::GetFragments() {
diff --git a/cpp/src/arrow/dataset/dataset.h b/cpp/src/arrow/dataset/dataset.h
index b5af67e6239..a28b79840d6 100644
--- a/cpp/src/arrow/dataset/dataset.h
+++ b/cpp/src/arrow/dataset/dataset.h
@@ -62,8 +62,7 @@ class ARROW_DS_EXPORT Fragment : public std::enable_shared_from_this<Fragment> {
/// columns may be absent if they were not present in this fragment.
///
/// To receive a record batch stream which is fully filtered and projected, use Scanner.
-  virtual Result<ScanTaskIterator> Scan(std::shared_ptr<ScanOptions> options,
-                                        std::shared_ptr<ScanContext> context) = 0;
+  virtual Result<ScanTaskIterator> Scan(std::shared_ptr<ScanOptions> options) = 0;
/// \brief Return true if the fragment can benefit from parallel scanning.
virtual bool splittable() const = 0;
@@ -110,8 +109,7 @@ class ARROW_DS_EXPORT InMemoryFragment : public Fragment {
Expression = literal(true));
explicit InMemoryFragment(RecordBatchVector record_batches, Expression = literal(true));
-  Result<ScanTaskIterator> Scan(std::shared_ptr<ScanOptions> options,
-                                std::shared_ptr<ScanContext> context) override;
+  Result<ScanTaskIterator> Scan(std::shared_ptr<ScanOptions> options) override;
bool splittable() const override { return false; }
@@ -132,7 +130,6 @@ class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this<Dataset> {
public:
/// \brief Begin to build a new Scan operation against this Dataset
Result<std::shared_ptr<ScannerBuilder>> NewScan(std::shared_ptr<ScanOptions> options);
-  Result<std::shared_ptr<ScannerBuilder>> NewScan(std::shared_ptr<ScanContext> context);
Result<std::shared_ptr<ScannerBuilder>> NewScan();
/// \brief GetFragments returns an iterator of Fragments given a predicate.
diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc
index ba6cd901305..eff3f5231d8 100644
--- a/cpp/src/arrow/dataset/file_base.cc
+++ b/cpp/src/arrow/dataset/file_base.cc
@@ -78,10 +78,9 @@ Result<std::shared_ptr<Schema>> FileFragment::ReadPhysicalSchemaImpl() {
return format_->Inspect(source_);
}
-Result<ScanTaskIterator> FileFragment::Scan(std::shared_ptr<ScanOptions> options,
-                                            std::shared_ptr<ScanContext> context) {
+Result<ScanTaskIterator> FileFragment::Scan(std::shared_ptr<ScanOptions> options) {
auto self = std::dynamic_pointer_cast<FileFragment>(shared_from_this());
- return format_->ScanFile(std::move(options), std::move(context), self);
+ return format_->ScanFile(std::move(options), self);
}
struct FileSystemDataset::FragmentSubtrees {
@@ -356,14 +355,12 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio
ARROW_ASSIGN_OR_RAISE(FragmentVector fragments, fragment_it.ToVector());
ScanTaskVector scan_tasks;
-  auto context = std::make_shared<ScanContext>(*scanner->context());
-
for (const auto& fragment : fragments) {
auto options = std::make_shared<ScanOptions>(*scanner->options());
// Avoid contention with multithreaded readers
options->use_threads = false;
ARROW_ASSIGN_OR_RAISE(auto scan_task_it,
- Scanner(fragment, std::move(options), context).Scan());
+ Scanner(fragment, std::move(options)).Scan());
for (auto maybe_scan_task : scan_task_it) {
ARROW_ASSIGN_OR_RAISE(auto scan_task, maybe_scan_task);
scan_tasks.push_back(std::move(scan_task));
diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h
index 0d1dd9cd0bf..4b19f3083eb 100644
--- a/cpp/src/arrow/dataset/file_base.h
+++ b/cpp/src/arrow/dataset/file_base.h
@@ -137,7 +137,7 @@ class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_this<FileFormat> {
  virtual Result<ScanTaskIterator> ScanFile(
-      std::shared_ptr<ScanOptions> options, std::shared_ptr<ScanContext> context,
+      std::shared_ptr<ScanOptions> options,
      const std::shared_ptr<FileFragment>& file) const = 0;
/// \brief Open a fragment
@@ -161,8 +161,7 @@ class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_this<FileFormat> {
-  Result<ScanTaskIterator> Scan(std::shared_ptr<ScanOptions> options,
-                                std::shared_ptr<ScanContext> context) override;
+  Result<ScanTaskIterator> Scan(std::shared_ptr<ScanOptions> options) override;
std::string type_name() const override { return format_->type_name(); }
std::string ToString() const override { return source_.path(); };
diff --git a/cpp/src/arrow/dataset/file_csv.cc b/cpp/src/arrow/dataset/file_csv.cc
index d8d9439eba5..87d97ea7d40 100644
--- a/cpp/src/arrow/dataset/file_csv.cc
+++ b/cpp/src/arrow/dataset/file_csv.cc
@@ -140,9 +140,9 @@ static inline Result<std::shared_ptr<csv::StreamingReader>> OpenReader(
class CsvScanTask : public ScanTask {
public:
CsvScanTask(std::shared_ptr<const CsvFileFormat> format,
-              std::shared_ptr<ScanOptions> options, std::shared_ptr<ScanContext> context,
+              std::shared_ptr<ScanOptions> options,
              std::shared_ptr<FileFragment> fragment)
- : ScanTask(std::move(options), std::move(context), fragment),
+ : ScanTask(std::move(options), fragment),
format_(std::move(format)),
source_(fragment->source()) {}
@@ -184,11 +184,11 @@ Result<std::shared_ptr<Schema>> CsvFileFormat::Inspect(const FileSource& source)
}
Result<ScanTaskIterator> CsvFileFormat::ScanFile(
-    std::shared_ptr<ScanOptions> options, std::shared_ptr<ScanContext> context,
+    std::shared_ptr<ScanOptions> options,
    const std::shared_ptr<FileFragment>& fragment) const {
  auto this_ = checked_pointer_cast<const CsvFileFormat>(shared_from_this());
  auto task = std::make_shared<CsvScanTask>(std::move(this_), std::move(options),
-                                            std::move(context), std::move(fragment));
+                                            std::move(fragment));
  return MakeVectorIterator<std::shared_ptr<ScanTask>>({std::move(task)});
}
diff --git a/cpp/src/arrow/dataset/file_csv.h b/cpp/src/arrow/dataset/file_csv.h
index b7a9388a324..e93ae1fa6ad 100644
--- a/cpp/src/arrow/dataset/file_csv.h
+++ b/cpp/src/arrow/dataset/file_csv.h
@@ -49,7 +49,7 @@ class ARROW_DS_EXPORT CsvFileFormat : public FileFormat {
/// \brief Open a file for scanning
Result<ScanTaskIterator> ScanFile(
-      std::shared_ptr<ScanOptions> options, std::shared_ptr<ScanContext> context,
+      std::shared_ptr<ScanOptions> options,
      const std::shared_ptr<FileFragment>& fragment) const override;
Result<std::shared_ptr<FileWriter>> MakeWriter(
diff --git a/cpp/src/arrow/dataset/file_csv_test.cc b/cpp/src/arrow/dataset/file_csv_test.cc
index 19d792e6bef..7399ae71ec0 100644
--- a/cpp/src/arrow/dataset/file_csv_test.cc
+++ b/cpp/src/arrow/dataset/file_csv_test.cc
@@ -47,19 +47,17 @@ class TestCsvFileFormat : public testing::Test {
}
RecordBatchIterator Batches(Fragment* fragment) {
- EXPECT_OK_AND_ASSIGN(auto scan_task_it, fragment->Scan(opts_, ctx_));
+ EXPECT_OK_AND_ASSIGN(auto scan_task_it, fragment->Scan(opts_));
return Batches(std::move(scan_task_it));
}
void SetSchema(std::vector<std::shared_ptr<Field>> fields) {
-    opts_ = std::make_shared<ScanOptions>();
    opts_->dataset_schema = schema(std::move(fields));
    ASSERT_OK(SetProjection(opts_.get(), opts_->dataset_schema->field_names()));
  }
std::shared_ptr<CsvFileFormat> format_ = std::make_shared<CsvFileFormat>();
-  std::shared_ptr<ScanOptions> opts_;
-  std::shared_ptr<ScanContext> ctx_ = std::make_shared<ScanContext>();
+  std::shared_ptr<ScanOptions> opts_ = std::make_shared<ScanOptions>();
};
TEST_F(TestCsvFileFormat, ScanRecordBatchReader) {
@@ -188,7 +186,7 @@ N/A,bar
auto dataset_schema =
schema({field("betrayal_not_really_f64", not_float64), field("str", utf8())});
- ScannerBuilder builder(dataset_schema, fragment, ctx_);
+ ScannerBuilder builder(dataset_schema, fragment, opts_);
// This filter is valid with declared schema, but would *not* be valid
// if betrayal_not_really_f64 were read as double rather than string.
diff --git a/cpp/src/arrow/dataset/file_ipc.cc b/cpp/src/arrow/dataset/file_ipc.cc
index 441ea0cd4d4..a81e8b74e86 100644
--- a/cpp/src/arrow/dataset/file_ipc.cc
+++ b/cpp/src/arrow/dataset/file_ipc.cc
@@ -76,9 +76,8 @@ static inline Result<std::vector<int>> GetIncludedFields(
class IpcScanTask : public ScanTask {
public:
IpcScanTask(std::shared_ptr<FileFragment> fragment,
-              std::shared_ptr<ScanOptions> options, std::shared_ptr<ScanContext> context)
-      : ScanTask(std::move(options), std::move(context), fragment),
-        source_(fragment->source()) {}
+              std::shared_ptr<ScanOptions> options)
+ : ScanTask(std::move(options), fragment), source_(fragment->source()) {}
Result<RecordBatchIterator> Execute() override {
struct Impl {
@@ -118,10 +117,8 @@ class IpcScanTask : public ScanTask {
class IpcScanTaskIterator {
public:
static Result<ScanTaskIterator> Make(std::shared_ptr<ScanOptions> options,
-                                      std::shared_ptr<ScanContext> context,
                                      std::shared_ptr<FileFragment> fragment) {
- return ScanTaskIterator(
- IpcScanTaskIterator(std::move(options), std::move(context), std::move(fragment)));
+ return ScanTaskIterator(IpcScanTaskIterator(std::move(options), std::move(fragment)));
}
Result<std::shared_ptr<ScanTask>> Next() {
@@ -131,20 +128,16 @@ class IpcScanTaskIterator {
}
once_ = true;
-    return std::shared_ptr<ScanTask>(new IpcScanTask(fragment_, options_, context_));
+    return std::shared_ptr<ScanTask>(new IpcScanTask(fragment_, options_));
}
private:
IpcScanTaskIterator(std::shared_ptr<ScanOptions> options,
-                      std::shared_ptr<ScanContext> context,
                      std::shared_ptr<FileFragment> fragment)
- : options_(std::move(options)),
- context_(std::move(context)),
- fragment_(std::move(fragment)) {}
+ : options_(std::move(options)), fragment_(std::move(fragment)) {}
bool once_ = false;
std::shared_ptr<ScanOptions> options_;
-  std::shared_ptr<ScanContext> context_;
std::shared_ptr<FileFragment> fragment_;
};
@@ -159,10 +152,9 @@ Result<std::shared_ptr<Schema>> IpcFileFormat::Inspect(const FileSource& source)
}
Result<ScanTaskIterator> IpcFileFormat::ScanFile(
-    std::shared_ptr<ScanOptions> options, std::shared_ptr<ScanContext> context,
+    std::shared_ptr<ScanOptions> options,
    const std::shared_ptr<FileFragment>& fragment) const {
- return IpcScanTaskIterator::Make(std::move(options), std::move(context),
- std::move(fragment));
+ return IpcScanTaskIterator::Make(std::move(options), std::move(fragment));
}
//
diff --git a/cpp/src/arrow/dataset/file_ipc.h b/cpp/src/arrow/dataset/file_ipc.h
index c0e311ae3e7..cbfb6b858cd 100644
--- a/cpp/src/arrow/dataset/file_ipc.h
+++ b/cpp/src/arrow/dataset/file_ipc.h
@@ -49,7 +49,7 @@ class ARROW_DS_EXPORT IpcFileFormat : public FileFormat {
/// \brief Open a file for scanning
Result<ScanTaskIterator> ScanFile(
-      std::shared_ptr<ScanOptions> options, std::shared_ptr<ScanContext> context,
+      std::shared_ptr<ScanOptions> options,
      const std::shared_ptr<FileFragment>& fragment) const override;
Result<std::shared_ptr<FileWriter>> MakeWriter(
diff --git a/cpp/src/arrow/dataset/file_ipc_test.cc b/cpp/src/arrow/dataset/file_ipc_test.cc
index e5347fa0cda..8a5fd024575 100644
--- a/cpp/src/arrow/dataset/file_ipc_test.cc
+++ b/cpp/src/arrow/dataset/file_ipc_test.cc
@@ -102,7 +102,7 @@ class TestIpcFileFormat : public ArrowIpcWriterMixin {
}
RecordBatchIterator Batches(Fragment* fragment) {
- EXPECT_OK_AND_ASSIGN(auto scan_task_it, fragment->Scan(opts_, ctx_));
+ EXPECT_OK_AND_ASSIGN(auto scan_task_it, fragment->Scan(opts_));
return Batches(std::move(scan_task_it));
}
@@ -115,7 +115,6 @@ class TestIpcFileFormat : public ArrowIpcWriterMixin {
protected:
std::shared_ptr<IpcFileFormat> format_ = std::make_shared<IpcFileFormat>();
std::shared_ptr<ScanOptions> opts_;
-  std::shared_ptr<ScanContext> ctx_ = std::make_shared<ScanContext>();
};
TEST_F(TestIpcFileFormat, ScanRecordBatchReader) {
@@ -238,7 +237,7 @@ TEST_F(TestIpcFileSystemDataset, WriteExceedsMaxPartitions) {
// require that no batch be grouped into more than 2 written batches:
write_options_.max_partitions = 2;
-  auto scanner = std::make_shared<Scanner>(dataset_, scan_options_, scan_context_);
+  auto scanner = std::make_shared<Scanner>(dataset_, scan_options_);
EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("This exceeds the maximum"),
FileSystemDataset::Write(write_options_, scanner));
}
diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index 672f90ca24f..d255787d55f 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -60,9 +60,8 @@ class ParquetScanTask : public ScanTask {
std::vector<int> pre_buffer_row_groups, arrow::io::IOContext io_context,
arrow::io::CacheOptions cache_options,
std::shared_ptr<ScanOptions> options,
-          std::shared_ptr<ScanContext> context,
          std::shared_ptr<Fragment> fragment)
- : ScanTask(std::move(options), std::move(context), std::move(fragment)),
+ : ScanTask(std::move(options), std::move(fragment)),
row_group_(row_group),
column_projection_(std::move(column_projection)),
reader_(std::move(reader)),
@@ -290,7 +289,7 @@ Result<std::shared_ptr<Schema>> ParquetFileFormat::Inspect(
}
Result<std::unique_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReader(
- const FileSource& source, ScanOptions* options, ScanContext* context) const {
+ const FileSource& source, ScanOptions* options) const {
MemoryPool* pool = options ? options->pool : default_memory_pool();
auto properties = MakeReaderProperties(*this, pool);
@@ -321,7 +320,7 @@ Result<std::unique_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReader
}
Result<ScanTaskIterator> ParquetFileFormat::ScanFile(
-    std::shared_ptr<ScanOptions> options, std::shared_ptr<ScanContext> context,
+    std::shared_ptr<ScanOptions> options,
    const std::shared_ptr<FileFragment>& fragment) const {
  auto* parquet_fragment = checked_cast<ParquetFileFragment*>(fragment.get());
  std::vector<int> row_groups;
@@ -342,7 +341,7 @@ Result<ScanTaskIterator> ParquetFileFormat::ScanFile(
// Open the reader and pay the real IO cost.
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<parquet::arrow::FileReader> reader,
- GetReader(fragment->source(), options.get(), context.get()));
+ GetReader(fragment->source(), options.get()));
// Ensure that parquet_fragment has FileMetaData
RETURN_NOT_OK(parquet_fragment->EnsureCompleteMetadata(reader.get()));
@@ -365,8 +364,7 @@ Result<ScanTaskIterator> ParquetFileFormat::ScanFile(
for (size_t i = 0; i < row_groups.size(); ++i) {
tasks[i] = std::make_shared<ParquetScanTask>(
row_groups[i], column_projection, reader, pre_buffer_once, row_groups,
- reader_options.io_context, reader_options.cache_options, options, context,
- fragment);
+ reader_options.io_context, reader_options.cache_options, options, fragment);
}
return MakeVectorIterator(std::move(tasks));
diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h
index e72984ebdb5..869857e4d34 100644
--- a/cpp/src/arrow/dataset/file_parquet.h
+++ b/cpp/src/arrow/dataset/file_parquet.h
@@ -77,7 +77,7 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat {
/// members of parquet::ReaderProperties.
///
/// We don't embed parquet::ReaderProperties directly because we get memory_pool from
- /// ScanContext at scan time and provide differing defaults.
+ /// ScanOptions at scan time and provide differing defaults.
///
/// @{
bool use_buffered_stream = false;
@@ -114,7 +114,7 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat {
/// \brief Open a file for scanning
Result<ScanTaskIterator> ScanFile(
-      std::shared_ptr<ScanOptions> options, std::shared_ptr<ScanContext> context,
+      std::shared_ptr<ScanOptions> options,
      const std::shared_ptr<FileFragment>& file) const override;
using FileFormat::MakeFragment;
@@ -131,7 +131,7 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat {
/// \brief Return a FileReader on the given source.
Result<std::unique_ptr<parquet::arrow::FileReader>> GetReader(
- const FileSource& source, ScanOptions* = NULLPTR, ScanContext* = NULLPTR) const;
+ const FileSource& source, ScanOptions* = NULLPTR) const;
Result<std::shared_ptr<FileWriter>> MakeWriter(
    std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc
index 977af6be81a..cf14a2b7caf 100644
--- a/cpp/src/arrow/dataset/file_parquet_test.cc
+++ b/cpp/src/arrow/dataset/file_parquet_test.cc
@@ -157,7 +157,7 @@ class TestParquetFileFormat : public ArrowParquetWriterMixin {
}
RecordBatchIterator Batches(Fragment* fragment) {
- EXPECT_OK_AND_ASSIGN(auto scan_task_it, fragment->Scan(opts_, ctx_));
+ EXPECT_OK_AND_ASSIGN(auto scan_task_it, fragment->Scan(opts_));
return Batches(std::move(scan_task_it));
}
@@ -217,7 +217,6 @@ class TestParquetFileFormat : public ArrowParquetWriterMixin {
protected:
std::shared_ptr<ParquetFileFormat> format_ = std::make_shared<ParquetFileFormat>();
std::shared_ptr<ScanOptions> opts_;
-  std::shared_ptr<ScanContext> ctx_ = std::make_shared<ScanContext>();
};
TEST_F(TestParquetFileFormat, ScanRecordBatchReader) {
@@ -248,7 +247,7 @@ TEST_F(TestParquetFileFormat, ScanRecordBatchReaderDictEncoded) {
format_->reader_options.dict_columns = {"utf8"};
ASSERT_OK_AND_ASSIGN(auto fragment, format_->MakeFragment(*source));
- ASSERT_OK_AND_ASSIGN(auto scan_task_it, fragment->Scan(opts_, ctx_));
+ ASSERT_OK_AND_ASSIGN(auto scan_task_it, fragment->Scan(opts_));
int64_t row_count = 0;
Schema expected_schema({field("utf8", dictionary(int32(), utf8()))});
@@ -275,7 +274,7 @@ TEST_F(TestParquetFileFormat, ScanRecordBatchReaderPreBuffer) {
format_->reader_options.pre_buffer = true;
ASSERT_OK_AND_ASSIGN(auto fragment, format_->MakeFragment(*source));
- ASSERT_OK_AND_ASSIGN(auto scan_task_it, fragment->Scan(opts_, ctx_));
+ ASSERT_OK_AND_ASSIGN(auto scan_task_it, fragment->Scan(opts_));
int64_t task_count = 0;
int64_t row_count = 0;
@@ -594,7 +593,7 @@ TEST_F(TestParquetFileFormat, ExplicitRowGroupSelection) {
EXPECT_RAISES_WITH_MESSAGE_THAT(
IndexError,
testing::HasSubstr("only has " + std::to_string(kNumRowGroups) + " row groups"),
- row_groups_fragment({kNumRowGroups + 1})->Scan(opts_, ctx_));
+ row_groups_fragment({kNumRowGroups + 1})->Scan(opts_));
}
TEST_F(TestParquetFileFormat, WriteRecordBatchReader) {
diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc
index 3ae0bb51d00..1aca9fa4882 100644
--- a/cpp/src/arrow/dataset/scanner.cc
+++ b/cpp/src/arrow/dataset/scanner.cc
@@ -77,48 +77,39 @@ Result<ScanTaskIterator> Scanner::Scan() {
// Iterator. The first Iterator::Next invocation is going to do
// all the work of unwinding the chained iterators.
ARROW_ASSIGN_OR_RAISE(auto fragment_it, GetFragments());
- return GetScanTaskIterator(std::move(fragment_it), scan_options_, scan_context_);
+ return GetScanTaskIterator(std::move(fragment_it), scan_options_);
}
Result<ScanTaskIterator> ScanTaskIteratorFromRecordBatch(
    std::vector<std::shared_ptr<RecordBatch>> batches,
-    std::shared_ptr<ScanOptions> options, std::shared_ptr<ScanContext> context) {
+    std::shared_ptr<ScanOptions> options) {
if (batches.empty()) {
return MakeVectorIterator(ScanTaskVector());
}
auto schema = batches[0]->schema();
auto fragment =
std::make_shared<InMemoryFragment>(std::move(schema), std::move(batches));
- return fragment->Scan(std::move(options), std::move(context));
+ return fragment->Scan(std::move(options));
}
-ScannerBuilder::ScannerBuilder(std::shared_ptr<Dataset> dataset,
-                               std::shared_ptr<ScanOptions> scan_options)
- : dataset_(std::move(dataset)),
- fragment_(nullptr),
- scan_options_(std::move(scan_options)),
-      scan_context_(std::make_shared<ScanContext>()) {
- scan_options_->dataset_schema = dataset_->schema();
- DCHECK_OK(Filter(literal(true)));
-}
+ScannerBuilder::ScannerBuilder(std::shared_ptr<Dataset> dataset)
+    : ScannerBuilder(std::move(dataset), std::make_shared<ScanOptions>()) {}
ScannerBuilder::ScannerBuilder(std::shared_ptr<Dataset> dataset,
-                               std::shared_ptr<ScanContext> scan_context)
+                               std::shared_ptr<ScanOptions> scan_options)
: dataset_(std::move(dataset)),
fragment_(nullptr),
-      scan_options_(std::make_shared<ScanOptions>()),
- scan_context_(std::move(scan_context)) {
+ scan_options_(std::move(scan_options)) {
scan_options_->dataset_schema = dataset_->schema();
DCHECK_OK(Filter(literal(true)));
}
ScannerBuilder::ScannerBuilder(std::shared_ptr<Schema> schema,
std::shared_ptr