From 754e559a04f09ddbc588d1514924d87a72d4a33f Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 28 Sep 2020 13:07:24 -0400 Subject: [PATCH 01/34] ARROW-9782: [C++][Dataset] More configurable Dataset writing --- cpp/src/arrow/dataset/file_base.cc | 239 +++++++++++++------ cpp/src/arrow/dataset/file_base.h | 85 +++++-- cpp/src/arrow/dataset/file_csv.cc | 16 ++ cpp/src/arrow/dataset/file_csv.h | 8 +- cpp/src/arrow/dataset/file_ipc.cc | 46 +++- cpp/src/arrow/dataset/file_ipc.h | 43 +++- cpp/src/arrow/dataset/file_ipc_test.cc | 9 +- cpp/src/arrow/dataset/file_parquet.cc | 83 ++++--- cpp/src/arrow/dataset/file_parquet.h | 48 +++- cpp/src/arrow/dataset/file_parquet_test.cc | 32 ++- cpp/src/arrow/dataset/partition.cc | 18 +- cpp/src/arrow/dataset/partition.h | 14 +- cpp/src/arrow/dataset/partition_test.cc | 2 +- cpp/src/arrow/dataset/test_util.h | 196 +++++++-------- cpp/src/arrow/dataset/type_fwd.h | 7 + cpp/src/arrow/util/mutex.cc | 10 + cpp/src/arrow/util/mutex.h | 29 +++ python/pyarrow/_dataset.pyx | 133 ++++------- python/pyarrow/dataset.py | 16 +- python/pyarrow/includes/libarrow_dataset.pxd | 25 +- python/pyarrow/tests/test_dataset.py | 86 +++---- 21 files changed, 738 insertions(+), 407 deletions(-) diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index 1e47b15ee43..90d270a4b30 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -31,6 +31,7 @@ #include "arrow/io/memory.h" #include "arrow/util/iterator.h" #include "arrow/util/logging.h" +#include "arrow/util/mutex.h" #include "arrow/util/task_group.h" namespace arrow { @@ -143,91 +144,193 @@ FragmentIterator FileSystemDataset::GetFragmentsImpl( return MakeVectorIterator(std::move(fragments)); } -struct WriteTask { - Status Execute(); +Status FileWriter::Write(RecordBatchReader* batches) { + for (std::shared_ptr batch;;) { + RETURN_NOT_OK(batches->ReadNext(&batch)); + if (batch == nullptr) break; + RETURN_NOT_OK(Write(batch)); + } + return Status::OK(); +} - /// The basename of files written by this WriteTask. 
Extensions - /// are derived from format - std::string basename; +struct NextBasenameGenerator { + static Result Make(const std::string& basename_template) { + if (basename_template.find(fs::internal::kSep) != std::string::npos) { + return Status::Invalid("basename_template contained '/'"); + } + size_t token_start = basename_template.find(token()); + if (token_start == std::string::npos) { + return Status::Invalid("basename_template did not contain '{i}'"); + } + return NextBasenameGenerator{basename_template, 0, token_start, + token_start + token().size()}; + } - /// The partitioning with which paths will be generated - std::shared_ptr partitioning; + static const std::string& token() { + static const std::string token = "{i}"; + return token; + } - /// The format in which fragments will be written - std::shared_ptr format; + const std::string& template_; + size_t i_, token_start_, token_end_; - /// The FileSystem and base directory into which fragments will be written - std::shared_ptr filesystem; - std::string base_dir; + std::string operator()() { + return template_.substr(0, token_start_) + std::to_string(i_++) + + template_.substr(token_end_); + } +}; - /// Batches to be written - std::shared_ptr batches; +using MutexedWriter = util::Mutexed>; - /// An Expression already satisfied by every batch to be written - std::shared_ptr partition_expression; -}; +struct WriterSet { + WriterSet(NextBasenameGenerator next_basename, + const FileSystemDatasetWriteOptions& write_options) + : next_basename_(std::move(next_basename)), + base_dir_(fs::internal::EnsureTrailingSlash(write_options.base_dir)), + write_options_(write_options) {} + + Result> Get(const Expression& partition_expression, + const std::shared_ptr& schema) { + ARROW_ASSIGN_OR_RAISE(auto part_segments, + write_options_.partitioning->Format(partition_expression)); + std::string dir = base_dir_ + part_segments; -Status WriteTask::Execute() { - std::unordered_map path_to_batches; - - // TODO(bkietz) these calls to Partition() should be scattered across a TaskGroup - for (auto maybe_batch : IteratorFromReader(batches)) { - ARROW_ASSIGN_OR_RAISE(auto batch, maybe_batch); - ARROW_ASSIGN_OR_RAISE(auto partitioned_batches, partitioning->Partition(batch)); - for (auto&& partitioned_batch : partitioned_batches) { - AndExpression expr(std::move(partitioned_batch.partition_expression), - partition_expression); - ARROW_ASSIGN_OR_RAISE(std::string path, partitioning->Format(expr)); - path = fs::internal::EnsureLeadingSlash(path); - path_to_batches[path].push_back(std::move(partitioned_batch.batch)); + auto lock = dir_to_writer_.Lock(); + + auto it_success = + dir_to_writer_->emplace(std::move(dir), std::make_shared()); + std::shared_ptr writer = it_success.first->second; + + if (!it_success.second) { + return writer; } + // FIXME Flush writers when at capacity for open files. 
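+    // The freshly inserted writer is locked before the map lock is released, so
+    // concurrent callers for the same directory (which receive the same shared_ptr
+    // through the early return above) cannot write through it until the output
+    // file has been opened and the FileWriter constructed below.
+    // Note that `dir` was moved into the map by emplace, so it is rebuilt from
+    // base_dir_ + part_segments before creating the directory.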
+ + std::string basename = next_basename_(); + auto writer_lock = writer->Lock(); + lock.Unlock(); + + dir = base_dir_ + part_segments; + RETURN_NOT_OK(write_options_.filesystem->CreateDir(dir)); + ARROW_ASSIGN_OR_RAISE(auto destination, + write_options_.filesystem->OpenOutputStream( + fs::internal::ConcatAbstractPath(dir, basename))); + ARROW_ASSIGN_OR_RAISE( + **writer, write_options_.format()->MakeWriter(std::move(destination), schema, + write_options_.file_write_options)); + return writer; } - for (auto&& path_batches : path_to_batches) { - auto dir = base_dir + path_batches.first; - RETURN_NOT_OK(filesystem->CreateDir(dir, /*recursive=*/true)); - - auto path = fs::internal::ConcatAbstractPath(dir, basename); - ARROW_ASSIGN_OR_RAISE(auto destination, filesystem->OpenOutputStream(path)); + Status FinishAll(internal::TaskGroup* task_group) { + for (const auto& dir_writer : *dir_to_writer_) { + task_group->Append([&] { + std::shared_ptr writer = **dir_writer.second; + return writer->Finish(); + }); + } - DCHECK(!path_batches.second.empty()); - ARROW_ASSIGN_OR_RAISE(auto reader, - RecordBatchReader::Make(std::move(path_batches.second))); - RETURN_NOT_OK(format->WriteFragment(reader.get(), destination.get())); + return Status::OK(); } - return Status::OK(); -} + // There should only be a single writer open for each partition directory at a time + util::Mutexed>> + dir_to_writer_; + NextBasenameGenerator next_basename_; + std::string base_dir_; + const FileSystemDatasetWriteOptions& write_options_; +}; Status FileSystemDataset::Write(std::shared_ptr schema, - std::shared_ptr format, - std::shared_ptr filesystem, - std::string base_dir, - std::shared_ptr partitioning, - std::shared_ptr scan_context, - FragmentIterator fragment_it) { - auto task_group = scan_context->TaskGroup(); - - base_dir = std::string(fs::internal::RemoveTrailingSlash(base_dir)); - - int i = 0; - for (auto maybe_fragment : fragment_it) { - ARROW_ASSIGN_OR_RAISE(auto fragment, maybe_fragment); - auto task = std::make_shared(); - - task->basename = "dat_" + std::to_string(i++) + "." + format->type_name(); - task->partition_expression = fragment->partition_expression(); - task->format = format; - task->filesystem = filesystem; - task->base_dir = base_dir; - task->partitioning = partitioning; - - // make a record batch reader which yields from a fragment - ARROW_ASSIGN_OR_RAISE(task->batches, FragmentRecordBatchReader::Make( - std::move(fragment), schema, scan_context)); - task_group->Append([task] { return task->Execute(); }); + const FileSystemDatasetWriteOptions& write_options, + std::shared_ptr scanner) { + auto task_group = scanner->context()->TaskGroup(); + + // Things we'll un-lazy for the sake of simplicity, with the tradeoff they represent: + // + // - Fragment iteration. Keeping this lazy would allow us to start partitioning/writing + // any fragments we have before waiting for discovery to complete. This isn't + // currently implemented for FileSystemDataset anyway: ARROW-8613 + // + // - ScanTask iteration. Keeping this lazy would save some unnecessary blocking when + // writing Fragments which produce scan tasks slowly. No Fragments do this. + // + // NB: neither of these will have any impact whatsoever on the common case of writing + // an in-memory table to disk. 
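+
+  // The write proceeds in three stages: every ScanTask is collected up front, one
+  // write task per ScanTask is run in a sub task group (each task partitions its
+  // batches, hands the groups to per-directory writers from WriterSet, and retries
+  // pending groups with TryLock so it never blocks on a writer held by another
+  // task), and finally all writers are closed once the sub group has finished.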
+ ARROW_ASSIGN_OR_RAISE(FragmentVector fragments, scanner->GetFragments().ToVector()); + ScanTaskVector scan_tasks; + std::vector fragment_for_task; + + for (const auto& fragment : fragments) { + ARROW_ASSIGN_OR_RAISE( + auto scan_task_it, + Scanner(fragment, scanner->options(), scanner->context()).Scan()); + for (auto maybe_scan_task : scan_task_it) { + ARROW_ASSIGN_OR_RAISE(auto scan_task, maybe_scan_task); + scan_tasks.push_back(std::move(scan_task)); + fragment_for_task.push_back(fragment.get()); + } + } + + ARROW_ASSIGN_OR_RAISE(auto next_basename, + NextBasenameGenerator::Make(write_options.basename_template)); + WriterSet writers(std::move(next_basename), write_options); + + auto write_task_group = task_group->MakeSubGroup(); + auto fragment_for_task_it = fragment_for_task.begin(); + for (const auto& scan_task : scan_tasks) { + const Fragment* fragment = *fragment_for_task_it++; + + write_task_group->Append([&, scan_task, fragment] { + ARROW_ASSIGN_OR_RAISE(auto batches, scan_task->Execute()); + + for (auto maybe_batch : batches) { + ARROW_ASSIGN_OR_RAISE(auto batch, maybe_batch); + ARROW_ASSIGN_OR_RAISE(auto groups, write_options.partitioning->Partition(batch)); + batch.reset(); // drop to hopefully conserve memory + + struct PendingWrite { + std::shared_ptr writer; + std::shared_ptr batch; + + Result TryWrite() { + if (auto lock = writer->TryLock()) { + RETURN_NOT_OK(writer->get()->Write(batch)); + return true; + } + return false; + } + }; + std::vector pending_writes(groups.batches.size()); + + for (size_t i = 0; i < groups.batches.size(); ++i) { + AndExpression partition_expression(std::move(groups.expressions[i]), + fragment->partition_expression()); + + ARROW_ASSIGN_OR_RAISE(auto writer, writers.Get(partition_expression, + groups.batches[i]->schema())); + + pending_writes[i] = {writer, std::move(groups.batches[i])}; + } + + for (size_t i = 0; !pending_writes.empty(); ++i) { + if (i >= pending_writes.size()) { + i = 0; + } + + ARROW_ASSIGN_OR_RAISE(auto success, pending_writes[i].TryWrite()); + if (success) { + std::swap(pending_writes.back(), pending_writes[i]); + pending_writes.pop_back(); + } + } + } + + return Status::OK(); + }); } + RETURN_NOT_OK(write_task_group->Finish()); + RETURN_NOT_OK(writers.FinishAll(task_group.get())); return task_group->Finish(); } diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h index 957c876de72..e8c84f4afb6 100644 --- a/cpp/src/arrow/dataset/file_base.h +++ b/cpp/src/arrow/dataset/file_base.h @@ -128,6 +128,9 @@ class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_this IsSupported(const FileSource& source) const = 0; @@ -151,10 +154,11 @@ class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_this> MakeFragment( FileSource source, std::shared_ptr physical_schema = NULLPTR); - /// \brief Write a fragment. - /// FIXME(bkietz) make this pure virtual - virtual Status WriteFragment(RecordBatchReader* batches, - io::OutputStream* destination) const = 0; + virtual Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options) const = 0; + + virtual std::shared_ptr DefaultWriteOptions() = 0; }; /// \brief A Fragment that is stored in a file with a known format @@ -210,19 +214,9 @@ class ARROW_DS_EXPORT FileSystemDataset : public Dataset { std::vector> fragments); /// \brief Write a dataset. - /// - /// \param[in] schema Schema of written dataset. - /// \param[in] format FileFormat with which fragments will be written. 
- /// \param[in] filesystem FileSystem into which the dataset will be written. - /// \param[in] base_dir Root directory into which the dataset will be written. - /// \param[in] partitioning Partitioning used to generate fragment paths. - /// \param[in] scan_context Resource pool used to scan and write fragments. - /// \param[in] fragments Fragments to be written to disk. - static Status Write(std::shared_ptr schema, std::shared_ptr format, - std::shared_ptr filesystem, std::string base_dir, - std::shared_ptr partitioning, - std::shared_ptr scan_context, - FragmentIterator fragments); + static Status Write(std::shared_ptr schema, + const FileSystemDatasetWriteOptions& write_options, + std::shared_ptr scanner); /// \brief Return the type name of the dataset. std::string type_name() const override { return "filesystem"; } @@ -256,5 +250,62 @@ class ARROW_DS_EXPORT FileSystemDataset : public Dataset { std::vector> fragments_; }; +class ARROW_DS_EXPORT FileWriteOptions { + public: + virtual ~FileWriteOptions() = default; + + const std::shared_ptr& format() const { return format_; } + + protected: + explicit FileWriteOptions(std::shared_ptr format) + : format_(std::move(format)) {} + + std::shared_ptr format_; +}; + +class ARROW_DS_EXPORT FileWriter { + public: + virtual ~FileWriter() = default; + + virtual Status Write(const std::shared_ptr& batch) = 0; + + Status Write(RecordBatchReader* batches); + + virtual Status Finish() = 0; + + const std::shared_ptr& format() const { return options_->format(); } + const std::shared_ptr& schema() const { return schema_; } + const std::shared_ptr& options() const { return options_; } + + protected: + FileWriter(std::shared_ptr schema, std::shared_ptr options) + : schema_(std::move(schema)), options_(std::move(options)) {} + + std::shared_ptr schema_; + std::shared_ptr options_; +}; + +struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions { + /// Options for individual fragment writing. + std::shared_ptr file_write_options; + + /// FileSystem into which a dataset will be written. + std::shared_ptr filesystem; + + /// Root directory into which the dataset will be written. + std::string base_dir; + + /// Partitioning used to generate fragment paths. + std::shared_ptr partitioning; + + /// Template string used to generate fragment basenames. + /// {i} will be replaced by an auto incremented integer. 
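+  /// For example, a basename_template of "dat_{i}" will produce basenames
+  /// "dat_0", "dat_1", ...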
+ std::string basename_template; + + const std::shared_ptr& format() const { + return file_write_options->format(); + } +}; + } // namespace dataset } // namespace arrow diff --git a/cpp/src/arrow/dataset/file_csv.cc b/cpp/src/arrow/dataset/file_csv.cc index 3b474b428d9..f889b12fddd 100644 --- a/cpp/src/arrow/dataset/file_csv.cc +++ b/cpp/src/arrow/dataset/file_csv.cc @@ -157,6 +157,22 @@ class CsvScanTask : public ScanTask { FileSource source_; }; +bool CsvFileFormat::Equals(const FileFormat& format) const { + if (type_name() != format.type_name()) return false; + + const auto& other_parse_options = + checked_cast(format).parse_options; + + return parse_options.delimiter == other_parse_options.delimiter && + parse_options.quoting == other_parse_options.quoting && + parse_options.quote_char == other_parse_options.quote_char && + parse_options.double_quote == other_parse_options.double_quote && + parse_options.escaping == other_parse_options.escaping && + parse_options.escape_char == other_parse_options.escape_char && + parse_options.newlines_in_values == other_parse_options.newlines_in_values && + parse_options.ignore_empty_lines == other_parse_options.ignore_empty_lines; +} + Result CsvFileFormat::IsSupported(const FileSource& source) const { RETURN_NOT_OK(source.Open().status()); return OpenReader(source, *this).ok(); diff --git a/cpp/src/arrow/dataset/file_csv.h b/cpp/src/arrow/dataset/file_csv.h index 022517f4585..df5593459e3 100644 --- a/cpp/src/arrow/dataset/file_csv.h +++ b/cpp/src/arrow/dataset/file_csv.h @@ -37,6 +37,8 @@ class ARROW_DS_EXPORT CsvFileFormat : public FileFormat { std::string type_name() const override { return "csv"; } + bool Equals(const FileFormat& other) const override; + Result IsSupported(const FileSource& source) const override; /// \brief Return the schema of the file if possible. 
@@ -47,9 +49,13 @@ class ARROW_DS_EXPORT CsvFileFormat : public FileFormat { std::shared_ptr context, FileFragment* fragment) const override; - Status WriteFragment(RecordBatchReader*, io::OutputStream*) const override { + Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options) const override { return Status::NotImplemented("writing fragment of CsvFileFormat"); } + + std::shared_ptr DefaultWriteOptions() override { return NULLPTR; } }; } // namespace dataset diff --git a/cpp/src/arrow/dataset/file_ipc.cc b/cpp/src/arrow/dataset/file_ipc.cc index b849295bba5..9c7d643f9ac 100644 --- a/cpp/src/arrow/dataset/file_ipc.cc +++ b/cpp/src/arrow/dataset/file_ipc.cc @@ -27,9 +27,13 @@ #include "arrow/dataset/scanner.h" #include "arrow/ipc/reader.h" #include "arrow/ipc/writer.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/iterator.h" namespace arrow { + +using internal::checked_pointer_cast; + namespace dataset { static inline ipc::IpcReadOptions default_read_options() { @@ -159,18 +163,44 @@ Result IpcFileFormat::ScanFile(std::shared_ptr op fragment->source()); } -Status IpcFileFormat::WriteFragment(RecordBatchReader* batches, - io::OutputStream* destination) const { - ARROW_ASSIGN_OR_RAISE(auto writer, ipc::MakeFileWriter(destination, batches->schema())); +/// +/// IpcFileWriter, IpcFileWriteOptions +/// + +std::shared_ptr IpcFileFormat::DefaultWriteOptions() { + std::shared_ptr options( + new IpcFileWriteOptions(shared_from_this())); + + options->ipc_options = + std::make_shared(ipc::IpcWriteOptions::Defaults()); + return options; +} - for (;;) { - ARROW_ASSIGN_OR_RAISE(auto batch, batches->Next()); - if (batch == nullptr) break; - RETURN_NOT_OK(writer->WriteRecordBatch(*batch)); +Result> IpcFileFormat::MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options) const { + if (!Equals(*options->format())) { + return Status::TypeError("Mismatching format/write options."); } - return writer->Close(); + auto ipc_options = checked_pointer_cast(options); + + ARROW_ASSIGN_OR_RAISE(auto writer, ipc::MakeFileWriter(destination, schema)); + return std::shared_ptr( + new IpcFileWriter(std::move(writer), std::move(schema), std::move(ipc_options))); } +IpcFileWriter::IpcFileWriter(std::shared_ptr writer, + std::shared_ptr schema, + std::shared_ptr options) + : FileWriter(std::move(schema), std::move(options)), + batch_writer_(std::move(writer)) {} + +Status IpcFileWriter::Write(const std::shared_ptr& batch) { + return batch_writer_->WriteRecordBatch(*batch); +} + +Status IpcFileWriter::Finish() { return batch_writer_->Close(); } + } // namespace dataset } // namespace arrow diff --git a/cpp/src/arrow/dataset/file_ipc.h b/cpp/src/arrow/dataset/file_ipc.h index be11c71a95a..50650bfedb0 100644 --- a/cpp/src/arrow/dataset/file_ipc.h +++ b/cpp/src/arrow/dataset/file_ipc.h @@ -28,6 +28,12 @@ #include "arrow/result.h" namespace arrow { +namespace ipc { + +class RecordBatchWriter; +struct IpcWriteOptions; + +} // namespace ipc namespace dataset { /// \brief A FileFormat implementation that reads from and writes to Ipc files @@ -35,6 +41,10 @@ class ARROW_DS_EXPORT IpcFileFormat : public FileFormat { public: std::string type_name() const override { return "ipc"; } + bool Equals(const FileFormat& other) const override { + return type_name() == other.type_name(); + } + bool splittable() const override { return true; } Result IsSupported(const FileSource& source) const override; @@ -47,8 +57,37 @@ class ARROW_DS_EXPORT 
IpcFileFormat : public FileFormat { std::shared_ptr context, FileFragment* fragment) const override; - Status WriteFragment(RecordBatchReader* batches, - io::OutputStream* destination) const override; + Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options) const override; + + std::shared_ptr DefaultWriteOptions() override; +}; + +class ARROW_DS_EXPORT IpcFileWriteOptions : public FileWriteOptions { + public: + std::shared_ptr ipc_options; + + protected: + using FileWriteOptions::FileWriteOptions; + + friend class IpcFileFormat; +}; + +class ARROW_DS_EXPORT IpcFileWriter : public FileWriter { + public: + Status Write(const std::shared_ptr& batch) override; + + Status Finish() override; + + private: + IpcFileWriter(std::shared_ptr writer, + std::shared_ptr schema, + std::shared_ptr options); + + std::shared_ptr batch_writer_; + + friend class IpcFileFormat; }; } // namespace dataset diff --git a/cpp/src/arrow/dataset/file_ipc_test.cc b/cpp/src/arrow/dataset/file_ipc_test.cc index 41ac5a9b7b0..808bae568b4 100644 --- a/cpp/src/arrow/dataset/file_ipc_test.cc +++ b/cpp/src/arrow/dataset/file_ipc_test.cc @@ -156,7 +156,10 @@ TEST_F(TestIpcFileFormat, WriteRecordBatchReader) { EXPECT_OK_AND_ASSIGN(auto sink, GetFileSink()); - ASSERT_OK(format_->WriteFragment(reader.get(), sink.get())); + auto options = format_->DefaultWriteOptions(); + EXPECT_OK_AND_ASSIGN(auto writer, format_->MakeWriter(sink, reader->schema(), options)); + ASSERT_OK(writer->Write(reader.get())); + ASSERT_OK(writer->Finish()); EXPECT_OK_AND_ASSIGN(auto written, sink->Finish()); @@ -168,7 +171,9 @@ class TestIpcFileSystemDataset : public testing::Test, public: void SetUp() override { MakeSourceDataset(); - format_ = std::make_shared(); + auto ipc_format = std::make_shared(); + format_ = ipc_format; + SetWriteOptions(ipc_format->DefaultWriteOptions()); } }; diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 29ed18817d9..ced75c652c4 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -42,6 +42,7 @@ namespace arrow { using internal::checked_cast; +using internal::checked_pointer_cast; namespace dataset { @@ -285,12 +286,20 @@ class ParquetScanTaskIterator { size_t idx_ = 0; }; -ParquetFileFormat::ParquetFileFormat() - : writer_properties(parquet::default_writer_properties()), - arrow_writer_properties(parquet::default_arrow_writer_properties()) {} +bool ParquetFileFormat::Equals(const FileFormat& other) const { + if (other.type_name() != type_name()) return false; -ParquetFileFormat::ParquetFileFormat(const parquet::ReaderProperties& reader_properties) - : ParquetFileFormat() { + const auto& other_reader_options = + checked_cast(other).reader_options; + + // FIXME implement comparison for decryption options + // FIXME extract these to scan time options so comparison is unnecessary + return reader_options.use_buffered_stream == other_reader_options.use_buffered_stream && + reader_options.buffer_size == other_reader_options.buffer_size && + reader_options.dict_columns == other_reader_options.dict_columns; +} + +ParquetFileFormat::ParquetFileFormat(const parquet::ReaderProperties& reader_properties) { reader_options.use_buffered_stream = reader_properties.is_buffered_stream_enabled(); reader_options.buffer_size = reader_properties.buffer_size(); reader_options.file_decryption_properties = @@ -352,28 +361,6 @@ static inline bool RowGroupInfosAreComplete(const std::vector& inf [](const RowGroupInfo& 
i) { return i.HasStatistics(); }); } -Status ParquetFileFormat::WriteFragment(RecordBatchReader* batches, - io::OutputStream* destination) const { - using parquet::arrow::FileWriter; - - std::shared_ptr shared_destination{destination, - [](io::OutputStream*) {}}; - - std::unique_ptr writer; - RETURN_NOT_OK(FileWriter::Open(*batches->schema(), default_memory_pool(), - shared_destination, writer_properties, - arrow_writer_properties, &writer)); - - for (;;) { - ARROW_ASSIGN_OR_RAISE(auto batch, batches->Next()); - if (batch == nullptr) break; - ARROW_ASSIGN_OR_RAISE(auto table, Table::FromRecordBatches(batch->schema(), {batch})); - RETURN_NOT_OK(writer->WriteTable(*table, batch->num_rows())); - } - - return writer->Close(); -} - Result ParquetFileFormat::ScanFile(std::shared_ptr options, std::shared_ptr context, FileFragment* fragment) const { @@ -440,6 +427,48 @@ Result> ParquetFileFormat::MakeFragment( std::move(physical_schema), {})); } +/// +/// ParquetFileWriter, ParquetFileWriteOptions +/// + +std::shared_ptr ParquetFileFormat::DefaultWriteOptions() { + std::shared_ptr options( + new ParquetFileWriteOptions(shared_from_this())); + options->writer_properties = parquet::default_writer_properties(); + options->arrow_writer_properties = parquet::default_arrow_writer_properties(); + return options; +} + +Result> ParquetFileFormat::MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options) const { + if (!Equals(*options->format())) { + return Status::TypeError("Mismatching format/write options"); + } + + auto parquet_options = checked_pointer_cast(options); + + std::unique_ptr parquet_writer; + RETURN_NOT_OK(parquet::arrow::FileWriter::Open( + *schema, default_memory_pool(), destination, parquet_options->writer_properties, + parquet_options->arrow_writer_properties, &parquet_writer)); + + return std::shared_ptr( + new ParquetFileWriter(std::move(parquet_writer), std::move(parquet_options))); +} + +ParquetFileWriter::ParquetFileWriter(std::shared_ptr writer, + std::shared_ptr options) + : FileWriter(writer->schema(), std::move(options)), + parquet_writer_(std::move(writer)) {} + +Status ParquetFileWriter::Write(const std::shared_ptr& batch) { + ARROW_ASSIGN_OR_RAISE(auto table, Table::FromRecordBatches(batch->schema(), {batch})); + return parquet_writer_->WriteTable(*table, batch->num_rows()); +} + +Status ParquetFileWriter::Finish() { return parquet_writer_->Close(); } + /// /// RowGroupInfo /// diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h index 939fdc53687..25235128257 100644 --- a/cpp/src/arrow/dataset/file_parquet.h +++ b/cpp/src/arrow/dataset/file_parquet.h @@ -41,11 +41,13 @@ class FileEncryptionProperties; class ReaderProperties; class ArrowReaderProperties; + class WriterProperties; class ArrowWriterProperties; namespace arrow { class FileReader; +class FileWriter; } // namespace arrow } // namespace parquet @@ -57,7 +59,7 @@ class RowGroupInfo; /// \brief A FileFormat implementation that reads from Parquet files class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat { public: - ParquetFileFormat(); + ParquetFileFormat() = default; /// Convenience constructor which copies properties from a parquet::ReaderProperties. /// memory_pool will be ignored. 
@@ -67,6 +69,8 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat { bool splittable() const override { return true; } + bool Equals(const FileFormat& other) const override; + struct ReaderOptions { /// \defgroup parquet-file-format-reader-properties properties which correspond to /// members of parquet::ReaderProperties. @@ -99,10 +103,6 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat { bool enable_parallel_column_conversion = false; } reader_options; - std::shared_ptr writer_properties; - - std::shared_ptr arrow_writer_properties; - Result IsSupported(const FileSource& source) const override; /// \brief Return the schema of the file if possible. @@ -130,8 +130,11 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat { Result> GetReader( const FileSource& source, ScanOptions* = NULLPTR, ScanContext* = NULLPTR) const; - Status WriteFragment(RecordBatchReader* batches, - io::OutputStream* destination) const override; + Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options) const override; + + std::shared_ptr DefaultWriteOptions() override; }; /// \brief Represents a parquet's RowGroup with extra information. @@ -248,6 +251,37 @@ class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment { friend class ParquetFileFormat; }; +class ARROW_DS_EXPORT ParquetFileWriteOptions : public FileWriteOptions { + public: + std::shared_ptr writer_properties; + + std::shared_ptr arrow_writer_properties; + + protected: + using FileWriteOptions::FileWriteOptions; + + friend class ParquetFileFormat; +}; + +class ARROW_DS_EXPORT ParquetFileWriter : public FileWriter { + public: + const std::shared_ptr& parquet_writer() const { + return parquet_writer_; + } + + Status Write(const std::shared_ptr& batch) override; + + Status Finish() override; + + private: + ParquetFileWriter(std::shared_ptr writer, + std::shared_ptr options); + + std::shared_ptr parquet_writer_; + + friend class ParquetFileFormat; +}; + struct ParquetFactoryOptions { // Either an explicit Partitioning or a PartitioningFactory to discover one. 
// diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index d0a22e98382..ec47259d30a 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -48,7 +48,6 @@ using parquet::default_writer_properties; using parquet::WriterProperties; using parquet::CreateOutputStream; -using parquet::arrow::FileWriter; using parquet::arrow::WriteTable; using testing::Pointee; @@ -57,7 +56,7 @@ using internal::checked_pointer_cast; class ArrowParquetWriterMixin : public ::testing::Test { public: - Status WriteRecordBatch(const RecordBatch& batch, FileWriter* writer) { + Status WriteRecordBatch(const RecordBatch& batch, parquet::arrow::FileWriter* writer) { auto schema = batch.schema(); auto size = batch.num_rows(); @@ -75,7 +74,8 @@ class ArrowParquetWriterMixin : public ::testing::Test { return Status::OK(); } - Status WriteRecordBatchReader(RecordBatchReader* reader, FileWriter* writer) { + Status WriteRecordBatchReader(RecordBatchReader* reader, + parquet::arrow::FileWriter* writer) { auto schema = reader->schema(); if (!schema->Equals(*writer->schema(), false)) { @@ -96,9 +96,9 @@ class ArrowParquetWriterMixin : public ::testing::Test { const std::shared_ptr& properties = default_writer_properties(), const std::shared_ptr& arrow_properties = default_arrow_writer_properties()) { - std::unique_ptr writer; - RETURN_NOT_OK(FileWriter::Open(*reader->schema(), pool, sink, properties, - arrow_properties, &writer)); + std::unique_ptr writer; + RETURN_NOT_OK(parquet::arrow::FileWriter::Open( + *reader->schema(), pool, sink, properties, arrow_properties, &writer)); RETURN_NOT_OK(WriteRecordBatchReader(reader, writer.get())); return writer->Close(); } @@ -554,7 +554,10 @@ TEST_F(TestParquetFileFormat, WriteRecordBatchReader) { EXPECT_OK_AND_ASSIGN(auto sink, GetFileSink()); - ASSERT_OK(format_->WriteFragment(reader.get(), sink.get())); + auto options = format_->DefaultWriteOptions(); + EXPECT_OK_AND_ASSIGN(auto writer, format_->MakeWriter(sink, reader->schema(), options)); + ASSERT_OK(writer->Write(reader.get())); + ASSERT_OK(writer->Finish()); EXPECT_OK_AND_ASSIGN(auto written, sink->Finish()); @@ -572,16 +575,21 @@ TEST_F(TestParquetFileFormat, WriteRecordBatchReaderCustomOptions) { EXPECT_OK_AND_ASSIGN(auto sink, GetFileSink()); - format_->writer_properties = parquet::WriterProperties::Builder() + auto options = + checked_pointer_cast(format_->DefaultWriteOptions()); + options->writer_properties = parquet::WriterProperties::Builder() .created_by("TestParquetFileFormat") ->disable_statistics() ->build(); - format_->arrow_writer_properties = parquet::ArrowWriterProperties::Builder() + options->arrow_writer_properties = parquet::ArrowWriterProperties::Builder() .coerce_timestamps(coerce_timestamps_to) ->build(); - ASSERT_OK(format_->WriteFragment(reader.get(), sink.get())); + EXPECT_OK_AND_ASSIGN(auto writer, format_->MakeWriter(sink, reader->schema(), options)); + ASSERT_OK(writer->Write(reader.get())); + ASSERT_OK(writer->Finish()); + EXPECT_OK_AND_ASSIGN(auto written, sink->Finish()); EXPECT_OK_AND_ASSIGN(auto fragment, format_->MakeFragment(FileSource{written})); @@ -596,7 +604,9 @@ class TestParquetFileSystemDataset : public WriteFileSystemDatasetMixin, void SetUp() override { MakeSourceDataset(); check_metadata_ = false; - format_ = std::make_shared(); + auto parquet_format = std::make_shared(); + format_ = parquet_format; + SetWriteOptions(parquet_format->DefaultWriteOptions()); } }; diff --git 
a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index a47ba849cfc..159e0ac0331 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -69,9 +69,9 @@ std::shared_ptr Partitioning::Default() { " Partitioning"); } - Result> Partition( + Result Partition( const std::shared_ptr& batch) const override { - return std::vector{{batch, scalar(true)}}; + return PartitionedBatches{{batch}, {scalar(true)}}; } }; @@ -139,7 +139,7 @@ inline std::shared_ptr ConjunctionFromGroupingRow(Scalar* row) { return and_(std::move(equality_expressions)); } -Result> KeyValuePartitioning::Partition( +Result KeyValuePartitioning::Partition( const std::shared_ptr& batch) const { FieldVector by_fields; ArrayVector by_columns; @@ -158,7 +158,7 @@ Result> KeyValuePartitioning::Partit if (by_fields.empty()) { // no fields to group by; return the whole batch - return std::vector{{batch, scalar(true)}}; + return PartitionedBatches{{batch}, {scalar(true)}}; } ARROW_ASSIGN_OR_RAISE(auto by, @@ -168,13 +168,13 @@ Result> KeyValuePartitioning::Partit checked_pointer_cast(groupings_and_values->GetFieldByName("groupings")); auto unique_rows = groupings_and_values->GetFieldByName("values"); - ARROW_ASSIGN_OR_RAISE(auto grouped_batches, ApplyGroupings(*groupings, rest)); + PartitionedBatches out; + ARROW_ASSIGN_OR_RAISE(out.batches, ApplyGroupings(*groupings, rest)); + out.expressions.resize(out.batches.size()); - std::vector out(grouped_batches.size()); - for (size_t i = 0; i < out.size(); ++i) { + for (size_t i = 0; i < out.batches.size(); ++i) { ARROW_ASSIGN_OR_RAISE(auto row, unique_rows->GetScalar(i)); - out[i].partition_expression = ConjunctionFromGroupingRow(row.get()); - out[i].batch = std::move(grouped_batches[i]); + out.expressions[i] = ConjunctionFromGroupingRow(row.get()); } return out; } diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index 6228999b41d..165fcfb5248 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -60,12 +60,12 @@ class ARROW_DS_EXPORT Partitioning { virtual std::string type_name() const = 0; /// \brief If the input batch shares any fields with this partitioning, - /// produce slices of the batch which satisfy mutually exclusive Expressions. - struct PartitionedBatch { - std::shared_ptr batch; - std::shared_ptr partition_expression; + /// produce sub-batches which satisfy mutually exclusive Expressions. 
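+  /// For example, partitioning on "year" yields one sub-batch per distinct year
+  /// value, each paired with an expression such as ("year" == 2018).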
+ struct PartitionedBatches { + RecordBatchVector batches; + ExpressionVector expressions; }; - virtual Result> Partition( + virtual Result Partition( const std::shared_ptr& batch) const = 0; /// \brief Parse a path into a partition expression @@ -133,7 +133,7 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning { static Status SetDefaultValuesFromKeys(const Expression& expr, RecordBatchProjector* projector); - Result> Partition( + Result Partition( const std::shared_ptr& batch) const override; Result> Parse(const std::string& path) const override; @@ -240,7 +240,7 @@ class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning { return Status::NotImplemented("formatting paths from ", type_name(), " Partitioning"); } - Result> Partition( + Result Partition( const std::shared_ptr& batch) const override { return Status::NotImplemented("partitioning batches from ", type_name(), " Partitioning"); diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index ef92a99a967..e9ea2539e89 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -435,7 +435,7 @@ class RangePartitioning : public Partitioning { } Result Format(const Expression&) const override { return ""; } - Result> Partition( + Result Partition( const std::shared_ptr&) const override { return Status::OK(); } diff --git a/cpp/src/arrow/dataset/test_util.h b/cpp/src/arrow/dataset/test_util.h index fcecf46552a..7822f52fe82 100644 --- a/cpp/src/arrow/dataset/test_util.h +++ b/cpp/src/arrow/dataset/test_util.h @@ -199,6 +199,11 @@ class DummyFileFormat : public FileFormat { std::string type_name() const override { return "dummy"; } + bool Equals(const FileFormat& other) const override { + return type_name() == other.type_name() && + schema_->Equals(checked_cast(other).schema_); + } + Result IsSupported(const FileSource& source) const override { return true; } Result> Inspect(const FileSource& source) const override { @@ -212,10 +217,14 @@ class DummyFileFormat : public FileFormat { return MakeEmptyIterator>(); } - Status WriteFragment(RecordBatchReader*, io::OutputStream*) const override { + Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options) const override { return Status::NotImplemented("writing fragment of DummyFileFormat"); } + std::shared_ptr DefaultWriteOptions() override { return nullptr; } + protected: std::shared_ptr schema_; }; @@ -230,6 +239,8 @@ class JSONRecordBatchFileFormat : public FileFormat { explicit JSONRecordBatchFileFormat(SchemaResolver resolver) : resolver_(std::move(resolver)) {} + bool Equals(const FileFormat& other) const override { return this == &other; } + std::string type_name() const override { return "json_record_batch"; } /// \brief Return true if the given file extension @@ -255,10 +266,14 @@ class JSONRecordBatchFileFormat : public FileFormat { std::move(context)); } - Status WriteFragment(RecordBatchReader*, io::OutputStream*) const override { + Result> MakeWriter( + std::shared_ptr destination, std::shared_ptr schema, + std::shared_ptr options) const override { return Status::NotImplemented("writing fragment of JSONRecordBatchFileFormat"); } + std::shared_ptr DefaultWriteOptions() override { return nullptr; } + protected: SchemaResolver resolver_; }; @@ -560,46 +575,54 @@ class WriteFileSystemDatasetMixin : public MakeFileSystemDatasetMixin { ASSERT_OK_AND_ASSIGN(auto factory, FileSystemDatasetFactory::Make(fs_, s, source_format, options)); 
ASSERT_OK_AND_ASSIGN(dataset_, factory->Finish()); + + scan_options_ = ScanOptions::Make(source_schema_); } - void TestWriteWithIdenticalPartitioningSchema() { - auto desired_partitioning = std::make_shared( - SchemaFromColumnNames(source_schema_, {"year", "month"})); + void SetWriteOptions(std::shared_ptr file_write_options) { + write_options_.file_write_options = file_write_options; + write_options_.filesystem = fs_; + write_options_.base_dir = "new_root/"; + write_options_.basename_template = "dat_{i}"; + } - ASSERT_OK(FileSystemDataset::Write( - source_schema_, format_, fs_, "new_root/", desired_partitioning, - std::make_shared(), dataset_->GetFragments())); + void DoWrite(std::shared_ptr desired_partitioning) { + write_options_.partitioning = desired_partitioning; + auto scanner = std::make_shared(dataset_, scan_options_, scan_context_); + ASSERT_OK(FileSystemDataset::Write(source_schema_, write_options_, scanner)); + // re-discover the written dataset fs::FileSelector s; s.recursive = true; s.base_dir = "/new_root"; - FileSystemFactoryOptions options; - options.partitioning = desired_partitioning; - ASSERT_OK_AND_ASSIGN(auto factory, - FileSystemDatasetFactory::Make(fs_, s, format_, options)); + FileSystemFactoryOptions factory_options; + factory_options.partitioning = desired_partitioning; + ASSERT_OK_AND_ASSIGN( + auto factory, FileSystemDatasetFactory::Make(fs_, s, format_, factory_options)); ASSERT_OK_AND_ASSIGN(written_, factory->Finish()); + } + + void TestWriteWithIdenticalPartitioningSchema() { + DoWrite(std::make_shared( + SchemaFromColumnNames(source_schema_, {"year", "month"}))); - expected_files_["/new_root/2018/1/dat_0." + format_->type_name()] = R"([ + expected_files_["/new_root/2018/1/dat_0"] = R"([ {"region": "NY", "model": "3", "sales": 742.0, "country": "US"}, {"region": "NY", "model": "S", "sales": 304.125, "country": "US"}, - {"region": "NY", "model": "Y", "sales": 27.5, "country": "US"} - ])"; - expected_files_["/new_root/2018/1/dat_1." + format_->type_name()] = R"([ + {"region": "NY", "model": "Y", "sales": 27.5, "country": "US"}, {"region": "QC", "model": "3", "sales": 512, "country": "CA"}, {"region": "QC", "model": "S", "sales": 978, "country": "CA"}, {"region": "NY", "model": "X", "sales": 136.25, "country": "US"}, {"region": "QC", "model": "X", "sales": 1.0, "country": "CA"}, {"region": "QC", "model": "Y", "sales": 69, "country": "CA"} ])"; - expected_files_["/new_root/2019/1/dat_2." + format_->type_name()] = R"([ + expected_files_["/new_root/2019/1/dat_1"] = R"([ {"region": "CA", "model": "3", "sales": 273.5, "country": "US"}, {"region": "CA", "model": "S", "sales": 13, "country": "US"}, {"region": "CA", "model": "X", "sales": 54, "country": "US"}, {"region": "QC", "model": "S", "sales": 10, "country": "CA"}, - {"region": "CA", "model": "Y", "sales": 21, "country": "US"} - ])"; - expected_files_["/new_root/2019/1/dat_3." 
+ format_->type_name()] = R"([ + {"region": "CA", "model": "Y", "sales": 21, "country": "US"}, {"region": "QC", "model": "3", "sales": 152.25, "country": "CA"}, {"region": "QC", "model": "X", "sales": 42, "country": "CA"}, {"region": "QC", "model": "Y", "sales": 37, "country": "CA"} @@ -611,52 +634,32 @@ class WriteFileSystemDatasetMixin : public MakeFileSystemDatasetMixin { } void TestWriteWithUnrelatedPartitioningSchema() { - auto desired_partitioning = std::make_shared( - SchemaFromColumnNames(source_schema_, {"country", "region"})); - - ASSERT_OK(FileSystemDataset::Write( - source_schema_, format_, fs_, "new_root/", desired_partitioning, - std::make_shared(), dataset_->GetFragments())); - - fs::FileSelector s; - s.recursive = true; - s.base_dir = "/new_root"; - - FileSystemFactoryOptions options; - options.partitioning = desired_partitioning; - ASSERT_OK_AND_ASSIGN(auto factory, - FileSystemDatasetFactory::Make(fs_, s, format_, options)); - ASSERT_OK_AND_ASSIGN(written_, factory->Finish()); + DoWrite(std::make_shared( + SchemaFromColumnNames(source_schema_, {"country", "region"}))); // XXX first thing a user will be annoyed by: we don't support left // padding the month field with 0. - expected_files_["/new_root/US/NY/dat_0." + format_->type_name()] = R"([ + expected_files_["/new_root/US/NY/dat_0"] = R"([ {"year": 2018, "month": 1, "model": "3", "sales": 742.0}, {"year": 2018, "month": 1, "model": "S", "sales": 304.125}, - {"year": 2018, "month": 1, "model": "Y", "sales": 27.5} - ])"; - expected_files_["/new_root/US/NY/dat_1." + format_->type_name()] = R"([ + {"year": 2018, "month": 1, "model": "Y", "sales": 27.5}, {"year": 2018, "month": 1, "model": "X", "sales": 136.25} ])"; - expected_files_["/new_root/CA/QC/dat_1." + format_->type_name()] = R"([ + expected_files_["/new_root/CA/QC/dat_1"] = R"([ {"year": 2018, "month": 1, "model": "3", "sales": 512}, {"year": 2018, "month": 1, "model": "S", "sales": 978}, {"year": 2018, "month": 1, "model": "X", "sales": 1.0}, - {"year": 2018, "month": 1, "model": "Y", "sales": 69} + {"year": 2018, "month": 1, "model": "Y", "sales": 69}, + {"year": 2019, "month": 1, "model": "S", "sales": 10}, + {"year": 2019, "month": 1, "model": "3", "sales": 152.25}, + {"year": 2019, "month": 1, "model": "X", "sales": 42}, + {"year": 2019, "month": 1, "model": "Y", "sales": 37} ])"; - expected_files_["/new_root/US/CA/dat_2." + format_->type_name()] = R"([ + expected_files_["/new_root/US/CA/dat_2"] = R"([ {"year": 2019, "month": 1, "model": "3", "sales": 273.5}, {"year": 2019, "month": 1, "model": "S", "sales": 13}, {"year": 2019, "month": 1, "model": "X", "sales": 54}, {"year": 2019, "month": 1, "model": "Y", "sales": 21} - ])"; - expected_files_["/new_root/CA/QC/dat_2." + format_->type_name()] = R"([ - {"year": 2019, "month": 1, "model": "S", "sales": 10} - ])"; - expected_files_["/new_root/CA/QC/dat_3." 
+ format_->type_name()] = R"([ - {"year": 2019, "month": 1, "model": "3", "sales": 152.25}, - {"year": 2019, "month": 1, "model": "X", "sales": 42}, - {"year": 2019, "month": 1, "model": "Y", "sales": 37} ])"; expected_physical_schema_ = SchemaFromColumnNames(source_schema_, {"model", "sales", "year", "month"}); @@ -665,49 +668,31 @@ class WriteFileSystemDatasetMixin : public MakeFileSystemDatasetMixin { } void TestWriteWithSupersetPartitioningSchema() { - auto desired_partitioning = std::make_shared( - SchemaFromColumnNames(source_schema_, {"year", "month", "country", "region"})); - - ASSERT_OK(FileSystemDataset::Write( - source_schema_, format_, fs_, "new_root/", desired_partitioning, - std::make_shared(), dataset_->GetFragments())); - - fs::FileSelector s; - s.recursive = true; - s.base_dir = "/new_root"; - - FileSystemFactoryOptions options; - options.partitioning = desired_partitioning; - ASSERT_OK_AND_ASSIGN(auto factory, - FileSystemDatasetFactory::Make(fs_, s, format_, options)); - ASSERT_OK_AND_ASSIGN(written_, factory->Finish()); + DoWrite(std::make_shared( + SchemaFromColumnNames(source_schema_, {"year", "month", "country", "region"}))); // XXX first thing a user will be annoyed by: we don't support left // padding the month field with 0. - expected_files_["/new_root/2018/1/US/NY/dat_0." + format_->type_name()] = R"([ + expected_files_["/new_root/2018/1/US/NY/dat_0"] = R"([ {"model": "3", "sales": 742.0}, {"model": "S", "sales": 304.125}, - {"model": "Y", "sales": 27.5} - ])"; - expected_files_["/new_root/2018/1/US/NY/dat_1." + format_->type_name()] = R"([ + {"model": "Y", "sales": 27.5}, {"model": "X", "sales": 136.25} ])"; - expected_files_["/new_root/2018/1/CA/QC/dat_1." + format_->type_name()] = R"([ + expected_files_["/new_root/2018/1/CA/QC/dat_1"] = R"([ {"model": "3", "sales": 512}, {"model": "S", "sales": 978}, {"model": "X", "sales": 1.0}, {"model": "Y", "sales": 69} ])"; - expected_files_["/new_root/2019/1/US/CA/dat_2." + format_->type_name()] = R"([ + expected_files_["/new_root/2019/1/US/CA/dat_2"] = R"([ {"model": "3", "sales": 273.5}, {"model": "S", "sales": 13}, {"model": "X", "sales": 54}, {"model": "Y", "sales": 21} ])"; - expected_files_["/new_root/2019/1/CA/QC/dat_2." + format_->type_name()] = R"([ - {"model": "S", "sales": 10} - ])"; - expected_files_["/new_root/2019/1/CA/QC/dat_3." + format_->type_name()] = R"([ + expected_files_["/new_root/2019/1/CA/QC/dat_3"] = R"([ + {"model": "S", "sales": 10}, {"model": "3", "sales": 152.25}, {"model": "X", "sales": 42}, {"model": "Y", "sales": 37} @@ -718,43 +703,23 @@ class WriteFileSystemDatasetMixin : public MakeFileSystemDatasetMixin { } void TestWriteWithEmptyPartitioningSchema() { - auto desired_partitioning = std::make_shared( - SchemaFromColumnNames(source_schema_, {})); - - ASSERT_OK(FileSystemDataset::Write( - source_schema_, format_, fs_, "new_root/", desired_partitioning, - std::make_shared(), dataset_->GetFragments())); + DoWrite(std::make_shared( + SchemaFromColumnNames(source_schema_, {}))); - fs::FileSelector s; - s.recursive = true; - s.base_dir = "/new_root"; - - FileSystemFactoryOptions options; - options.partitioning = desired_partitioning; - ASSERT_OK_AND_ASSIGN(auto factory, - FileSystemDatasetFactory::Make(fs_, s, format_, options)); - ASSERT_OK_AND_ASSIGN(written_, factory->Finish()); - - expected_files_["/new_root/dat_0." 
+ format_->type_name()] = R"([ + expected_files_["/new_root/dat_0"] = R"([ {"country": "US", "region": "NY", "year": 2018, "month": 1, "model": "3", "sales": 742.0}, {"country": "US", "region": "NY", "year": 2018, "month": 1, "model": "S", "sales": 304.125}, - {"country": "US", "region": "NY", "year": 2018, "month": 1, "model": "Y", "sales": 27.5} - ])"; - expected_files_["/new_root/dat_1." + format_->type_name()] = R"([ + {"country": "US", "region": "NY", "year": 2018, "month": 1, "model": "Y", "sales": 27.5}, {"country": "CA", "region": "QC", "year": 2018, "month": 1, "model": "3", "sales": 512}, {"country": "CA", "region": "QC", "year": 2018, "month": 1, "model": "S", "sales": 978}, {"country": "US", "region": "NY", "year": 2018, "month": 1, "model": "X", "sales": 136.25}, {"country": "CA", "region": "QC", "year": 2018, "month": 1, "model": "X", "sales": 1.0}, - {"country": "CA", "region": "QC", "year": 2018, "month": 1, "model": "Y", "sales": 69} - ])"; - expected_files_["/new_root/dat_2." + format_->type_name()] = R"([ + {"country": "CA", "region": "QC", "year": 2018, "month": 1, "model": "Y", "sales": 69}, {"country": "US", "region": "CA", "year": 2019, "month": 1, "model": "3", "sales": 273.5}, {"country": "US", "region": "CA", "year": 2019, "month": 1, "model": "S", "sales": 13}, {"country": "US", "region": "CA", "year": 2019, "month": 1, "model": "X", "sales": 54}, {"country": "CA", "region": "QC", "year": 2019, "month": 1, "model": "S", "sales": 10}, - {"country": "US", "region": "CA", "year": 2019, "month": 1, "model": "Y", "sales": 21} - ])"; - expected_files_["/new_root/dat_3." + format_->type_name()] = R"([ + {"country": "US", "region": "CA", "year": 2019, "month": 1, "model": "Y", "sales": 21}, {"country": "CA", "region": "QC", "year": 2019, "month": 1, "model": "3", "sales": 152.25}, {"country": "CA", "region": "QC", "year": 2019, "month": 1, "model": "X", "sales": 42}, {"country": "CA", "region": "QC", "year": 2019, "month": 1, "model": "Y", "sales": 37} @@ -765,12 +730,14 @@ class WriteFileSystemDatasetMixin : public MakeFileSystemDatasetMixin { } void AssertWrittenAsExpected() { - std::vector files; + std::unordered_set expected_paths, actual_paths; for (const auto& file_contents : expected_files_) { - files.push_back(file_contents.first); + expected_paths.insert(file_contents.first); + } + for (auto path : checked_pointer_cast(written_)->files()) { + actual_paths.insert(std::move(path)); } - EXPECT_THAT(checked_pointer_cast(written_)->files(), - testing::UnorderedElementsAreArray(files)); + EXPECT_THAT(actual_paths, testing::UnorderedElementsAreArray(expected_paths)); for (auto maybe_fragment : written_->GetFragments()) { ASSERT_OK_AND_ASSIGN(auto fragment, maybe_fragment); @@ -779,12 +746,13 @@ class WriteFileSystemDatasetMixin : public MakeFileSystemDatasetMixin { AssertSchemaEqual(*expected_physical_schema_, *actual_physical_schema, check_metadata_); - AssertSchemaEqual(*expected_physical_schema_, *actual_physical_schema); - const auto& path = checked_pointer_cast(fragment)->source().path(); - auto expected_struct = ArrayFromJSON(struct_(expected_physical_schema_->fields()), - {expected_files_[path]}); + auto file_contents = expected_files_.find(path); + if (file_contents == expected_files_.end()) { + // file wasn't expected to be written at all; nothing to compare with + continue; + } ASSERT_OK_AND_ASSIGN(auto scanner, ScannerBuilder(actual_physical_schema, fragment, std::make_shared()) @@ -799,6 +767,9 @@ class WriteFileSystemDatasetMixin : public 
MakeFileSystemDatasetMixin { ASSERT_OK_AND_ASSIGN(actual_struct, batch->ToStructArray()); } + auto expected_struct = ArrayFromJSON(struct_(expected_physical_schema_->fields()), + {file_contents->second}); + AssertArraysEqual(*expected_struct, *actual_struct, /*verbose=*/true); } } @@ -809,6 +780,9 @@ class WriteFileSystemDatasetMixin : public MakeFileSystemDatasetMixin { PathAndContent expected_files_; std::shared_ptr expected_physical_schema_; std::shared_ptr written_; + FileSystemDatasetWriteOptions write_options_; + std::shared_ptr scan_options_; + std::shared_ptr scan_context_ = std::make_shared(); }; } // namespace dataset diff --git a/cpp/src/arrow/dataset/type_fwd.h b/cpp/src/arrow/dataset/type_fwd.h index b7604f6bddd..e329b072ed7 100644 --- a/cpp/src/arrow/dataset/type_fwd.h +++ b/cpp/src/arrow/dataset/type_fwd.h @@ -48,16 +48,23 @@ using FragmentVector = std::vector>; class FileSource; class FileFormat; class FileFragment; +class FileWriter; +class FileWriteOptions; class FileSystemDataset; +struct FileSystemDatasetWriteOptions; class InMemoryDataset; class CsvFileFormat; class IpcFileFormat; +class IpcFileWriter; +class IpcFileWriteOptions; class ParquetFileFormat; class ParquetFileFragment; +class ParquetFileWriter; +class ParquetFileWriteOptions; class Expression; using ExpressionVector = std::vector>; diff --git a/cpp/src/arrow/util/mutex.cc b/cpp/src/arrow/util/mutex.cc index fa900fd164b..e1b7700ec1b 100644 --- a/cpp/src/arrow/util/mutex.cc +++ b/cpp/src/arrow/util/mutex.cc @@ -19,6 +19,8 @@ #include +#include "arrow/util/logging.h" + namespace arrow { namespace util { @@ -29,7 +31,14 @@ struct Mutex::Impl { Mutex::Guard::Guard(Mutex* locked) : locked_(locked, [](Mutex* locked) { locked->impl_->mutex_.unlock(); }) {} +void Mutex::Guard::Unlock() { + if (locked_) { + locked_->impl_->mutex_.unlock(); + } +} + Mutex::Guard Mutex::TryLock() { + DCHECK_NE(impl_, nullptr); if (impl_->mutex_.try_lock()) { return Guard{this}; } @@ -37,6 +46,7 @@ Mutex::Guard Mutex::TryLock() { } Mutex::Guard Mutex::Lock() { + DCHECK_NE(impl_, nullptr); impl_->mutex_.lock(); return Guard{this}; } diff --git a/cpp/src/arrow/util/mutex.h b/cpp/src/arrow/util/mutex.h index a0365b710fb..8a430266431 100644 --- a/cpp/src/arrow/util/mutex.h +++ b/cpp/src/arrow/util/mutex.h @@ -31,6 +31,8 @@ namespace util { class ARROW_EXPORT Mutex { public: Mutex(); + Mutex(Mutex&&) = default; + Mutex& operator=(Mutex&&) = default; /// A Guard is falsy if a lock could not be acquired. 
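+  ///
+  /// A minimal usage sketch:
+  ///
+  ///   util::Mutex mutex;
+  ///   if (auto guard = mutex.TryLock()) {
+  ///     // lock acquired; it is released when guard goes out of scope
+  ///   }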
class Guard { @@ -41,6 +43,8 @@ class ARROW_EXPORT Mutex { explicit operator bool() const { return bool(locked_); } + void Unlock(); + private: explicit Guard(Mutex* locked); @@ -56,5 +60,30 @@ class ARROW_EXPORT Mutex { std::unique_ptr impl_; }; +/// A trivial Mutex, T pair +template +class Mutexed : Mutex { + public: + Mutexed() = default; + Mutexed(Mutexed&&) = default; + Mutexed& operator=(Mutexed&&) = default; + explicit Mutexed(T obj) : obj_(std::move(obj)) {} + + using Mutex::Lock; + using Mutex::TryLock; + + T& operator*() { return obj_; } + const T& operator*() const { return obj_; } + + T* operator->() { return &obj_; } + const T* operator->() const { return &obj_; } + + T& get() { return obj_; } + const T& get() const { return obj_; } + + private: + T obj_; +}; + } // namespace util } // namespace arrow diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index a8198b66d70..922d7b05bea 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -606,6 +606,29 @@ cdef shared_ptr[CExpression] _insert_implicit_casts(Expression filter, ) +cdef class FileWriteOptions(_Weakrefable): + + cdef: + shared_ptr[CFileWriteOptions] wrapped + CFileWriteOptions* options + + def __init__(self): + pass + + cdef void init(self, const shared_ptr[CFileWriteOptions]& sp): + self.wrapped = sp + self.options = sp.get() + + @staticmethod + cdef wrap(const shared_ptr[CFileWriteOptions]& sp): + cdef FileWriteOptions self = FileWriteOptions() + self.init(sp) + return self + + cdef inline shared_ptr[CFileWriteOptions] unwrap(self): + return self.wrapped + + cdef class FileFormat(_Weakrefable): cdef: @@ -662,6 +685,10 @@ cdef class FileFormat(_Weakrefable): nullptr)) return Fragment.wrap(move(c_fragment)) + @property + def default_write_options(self): + return FileWriteOptions.wrap(self.format.DefaultWriteOptions()) + def __eq__(self, other): try: return self.equals(other) @@ -1058,14 +1085,11 @@ cdef class ParquetFileFormat(FileFormat): cdef: CParquetFileFormat* parquet_format - object _write_options - def __init__(self, read_options=None, dict write_options=None): + def __init__(self, read_options=None): cdef: shared_ptr[CParquetFileFormat] wrapped CParquetFileFormatReaderOptions* options - shared_ptr[ArrowWriterProperties] arrow_properties - shared_ptr[WriterProperties] properties # Read options @@ -1087,33 +1111,6 @@ cdef class ParquetFileFormat(FileFormat): for column in read_options.dictionary_columns: options.dict_columns.insert(tobytes(column)) - # Write options - - self._write_options = {} - if write_options is not None: - self._write_options = write_options - properties = _create_writer_properties( - use_dictionary=write_options.get("use_dictionary", True), - compression=write_options.get("compression", "snappy"), - version=write_options.get("version", "1.0"), - write_statistics=write_options.get("write_statistics", None), - data_page_size=write_options.get("data_page_size", None), - compression_level=write_options.get("compression_level", None), - use_byte_stream_split=write_options.get( - "use_byte_stream_split", False), - data_page_version=write_options.get("data_page_version", "1.0") - ) - arrow_properties = _create_arrow_writer_properties( - use_deprecated_int96_timestamps=write_options.get( - "use_deprecated_int96_timestamps", False), - coerce_timestamps=write_options.get("coerce_timestamps", None), - allow_truncated_timestamps=write_options.get( - "allow_truncated_timestamps", False), - writer_engine_version="V2" - ) - wrapped.get().writer_properties = 
properties - wrapped.get().arrow_writer_properties = arrow_properties - self.init( wrapped) cdef void init(self, const shared_ptr[CFileFormat]& sp): @@ -1133,18 +1130,13 @@ cdef class ParquetFileFormat(FileFormat): enable_parallel_column_conversion=enable ) - @property - def write_options(self): - return self._write_options - def equals(self, ParquetFileFormat other): return ( - self.read_options.equals(other.read_options) and - self.write_options == other.write_options + self.read_options.equals(other.read_options) ) def __reduce__(self): - return ParquetFileFormat, (self.read_options, self.write_options) + return ParquetFileFormat, (self.read_options, ) def make_fragment(self, file, filesystem=None, Expression partition_expression=None, row_groups=None): @@ -2122,7 +2114,7 @@ def _get_partition_keys(Expression partition_expression): def _filesystemdataset_write( - data, object base_dir, Schema schema not None, + data, object base_dir, str basename_template, Schema schema not None, FileFormat format not None, FileSystem filesystem not None, Partitioning partitioning not None, bint use_threads=True, ): @@ -2130,39 +2122,27 @@ def _filesystemdataset_write( CFileSystemDataset.Write wrapper """ cdef: - c_string c_base_dir shared_ptr[CSchema] c_schema - shared_ptr[CFileFormat] c_format - shared_ptr[CFileSystem] c_filesystem - shared_ptr[CPartitioning] c_partitioning - shared_ptr[CScanContext] c_context - # to create iterator of InMemory fragments + CFileSystemDatasetWriteOptions c_options + shared_ptr[CScanner] c_scanner + + FileWriteOptions file_options vector[shared_ptr[CRecordBatch]] c_batches - shared_ptr[CFragment] c_fragment - vector[shared_ptr[CFragment]] c_fragment_vector - c_base_dir = tobytes(_stringify_path(base_dir)) + file_options = format.default_write_options + c_schema = pyarrow_unwrap_schema(schema) - c_format = format.unwrap() - c_filesystem = filesystem.unwrap() - c_partitioning = partitioning.unwrap() - c_context = _build_scan_context(use_threads=use_threads) + + c_options.file_write_options = file_options.unwrap() + c_options.filesystem = filesystem.unwrap() + c_options.base_dir = tobytes(_stringify_path(base_dir)) + c_options.partitioning = partitioning.unwrap() + c_options.basename_template = tobytes(basename_template) if isinstance(data, Dataset): - with nogil: - check_status( - CFileSystemDataset.Write( - c_schema, - c_format, - c_filesystem, - c_base_dir, - c_partitioning, - c_context, - ( data).dataset.GetFragments() - ) - ) + scanner = data._scanner(use_threads=use_threads) else: - # data is list of batches/tables, one element per fragment + # data is list of batches/tables for table in data: if isinstance(table, Table): for batch in table.to_batches(): @@ -2170,20 +2150,11 @@ def _filesystemdataset_write( else: c_batches.push_back(( table).sp_batch) - c_fragment = shared_ptr[CFragment]( - new CInMemoryFragment(c_batches, _true.unwrap())) - c_batches.clear() - c_fragment_vector.push_back(c_fragment) + data = Fragment.wrap(shared_ptr[CFragment]( + new CInMemoryFragment(move(c_batches), _true.unwrap()))) - with nogil: - check_status( - CFileSystemDataset.Write( - c_schema, - c_format, - c_filesystem, - c_base_dir, - c_partitioning, - c_context, - MakeVectorIterator(move(c_fragment_vector)) - ) - ) + scanner = Scanner.from_fragment(data, schema, use_threads=use_threads) + + c_scanner = ( scanner).unwrap() + with nogil: + check_status(CFileSystemDataset.Write(c_schema, c_options, c_scanner)) diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index 
19f3c549d56..702a50f1f35 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -694,7 +694,8 @@ def _ensure_write_partitioning(scheme): return scheme -def write_dataset(data, base_dir, format=None, partitioning=None, schema=None, +def write_dataset(data, base_dir, basename_template, format=None, + partitioning=None, schema=None, filesystem=None, use_threads=True): """ Write a dataset to a given format and partitioning. @@ -703,13 +704,13 @@ def write_dataset(data, base_dir, format=None, partitioning=None, schema=None, ---------- data : Dataset, Table/RecordBatch, or list of Table/RecordBatch The data to write. This can be a Dataset instance or - in-memory Arrow data. A Table or RecordBatch is written as a - single fragment (resulting in a single file, or multiple files if - split according to the `partitioning`). If you have a Table consisting - of multiple record batches, you can pass ``table.to_batches()`` to - handle each record batch as a separate fragment. + in-memory Arrow data. base_dir : str The root directory where to write the dataset. + basename_template : str + A template string used to generate basenames of written data files. + The token '{i}' will be replaced with an automatically incremented + integer. format : FileFormat or str The format in which to write the dataset. Currently supported: "ipc"/"feather". If a FileSystemDataset is being written and `format` @@ -750,5 +751,6 @@ def write_dataset(data, base_dir, format=None, partitioning=None, schema=None, filesystem, _ = _ensure_fs(filesystem) _filesystemdataset_write( - data, base_dir, schema, format, filesystem, partitioning, use_threads, + data, base_dir, basename_template, schema, format, + filesystem, partitioning, use_threads, ) diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index a1f7cd4aa4b..5128a8ea0b6 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -211,6 +211,10 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: # the generated C++ is compiled). CFileSource(...) 
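(Reviewer note, not part of the diff: the user-facing effect of the `pyarrow.dataset.write_dataset` change above is that `basename_template` becomes a required argument, with the '{i}' token replaced by an incrementing integer per written file, mirroring NextBasenameGenerator on the C++ side. A minimal sketch of the intended call as of this point in the series; the table contents and the output path are illustrative only and do not come from the patch:

    import pyarrow as pa
    import pyarrow.dataset as ds

    # Illustrative data and output directory (not taken from the patch).
    table = pa.table({"part": ["a", "a", "b"], "value": [1, 2, 3]})

    ds.write_dataset(
        table, "/tmp/demo_dataset",
        basename_template="dat_{i}.feather",  # '{i}' -> 0, 1, 2, ... per written file
        format="feather",
        partitioning=ds.partitioning(pa.schema([("part", pa.string())]),
                                     flavor="hive"),
    )

This matches the calls exercised by the updated tests in test_dataset.py further below.)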
+ cdef cppclass CFileWriteOptions \ + "arrow::dataset::FileWriteOptions": + pass + cdef cppclass CFileFormat "arrow::dataset::FileFormat": c_string type_name() const CResult[shared_ptr[CSchema]] Inspect(const CFileSource&) const @@ -218,6 +222,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: CFileSource source, shared_ptr[CExpression] partition_expression, shared_ptr[CSchema] physical_schema) + shared_ptr[CFileWriteOptions] DefaultWriteOptions() cdef cppclass CFileFragment "arrow::dataset::FileFragment"( CFragment): @@ -237,6 +242,10 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: @staticmethod vector[CRowGroupInfo] FromIdentifiers(vector[int]) + cdef cppclass CParquetFileWriteOptions \ + "arrow::dataset::ParquetFileWriteOptions": + pass + cdef cppclass CParquetFileFragment "arrow::dataset::ParquetFileFragment"( CFileFragment): const vector[CRowGroupInfo]& row_groups() const @@ -244,6 +253,14 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: shared_ptr[CExpression] predicate) CStatus EnsureCompleteMetadata() + cdef cppclass CFileSystemDatasetWriteOptions \ + "arrow::dataset::FileSystemDatasetWriteOptions": + shared_ptr[CFileWriteOptions] file_write_options + shared_ptr[CFileSystem] filesystem + c_string base_dir + shared_ptr[CPartitioning] partitioning + c_string basename_template + cdef cppclass CFileSystemDataset \ "arrow::dataset::FileSystemDataset"(CDataset): @staticmethod @@ -257,12 +274,8 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: @staticmethod CStatus Write( shared_ptr[CSchema] schema, - shared_ptr[CFileFormat] format, - shared_ptr[CFileSystem] filesystem, - c_string base_dir, - shared_ptr[CPartitioning] partitioning, - shared_ptr[CScanContext] scan_context, - CFragmentIterator fragments) + const CFileSystemDatasetWriteOptions& write_options, + shared_ptr[CScanner] scanner) c_string type() vector[c_string] files() diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 13549746755..128c58c2550 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -510,9 +510,6 @@ def test_file_format_pickling(): read_options={ 'use_buffered_stream': True, 'buffer_size': 4096, - }, - write_options={ - 'version': '2.0' } ) ] @@ -2155,7 +2152,8 @@ def _check_dataset_roundtrip(dataset, base_dir, expected_files, base_dir_path = base_dir_path or base_dir ds.write_dataset( - dataset, base_dir, format="feather", partitioning=partitioning + dataset, base_dir, basename_template='dat_{i}.feather', + format="feather", partitioning=partitioning, use_threads=False ) # check that all files are present @@ -2165,6 +2163,8 @@ def _check_dataset_roundtrip(dataset, base_dir, expected_files, # check that reading back in as dataset gives the same result dataset2 = ds.dataset( base_dir_path, format="feather", partitioning=partitioning) + print(dataset.to_table().to_pandas()) + print(dataset2.to_table().to_pandas()) assert dataset2.to_table().equals(dataset.to_table()) @@ -2178,12 +2178,12 @@ def test_write_dataset(tempdir): # full string path target = tempdir / 'single-file-target' - expected_files = [target / "dat_0.ipc"] + expected_files = [target / "dat_0.feather"] _check_dataset_roundtrip(dataset, str(target), expected_files, target) # pathlib path object target = tempdir / 'single-file-target2' - expected_files = [target / "dat_0.ipc"] + expected_files = [target / "dat_0.feather"] _check_dataset_roundtrip(dataset, target, 
expected_files, target) # TODO @@ -2200,7 +2200,7 @@ def test_write_dataset(tempdir): dataset = ds.dataset(directory) target = tempdir / 'single-directory-target' - expected_files = [target / "dat_0.ipc", target / "dat_1.ipc"] + expected_files = [target / "dat_0.feather"] _check_dataset_roundtrip(dataset, str(target), expected_files, target) @@ -2215,8 +2215,8 @@ def test_write_dataset_partitioned(tempdir): # hive partitioning target = tempdir / 'partitioned-hive-target' expected_paths = [ - target / "part=a", target / "part=a" / "dat_0.ipc", - target / "part=b", target / "part=b" / "dat_1.ipc" + target / "part=a", target / "part=a" / "dat_0.feather", + target / "part=b", target / "part=b" / "dat_1.feather" ] partitioning_schema = ds.partitioning( pa.schema([("part", pa.string())]), flavor="hive") @@ -2227,8 +2227,8 @@ def test_write_dataset_partitioned(tempdir): # directory partitioning target = tempdir / 'partitioned-dir-target' expected_paths = [ - target / "a", target / "a" / "dat_0.ipc", - target / "b", target / "b" / "dat_1.ipc" + target / "a", target / "a" / "dat_0.feather", + target / "b", target / "b" / "dat_1.feather" ] partitioning_schema = ds.partitioning( pa.schema([("part", pa.string())])) @@ -2249,20 +2249,15 @@ def test_write_dataset_use_threads(tempdir): target1 = tempdir / 'partitioned1' ds.write_dataset( - dataset, target1, format="feather", partitioning=partitioning, - use_threads=True + dataset, target1, basename_template='dat_{i}.feather', + format="feather", partitioning=partitioning, use_threads=True ) target2 = tempdir / 'partitioned2' ds.write_dataset( - dataset, target2, format="feather", partitioning=partitioning, - use_threads=False + dataset, target2, basename_template='dat_{i}.feather', + format="feather", partitioning=partitioning, use_threads=False ) - # check that all files are the same in both cases - paths1 = [p.relative_to(target1) for p in target1.rglob("*")] - paths2 = [p.relative_to(target2) for p in target2.rglob("*")] - assert set(paths1) == set(paths2) - # check that reading in gives same result result1 = ds.dataset(target1, format="feather", partitioning=partitioning) result2 = ds.dataset(target2, format="feather", partitioning=partitioning) @@ -2276,10 +2271,11 @@ def test_write_table(tempdir): ], names=["f1", "f2", "part"]) base_dir = tempdir / 'single' - ds.write_dataset(table, base_dir, format="feather") + ds.write_dataset(table, base_dir, + basename_template='dat_{i}.feather', format="feather") # check that all files are present file_paths = list(base_dir.rglob("*")) - expected_paths = [base_dir / "dat_0.ipc"] + expected_paths = [base_dir / "dat_0.feather"] assert set(file_paths) == set(expected_paths) # check Table roundtrip result = ds.dataset(base_dir, format="ipc").to_table() @@ -2289,12 +2285,13 @@ def test_write_table(tempdir): base_dir = tempdir / 'partitioned' partitioning = ds.partitioning( pa.schema([("part", pa.string())]), flavor="hive") - ds.write_dataset( - table, base_dir, format="feather", partitioning=partitioning) + ds.write_dataset(table, base_dir, + basename_template='dat_{i}.feather', + format="feather", partitioning=partitioning) file_paths = list(base_dir.rglob("*")) expected_paths = [ - base_dir / "part=a", base_dir / "part=a" / "dat_0.ipc", - base_dir / "part=b", base_dir / "part=b" / "dat_0.ipc" + base_dir / "part=a", base_dir / "part=a" / "dat_0.feather", + base_dir / "part=b", base_dir / "part=b" / "dat_1.feather" ] assert set(file_paths) == set(expected_paths) result = ds.dataset(base_dir, format="ipc", 
partitioning=partitioning) @@ -2310,28 +2307,32 @@ def test_write_table_multiple_fragments(tempdir): # Table with multiple batches written as single Fragment by default base_dir = tempdir / 'single' - ds.write_dataset(table, base_dir, format="feather") - assert set(base_dir.rglob("*")) == set([base_dir / "dat_0.ipc"]) + ds.write_dataset(table, base_dir, basename_template='dat_{i}.feather', + format="feather") + assert set(base_dir.rglob("*")) == set([base_dir / "dat_0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Same for single-element list of Table base_dir = tempdir / 'single-list' - ds.write_dataset([table], base_dir, format="feather") - assert set(base_dir.rglob("*")) == set([base_dir / "dat_0.ipc"]) + ds.write_dataset([table], base_dir, basename_template='dat_{i}.feather', + format="feather") + assert set(base_dir.rglob("*")) == set([base_dir / "dat_0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Provide list of batches to write multiple fragments base_dir = tempdir / 'multiple' - ds.write_dataset(table.to_batches(), base_dir, format="feather") + ds.write_dataset(table.to_batches(), base_dir, + basename_template='dat_{i}.feather', format="feather") assert set(base_dir.rglob("*")) == set( - [base_dir / "dat_0.ipc", base_dir / "dat_1.ipc"]) + [base_dir / "dat_0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Provide list of tables to write multiple fragments base_dir = tempdir / 'multiple-table' - ds.write_dataset([table, table], base_dir, format="feather") + ds.write_dataset([table, table], base_dir, + basename_template='dat_{i}.feather', format="feather") assert set(base_dir.rglob("*")) == set( - [base_dir / "dat_0.ipc", base_dir / "dat_1.ipc"]) + [base_dir / "dat_0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals( pa.concat_tables([table]*2) ) @@ -2349,7 +2350,8 @@ def test_write_dataset_parquet(tempdir): # using default "parquet" format string base_dir = tempdir / 'parquet_dataset' - ds.write_dataset(table, base_dir, format="parquet") + ds.write_dataset(table, base_dir, + basename_template='dat_{i}.parquet', format="parquet") # check that all files are present file_paths = list(base_dir.rglob("*")) expected_paths = [base_dir / "dat_0.parquet"] @@ -2359,10 +2361,10 @@ def test_write_dataset_parquet(tempdir): assert result.equals(table) # using custom options / format object - - for version in ["1.0", "2.0"]: - format = ds.ParquetFileFormat(write_options=dict(version=version)) - base_dir = tempdir / 'parquet_dataset_version{0}'.format(version) - ds.write_dataset(table, base_dir, format=format) - meta = pq.read_metadata(base_dir / "dat_0.parquet") - assert meta.format_version == version + if False: + for version in ["1.0", "2.0"]: + format = ds.ParquetFileFormat(write_options=dict(version=version)) + base_dir = tempdir / 'parquet_dataset_version{0}'.format(version) + ds.write_dataset(table, base_dir, format=format) + meta = pq.read_metadata(base_dir / "dat_0.parquet") + assert meta.format_version == version From e2c11992a8fffffd15072f381b8da85bb9aa9d7d Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 29 Sep 2020 16:23:55 -0700 Subject: [PATCH 02/34] Minimal hacking to get the R tests passing --- r/R/arrowExports.R | 8 ++++++-- r/R/dataset-scan.R | 9 ++++++++- r/R/dataset-write.R | 36 +++++++++++++++------------------ r/R/dataset.R | 10 +-------- r/man/Dataset.Rd | 3 --- r/man/write_dataset.Rd | 10 ++++----- r/src/arrowExports.cpp | 26 
+++++++++++++++++++----- r/src/dataset.cpp | 24 ++++++++++++++-------- r/tests/testthat/test-dataset.R | 4 ++-- 9 files changed, 75 insertions(+), 55 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index c059c761efe..af5b71bd436 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -464,12 +464,16 @@ dataset___Scanner__Scan <- function(scanner){ .Call(`_arrow_dataset___Scanner__Scan` , scanner) } +dataset___Scanner__schema <- function(sc){ + .Call(`_arrow_dataset___Scanner__schema` , sc) +} + dataset___ScanTask__get_batches <- function(scan_task){ .Call(`_arrow_dataset___ScanTask__get_batches` , scan_task) } -dataset___Dataset__Write <- function(ds, schema, format, filesystem, path, partitioning){ - invisible(.Call(`_arrow_dataset___Dataset__Write` , ds, schema, format, filesystem, path, partitioning)) +dataset___Dataset__Write <- function(scanner, schema, format, filesystem, path, partitioning){ + invisible(.Call(`_arrow_dataset___Dataset__Write` , scanner, schema, format, filesystem, path, partitioning)) } shared_ptr_is_null <- function(xp){ diff --git a/r/R/dataset-scan.R b/r/R/dataset-scan.R index 4aabd2c2bbf..ab542fee8f9 100644 --- a/r/R/dataset-scan.R +++ b/r/R/dataset-scan.R @@ -57,6 +57,9 @@ Scanner <- R6Class("Scanner", inherit = ArrowObject, public = list( ToTable = function() shared_ptr(Table, dataset___Scanner__ToTable(self)), Scan = function() map(dataset___Scanner__Scan(self), shared_ptr, class = ScanTask) + ), + active = list( + schema = function() shared_ptr(Schema, dataset___Scanner__schema(self)) ) ) Scanner$create <- function(dataset, @@ -65,7 +68,7 @@ Scanner$create <- function(dataset, use_threads = option_use_threads(), batch_size = NULL, ...) { - if (inherits(dataset, "arrow_dplyr_query") && inherits(dataset$.data, "Dataset")) { + if (inherits(dataset, "arrow_dplyr_query")) { return(Scanner$create( dataset$.data, dataset$selected_columns, @@ -74,7 +77,11 @@ Scanner$create <- function(dataset, ... )) } + if (inherits(dataset, c("data.frame", "RecordBatch", "Table"))) { + dataset <- InMemoryDataset$create(dataset) + } assert_is(dataset, "Dataset") + scanner_builder <- dataset$NewScan() if (use_threads) { scanner_builder$UseThreads() diff --git a/r/R/dataset-write.R b/r/R/dataset-write.R index 8cfd8dec527..b449186a9ea 100644 --- a/r/R/dataset-write.R +++ b/r/R/dataset-write.R @@ -26,18 +26,18 @@ #' `schema` and `partitioning` will be taken from the result of any `select()` #' and `group_by()` operations done on the dataset. Note that `filter()` queries #' are not currently supported, and `select`-ed columns may not be renamed. -#' @param path string path to a directory to write to (directory will be +#' @param path string path or URI to a directory to write to (directory will be #' created if it does not exist) #' @param format file format to write the dataset to. Currently supported #' formats are "feather" (aka "ipc") and "parquet". Default is to write to the #' same format as `dataset`. -#' @param schema [Schema] containing a subset of columns, possibly reordered, -#' in `dataset`. Default is `dataset$schema`, i.e. all columns. #' @param partitioning `Partitioning` or a character vector of columns to #' use as partition keys (to be written as path segments). Default is to #' use the current `group_by()` columns. #' @param hive_style logical: write partition segments as Hive-style #' (`key1=value1/key2=value2/file.ext`) or as just bare values. Default is `TRUE`. 
+#' @param filesystem A [FileSystem] where the dataset should be written if it is a +#' string file path; default is the local file system #' @param ... additional format-specific arguments. For available Parquet #' options, see [write_parquet()]. #' @return The input `dataset`, invisibly @@ -45,12 +45,11 @@ write_dataset <- function(dataset, path, format = dataset$format$type, - schema = dataset$schema, partitioning = dplyr::group_vars(dataset), hive_style = TRUE, + filesystem = NULL, ...) { if (inherits(dataset, "arrow_dplyr_query")) { - force(partitioning) # get the group_vars before we drop the object # Check for a filter if (!isTRUE(dataset$filtered_rows)) { # TODO: @@ -63,23 +62,18 @@ write_dataset <- function(dataset, stop("Renaming columns when writing a dataset is not yet supported", call. = FALSE) } dataset <- ensure_group_vars(dataset) - schema <- dataset$.data$schema[dataset$selected_columns] } - dataset <- dataset$.data } - if (inherits(dataset, c("data.frame", "RecordBatch", "Table"))) { - force(partitioning) # get the group_vars before we replace the object - if (inherits(dataset, "grouped_df")) { - # Drop the grouping metadata before writing; we've already consumed it - # now to construct `partitioning` and don't want it in the metadata$r - dataset <- dplyr::ungroup(dataset) - } - dataset <- InMemoryDataset$create(dataset) - } - if (!inherits(dataset, "Dataset")) { - stop("'dataset' must be a Dataset, not ", class(dataset)[1], call. = FALSE) + if (inherits(dataset, "grouped_df")) { + force(partitioning) + # Drop the grouping metadata before writing; we've already consumed it + # now to construct `partitioning` and don't want it in the metadata$r + dataset <- dplyr::ungroup(dataset) } + scanner <- Scanner$create(dataset) + schema <- scanner$schema + if (!inherits(format, "FileFormat")) { if (identical(format, "parquet")) { # We have to do some special massaging of properties @@ -93,12 +87,14 @@ write_dataset <- function(dataset, if (!inherits(partitioning, "Partitioning")) { # TODO: tidyselect? - partition_schema <- dataset$schema[partitioning] + partition_schema <- schema[partitioning] if (isTRUE(hive_style)) { partitioning <- HivePartitioning$create(partition_schema) } else { partitioning <- DirectoryPartitioning$create(partition_schema) } } - dataset$write(path, format = format, partitioning = partitioning, schema = schema, ...) + path_and_fs <- get_path_and_filesystem(path, filesystem) + + dataset___Dataset__Write(scanner, schema, path = path_and_fs$path, format = format, partitioning = partitioning, filesystem = path_and_fs$fs) } diff --git a/r/R/dataset.R b/r/R/dataset.R index 7661c33292e..7b1d6609295 100644 --- a/r/R/dataset.R +++ b/r/R/dataset.R @@ -133,9 +133,6 @@ open_dataset <- function(sources, #' may also replace the dataset's schema by using `ds$schema <- new_schema`. #' This method currently supports only adding, removing, or reordering #' fields in the schema: you cannot alter or cast the field types. -#' - `$write(path, filesystem, schema, format, partitioning, ...)`: writes the -#' dataset to `path` in the `format` file format, partitioned by `partitioning`, -#' and invisibly returns `self`. See [write_dataset()]. 
#' #' `FileSystemDataset` has the following methods: #' - `$files`: Active binding, returns the files of the `FileSystemDataset` @@ -162,12 +159,7 @@ Dataset <- R6Class("Dataset", inherit = ArrowObject, # Start a new scan of the data # @return A [ScannerBuilder] NewScan = function() unique_ptr(ScannerBuilder, dataset___Dataset__NewScan(self)), - ToString = function() self$schema$ToString(), - write = function(path, filesystem = NULL, schema = self$schema, format, partitioning, ...) { - path_and_fs <- get_path_and_filesystem(path, filesystem) - dataset___Dataset__Write(self, schema, format, path_and_fs$fs, path_and_fs$path, partitioning) - invisible(self) - } + ToString = function() self$schema$ToString() ), active = list( schema = function(schema) { diff --git a/r/man/Dataset.Rd b/r/man/Dataset.Rd index 056b7d529e2..3c9a314195d 100644 --- a/r/man/Dataset.Rd +++ b/r/man/Dataset.Rd @@ -61,9 +61,6 @@ A \code{Dataset} has the following methods: may also replace the dataset's schema by using \code{ds$schema <- new_schema}. This method currently supports only adding, removing, or reordering fields in the schema: you cannot alter or cast the field types. -\item \verb{$write(path, filesystem, schema, format, partitioning, ...)}: writes the -dataset to \code{path} in the \code{format} file format, partitioned by \code{partitioning}, -and invisibly returns \code{self}. See \code{\link[=write_dataset]{write_dataset()}}. } \code{FileSystemDataset} has the following methods: diff --git a/r/man/write_dataset.Rd b/r/man/write_dataset.Rd index bb020b6ce68..8133b38479d 100644 --- a/r/man/write_dataset.Rd +++ b/r/man/write_dataset.Rd @@ -8,9 +8,9 @@ write_dataset( dataset, path, format = dataset$format$type, - schema = dataset$schema, partitioning = dplyr::group_vars(dataset), hive_style = TRUE, + filesystem = NULL, ... ) } @@ -21,16 +21,13 @@ write_dataset( and \code{group_by()} operations done on the dataset. Note that \code{filter()} queries are not currently supported, and \code{select}-ed columns may not be renamed.} -\item{path}{string path to a directory to write to (directory will be +\item{path}{string path or URI to a directory to write to (directory will be created if it does not exist)} \item{format}{file format to write the dataset to. Currently supported formats are "feather" (aka "ipc") and "parquet". Default is to write to the same format as \code{dataset}.} -\item{schema}{\link{Schema} containing a subset of columns, possibly reordered, -in \code{dataset}. Default is \code{dataset$schema}, i.e. all columns.} - \item{partitioning}{\code{Partitioning} or a character vector of columns to use as partition keys (to be written as path segments). Default is to use the current \code{group_by()} columns.} @@ -38,6 +35,9 @@ use the current \code{group_by()} columns.} \item{hive_style}{logical: write partition segments as Hive-style (\code{key1=value1/key2=value2/file.ext}) or as just bare values. Default is \code{TRUE}.} +\item{filesystem}{A \link{FileSystem} where the dataset should be written if it is a +string file path; default is the local file system} + \item{...}{additional format-specific arguments. 
For available Parquet options, see \code{\link[=write_parquet]{write_parquet()}}.} } diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 999917188b1..2b4d0723879 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1821,6 +1821,21 @@ extern "C" SEXP _arrow_dataset___Scanner__Scan(SEXP scanner_sexp){ } #endif +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr dataset___Scanner__schema(const std::shared_ptr& sc); +extern "C" SEXP _arrow_dataset___Scanner__schema(SEXP sc_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type sc(sc_sexp); + return cpp11::as_sexp(dataset___Scanner__schema(sc)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_dataset___Scanner__schema(SEXP sc_sexp){ + Rf_error("Cannot call dataset___Scanner__schema(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // dataset.cpp #if defined(ARROW_R_WITH_ARROW) std::vector> dataset___ScanTask__get_batches(const std::shared_ptr& scan_task); @@ -1838,21 +1853,21 @@ extern "C" SEXP _arrow_dataset___ScanTask__get_batches(SEXP scan_task_sexp){ // dataset.cpp #if defined(ARROW_R_WITH_ARROW) -void dataset___Dataset__Write(const std::shared_ptr& ds, const std::shared_ptr& schema, const std::shared_ptr& format, const std::shared_ptr& filesystem, std::string path, const std::shared_ptr& partitioning); -extern "C" SEXP _arrow_dataset___Dataset__Write(SEXP ds_sexp, SEXP schema_sexp, SEXP format_sexp, SEXP filesystem_sexp, SEXP path_sexp, SEXP partitioning_sexp){ +void dataset___Dataset__Write(const std::shared_ptr& scanner, const std::shared_ptr& schema, const std::shared_ptr& format, const std::shared_ptr& filesystem, std::string path, const std::shared_ptr& partitioning); +extern "C" SEXP _arrow_dataset___Dataset__Write(SEXP scanner_sexp, SEXP schema_sexp, SEXP format_sexp, SEXP filesystem_sexp, SEXP path_sexp, SEXP partitioning_sexp){ BEGIN_CPP11 - arrow::r::Input&>::type ds(ds_sexp); + arrow::r::Input&>::type scanner(scanner_sexp); arrow::r::Input&>::type schema(schema_sexp); arrow::r::Input&>::type format(format_sexp); arrow::r::Input&>::type filesystem(filesystem_sexp); arrow::r::Input::type path(path_sexp); arrow::r::Input&>::type partitioning(partitioning_sexp); - dataset___Dataset__Write(ds, schema, format, filesystem, path, partitioning); + dataset___Dataset__Write(scanner, schema, format, filesystem, path, partitioning); return R_NilValue; END_CPP11 } #else -extern "C" SEXP _arrow_dataset___Dataset__Write(SEXP ds_sexp, SEXP schema_sexp, SEXP format_sexp, SEXP filesystem_sexp, SEXP path_sexp, SEXP partitioning_sexp){ +extern "C" SEXP _arrow_dataset___Dataset__Write(SEXP scanner_sexp, SEXP schema_sexp, SEXP format_sexp, SEXP filesystem_sexp, SEXP path_sexp, SEXP partitioning_sexp){ Rf_error("Cannot call dataset___Dataset__Write(). Please use arrow::install_arrow() to install required runtime libraries. 
"); } #endif @@ -6241,6 +6256,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_dataset___Scanner__ToTable", (DL_FUNC) &_arrow_dataset___Scanner__ToTable, 1}, { "_arrow_dataset___Scanner__head", (DL_FUNC) &_arrow_dataset___Scanner__head, 2}, { "_arrow_dataset___Scanner__Scan", (DL_FUNC) &_arrow_dataset___Scanner__Scan, 1}, + { "_arrow_dataset___Scanner__schema", (DL_FUNC) &_arrow_dataset___Scanner__schema, 1}, { "_arrow_dataset___ScanTask__get_batches", (DL_FUNC) &_arrow_dataset___ScanTask__get_batches, 1}, { "_arrow_dataset___Dataset__Write", (DL_FUNC) &_arrow_dataset___Dataset__Write, 6}, { "_arrow_shared_ptr_is_null", (DL_FUNC) &_arrow_shared_ptr_is_null, 1}, diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp index c041e114d2a..349fde47db2 100644 --- a/r/src/dataset.cpp +++ b/r/src/dataset.cpp @@ -192,8 +192,8 @@ std::shared_ptr dataset___ParquetFileFormat__MakeWrite( const std::shared_ptr& arrow_props) { auto fmt = std::make_shared(); - fmt->writer_properties = writer_props; - fmt->arrow_writer_properties = arrow_props; + // fmt->writer_properties = writer_props; + // fmt->arrow_writer_properties = arrow_props; return fmt; } @@ -316,6 +316,12 @@ std::vector> dataset___Scanner__Scan( return out; } +// [[arrow::export]] +std::shared_ptr dataset___Scanner__schema( + const std::shared_ptr& sc) { + return sc->schema(); +} + // [[arrow::export]] std::vector> dataset___ScanTask__get_batches( const std::shared_ptr& scan_task) { @@ -331,17 +337,19 @@ std::vector> dataset___ScanTask__get_batches } // [[arrow::export]] -void dataset___Dataset__Write(const std::shared_ptr& ds, +void dataset___Dataset__Write(const std::shared_ptr& scanner, const std::shared_ptr& schema, const std::shared_ptr& format, const std::shared_ptr& filesystem, std::string path, const std::shared_ptr& partitioning) { - auto frags = ds->GetFragments(); - auto ctx = std::make_shared(); - ctx->use_threads = true; - StopIfNotOk(ds::FileSystemDataset::Write(schema, format, filesystem, path, partitioning, - ctx, std::move(frags))); + ds::FileSystemDatasetWriteOptions opts; + opts.file_write_options = format->DefaultWriteOptions(); + opts.filesystem = filesystem; + opts.base_dir = path; + opts.partitioning = partitioning; + opts.basename_template = "dat_{i}.feather"; + StopIfNotOk(ds::FileSystemDataset::Write(schema, opts, scanner)); return; } diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 327d0394771..5c18a8c793d 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -870,7 +870,7 @@ test_that("Dataset writing: no partitioning", { dst_dir <- tempfile() write_dataset(ds, dst_dir, format = "feather", partitioning = NULL) expect_true(dir.exists(dst_dir)) - expect_true(length(dir(dst_dir)) > 1) + expect_true(length(dir(dst_dir)) > 0) }) test_that("Dataset writing: from data.frame", { @@ -965,7 +965,7 @@ test_that("Writing a dataset: Parquet format options", { }) test_that("Dataset writing: unsupported features/input validation", { - expect_error(write_dataset(4), "'dataset' must be a Dataset") + expect_error(write_dataset(4), 'dataset must be a "Dataset"') ds <- open_dataset(hive_dir) From 9eea1bdb6318b26d7ef30538b32b5212f1744c7c Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 29 Sep 2020 20:29:08 -0400 Subject: [PATCH 03/34] fix Scanner splitting --- cpp/src/arrow/dataset/file_base.cc | 10 +++++++--- cpp/src/arrow/util/mutex.h | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/dataset/file_base.cc 
b/cpp/src/arrow/dataset/file_base.cc index 90d270a4b30..8611a32ac32 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -260,10 +260,14 @@ Status FileSystemDataset::Write(std::shared_ptr schema, ScanTaskVector scan_tasks; std::vector fragment_for_task; + // Avoid contention with multithreaded readers + auto context = std::make_shared(*scanner->context()); + context->use_threads = false; + for (const auto& fragment : fragments) { - ARROW_ASSIGN_OR_RAISE( - auto scan_task_it, - Scanner(fragment, scanner->options(), scanner->context()).Scan()); + auto options = std::make_shared(*scanner->options()); + ARROW_ASSIGN_OR_RAISE(auto scan_task_it, + Scanner(fragment, std::move(options), context).Scan()); for (auto maybe_scan_task : scan_task_it) { ARROW_ASSIGN_OR_RAISE(auto scan_task, maybe_scan_task); scan_tasks.push_back(std::move(scan_task)); diff --git a/cpp/src/arrow/util/mutex.h b/cpp/src/arrow/util/mutex.h index 8a430266431..7e72bacc69a 100644 --- a/cpp/src/arrow/util/mutex.h +++ b/cpp/src/arrow/util/mutex.h @@ -35,7 +35,7 @@ class ARROW_EXPORT Mutex { Mutex& operator=(Mutex&&) = default; /// A Guard is falsy if a lock could not be acquired. - class Guard { + class ARROW_EXPORT Guard { public: Guard() : locked_(NULLPTR, [](Mutex* mutex) {}) {} Guard(Guard&&) = default; From c263f6c8fd3ffc3e112ea09e9c51a9f6eaaa4e7c Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 29 Sep 2020 20:44:39 -0400 Subject: [PATCH 04/34] remove debug print()s --- python/pyarrow/tests/test_dataset.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 128c58c2550..c6081c1087c 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -2163,8 +2163,6 @@ def _check_dataset_roundtrip(dataset, base_dir, expected_files, # check that reading back in as dataset gives the same result dataset2 = ds.dataset( base_dir_path, format="feather", partitioning=partitioning) - print(dataset.to_table().to_pandas()) - print(dataset2.to_table().to_pandas()) assert dataset2.to_table().equals(dataset.to_table()) From e70dd9d94622d8b96cdbc3237f2bd0f7c6fb86e4 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 30 Sep 2020 17:13:16 -0400 Subject: [PATCH 05/34] don't double unlock std::mutex --- cpp/src/arrow/util/mutex.cc | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/util/mutex.cc b/cpp/src/arrow/util/mutex.cc index e1b7700ec1b..9fc7d1c11c1 100644 --- a/cpp/src/arrow/util/mutex.cc +++ b/cpp/src/arrow/util/mutex.cc @@ -24,22 +24,24 @@ namespace arrow { namespace util { -struct Mutex::Impl { - std::mutex mutex_; -}; +struct Mutex::Impl : std::mutex {}; Mutex::Guard::Guard(Mutex* locked) - : locked_(locked, [](Mutex* locked) { locked->impl_->mutex_.unlock(); }) {} + : locked_(locked, [](Mutex* locked) { + DCHECK(!locked->impl_->try_lock()); + locked->impl_->unlock(); + }) {} void Mutex::Guard::Unlock() { if (locked_) { - locked_->impl_->mutex_.unlock(); + auto locked = std::move(locked_); + DCHECK_EQ(locked_, nullptr); } } Mutex::Guard Mutex::TryLock() { DCHECK_NE(impl_, nullptr); - if (impl_->mutex_.try_lock()) { + if (impl_->try_lock()) { return Guard{this}; } return Guard{}; @@ -47,7 +49,7 @@ Mutex::Guard Mutex::TryLock() { Mutex::Guard Mutex::Lock() { DCHECK_NE(impl_, nullptr); - impl_->mutex_.lock(); + impl_->lock(); return Guard{this}; } From fa97c5264e8eaea01fcaf2f3e5402ccf6d513c6c Mon Sep 17 00:00:00 2001 From: 
Benjamin Kietzman Date: Thu, 1 Oct 2020 10:49:26 -0400 Subject: [PATCH 06/34] extract a helper for single-lookup insertion into maps --- cpp/src/arrow/dataset/file_base.cc | 52 +++++++++++++----------- cpp/src/arrow/util/map.h | 64 ++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 24 deletions(-) create mode 100644 cpp/src/arrow/util/map.h diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index 8611a32ac32..a36ff9d411b 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -31,6 +31,7 @@ #include "arrow/io/memory.h" #include "arrow/util/iterator.h" #include "arrow/util/logging.h" +#include "arrow/util/map.h" #include "arrow/util/mutex.h" #include "arrow/util/task_group.h" @@ -195,34 +196,37 @@ struct WriterSet { write_options_.partitioning->Format(partition_expression)); std::string dir = base_dir_ + part_segments; - auto lock = dir_to_writer_.Lock(); + util::Mutex::Guard writer_lock; - auto it_success = - dir_to_writer_->emplace(std::move(dir), std::make_shared()); - std::shared_ptr writer = it_success.first->second; + auto set_lock = mutex_.Lock(); - if (!it_success.second) { - return writer; + auto writer = + internal::GetOrInsertGenerated(&dir_to_writer_, dir, [&](const std::string&) { + auto writer = std::make_shared(); + writer_lock = writer->Lock(); + return writer; + })->second; + + if (writer_lock) { + // NB: next_basename_() must be invoked with the set_lock held + auto path = fs::internal::ConcatAbstractPath(dir, next_basename_()); + set_lock.Unlock(); + + RETURN_NOT_OK(write_options_.filesystem->CreateDir(dir)); + + ARROW_ASSIGN_OR_RAISE(auto destination, + write_options_.filesystem->OpenOutputStream(path)); + + ARROW_ASSIGN_OR_RAISE(**writer, write_options_.format()->MakeWriter( + std::move(destination), schema, + write_options_.file_write_options)); } - // FIXME Flush writers when at capacity for open files. 
- - std::string basename = next_basename_(); - auto writer_lock = writer->Lock(); - lock.Unlock(); - - dir = base_dir_ + part_segments; - RETURN_NOT_OK(write_options_.filesystem->CreateDir(dir)); - ARROW_ASSIGN_OR_RAISE(auto destination, - write_options_.filesystem->OpenOutputStream( - fs::internal::ConcatAbstractPath(dir, basename))); - ARROW_ASSIGN_OR_RAISE( - **writer, write_options_.format()->MakeWriter(std::move(destination), schema, - write_options_.file_write_options)); + return writer; } Status FinishAll(internal::TaskGroup* task_group) { - for (const auto& dir_writer : *dir_to_writer_) { + for (const auto& dir_writer : dir_to_writer_) { task_group->Append([&] { std::shared_ptr writer = **dir_writer.second; return writer->Finish(); @@ -233,8 +237,8 @@ struct WriterSet { } // There should only be a single writer open for each partition directory at a time - util::Mutexed>> - dir_to_writer_; + util::Mutex mutex_; + std::unordered_map> dir_to_writer_; NextBasenameGenerator next_basename_; std::string base_dir_; const FileSystemDatasetWriteOptions& write_options_; @@ -313,7 +317,7 @@ Status FileSystemDataset::Write(std::shared_ptr schema, ARROW_ASSIGN_OR_RAISE(auto writer, writers.Get(partition_expression, groups.batches[i]->schema())); - pending_writes[i] = {writer, std::move(groups.batches[i])}; + pending_writes[i] = {std::move(writer), std::move(groups.batches[i])}; } for (size_t i = 0; !pending_writes.empty(); ++i) { diff --git a/cpp/src/arrow/util/map.h b/cpp/src/arrow/util/map.h new file mode 100644 index 00000000000..3872e972bd9 --- /dev/null +++ b/cpp/src/arrow/util/map.h @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/result.h" + +namespace arrow { +namespace internal { + +/// Helper providing single-lookup conditional insertion into std::map or +/// std::unordered_map. If `key` exists in the container, an iterator to that pair +/// will be returned. If `key` does not exist in the container, `gen(key)` will be +/// invoked and its return value inserted. 
+template +auto GetOrInsertGenerated(Map* map, Key&& key, Gen&& gen) + -> decltype(map->begin()->second = gen(map->begin()->first), map->begin()) { + decltype(gen(map->begin()->first)) placeholder; + + auto it_success = map->emplace(std::forward(key), std::move(placeholder)); + if (it_success.second) { + // insertion of placeholder succeeded, overwrite it with gen() + const auto& inserted_key = it_success.first->first; + auto* value = &it_success.first->second; + *value = gen(inserted_key); + } + return it_success.first; +} + +template +auto GetOrInsertGenerated(Map* map, Key&& key, Gen&& gen) + -> Resultbegin()->second = gen(map->begin()->first).ValueOrDie(), + map->begin())> { + decltype(gen(map->begin()->first).ValueOrDie()) placeholder; + + auto it_success = map->emplace(std::forward(key), std::move(placeholder)); + if (it_success.second) { + // insertion of placeholder succeeded, overwrite it with gen() + const auto& inserted_key = it_success.first->first; + auto* value = &it_success.first->second; + ARROW_ASSIGN_OR_RAISE(*value, gen(inserted_key)); + } + return it_success.first; +} + +} // namespace internal +} // namespace arrow + From 0815d6e8ba385d5a33f80dbc969608f99c076f18 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 1 Oct 2020 14:20:24 -0400 Subject: [PATCH 07/34] add a python binding for custom parquet write properties --- cpp/src/arrow/dataset/file_base.h | 2 + python/pyarrow/_dataset.pyx | 201 ++++++++++++++++++- python/pyarrow/dataset.py | 10 +- python/pyarrow/includes/libarrow_dataset.pxd | 13 +- python/pyarrow/tests/test_dataset.py | 17 +- 5 files changed, 218 insertions(+), 25 deletions(-) diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h index e8c84f4afb6..03b96004af6 100644 --- a/cpp/src/arrow/dataset/file_base.h +++ b/cpp/src/arrow/dataset/file_base.h @@ -256,6 +256,8 @@ class ARROW_DS_EXPORT FileWriteOptions { const std::shared_ptr& format() const { return format_; } + std::string type_name() const { return format_->type_name(); } + protected: explicit FileWriteOptions(std::shared_ptr format) : format_(std::move(format)) {} diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 922d7b05bea..77cefcd8d46 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -613,7 +613,7 @@ cdef class FileWriteOptions(_Weakrefable): CFileWriteOptions* options def __init__(self): - pass + _forbid_instantiation(self.__class__) cdef void init(self, const shared_ptr[CFileWriteOptions]& sp): self.wrapped = sp @@ -621,7 +621,18 @@ cdef class FileWriteOptions(_Weakrefable): @staticmethod cdef wrap(const shared_ptr[CFileWriteOptions]& sp): - cdef FileWriteOptions self = FileWriteOptions() + type_name = frombytes(sp.get().type_name()) + + classes = { + 'ipc': IpcFileWriteOptions, + 'parquet': ParquetFileWriteOptions, + } + + class_ = classes.get(type_name, None) + if class_ is None: + raise TypeError(type_name) + + cdef FileWriteOptions self = class_.__new__(class_) self.init(sp) return self @@ -685,8 +696,7 @@ cdef class FileFormat(_Weakrefable): nullptr)) return Fragment.wrap(move(c_fragment)) - @property - def default_write_options(self): + def make_write_options(self): return FileWriteOptions.wrap(self.format.DefaultWriteOptions()) def __eq__(self, other): @@ -1081,6 +1091,164 @@ cdef class ParquetReadOptions(_Weakrefable): return False +cdef class ParquetFileWriteOptions(FileWriteOptions): + + cdef: + CParquetFileWriteOptions* parquet_options + object _properties + + def update(self, **kwargs): + 
cdef CParquetFileWriteOptions* opts = self.parquet_options + + arrow_fields = { + "use_deprecated_int96_timestamps", + "coerce_timestamps", + "allow_truncated_timestamps", + } + + update = False + update_arrow = False + for name, value in kwargs.items(): + assert name in self._properties + self._properties[name] = value + if name in arrow_fields: + update_arrow = True + else: + update = True + + if update: + opts.writer_properties = _create_writer_properties( + use_dictionary=self.use_dictionary, + compression=self.compression, + version=self.version, + write_statistics=self.write_statistics, + data_page_size=self.data_page_size, + compression_level=self.compression_level, + use_byte_stream_split=self.use_byte_stream_split, + data_page_version=self.data_page_version, + ) + + if update_arrow: + opts.arrow_writer_properties = _create_arrow_writer_properties( + use_deprecated_int96_timestamps=( + self.use_deprecated_int96_timestamps + ), + coerce_timestamps=self.coerce_timestamps, + allow_truncated_timestamps=self.allow_truncated_timestamps, + writer_engine_version=self.writer_engine_version, + ) + + @property + def use_dictionary(self): + return self._properties['use_dictionary'] + + @use_dictionary.setter + def use_dictionary(self, use_dictionary): + self.update(use_dictionary=use_dictionary) + + @property + def compression(self): + return self._properties['compression'] + + @compression.setter + def compression(self, compression): + self.update(compression=compression) + + @property + def version(self): + return self._properties['version'] + + @version.setter + def version(self, version): + self.update(version=version) + + @property + def write_statistics(self): + return self._properties['write_statistics'] + + @write_statistics.setter + def write_statistics(self, write_statistics): + self.update(write_statistics=write_statistics) + + @property + def data_page_size(self): + return self._properties['data_page_size'] + + @data_page_size.setter + def data_page_size(self, data_page_size): + self.update(data_page_size=data_page_size) + + @property + def compression_level(self): + return self._properties['compression_level'] + + @compression_level.setter + def compression_level(self, compression_level): + self.update(compression_level=compression_level) + + @property + def use_byte_stream_split(self): + return self._properties['use_byte_stream_split'] + + @use_byte_stream_split.setter + def use_byte_stream_split(self, use_byte_stream_split): + self.update(use_byte_stream_split=use_byte_stream_split) + + @property + def data_page_version(self): + return self._properties['data_page_version'] + + @data_page_version.setter + def data_page_version(self, data_page_version): + self.update(data_page_version=data_page_version) + + @property + def use_deprecated_int96_timestamps(self): + return self._properties['use_deprecated_int96_timestamps'] + + @use_deprecated_int96_timestamps.setter + def use_deprecated_int96_timestamps(self, use_deprecated_int96_timestamps): + self.update( + use_deprecated_int96_timestamps=use_deprecated_int96_timestamps) + + @property + def coerce_timestamps(self): + return self._properties['coerce_timestamps'] + + @coerce_timestamps.setter + def coerce_timestamps(self, coerce_timestamps): + self.update(coerce_timestamps=coerce_timestamps) + + @property + def allow_truncated_timestamps(self): + return self._properties['allow_truncated_timestamps'] + + @allow_truncated_timestamps.setter + def allow_truncated_timestamps(self, allow_truncated_timestamps): + 
self.update(allow_truncated_timestamps=allow_truncated_timestamps) + + @property + def writer_engine_version(self): + return 'V2' + + cdef void init(self, const shared_ptr[CFileWriteOptions]& sp): + FileWriteOptions.init(self, sp) + self.parquet_options = sp.get() + self._properties = dict( + use_dictionary=True, + compression="snappy", + version="1.0", + write_statistics=None, + data_page_size=None, + compression_level=None, + use_byte_stream_split=False, + data_page_version="1.0", + use_deprecated_int96_timestamps=False, + coerce_timestamps=None, + allow_truncated_timestamps=False, + ) + + cdef class ParquetFileFormat(FileFormat): cdef: @@ -1121,15 +1289,21 @@ cdef class ParquetFileFormat(FileFormat): def read_options(self): cdef CParquetFileFormatReaderOptions* options options = &self.parquet_format.reader_options - enable = options.enable_parallel_column_conversion return ParquetReadOptions( use_buffered_stream=options.use_buffered_stream, buffer_size=options.buffer_size, dictionary_columns={frombytes(col) for col in options.dict_columns}, - enable_parallel_column_conversion=enable + enable_parallel_column_conversion=( + options.enable_parallel_column_conversion + ) ) + def make_write_options(self, **kwargs): + opts = FileFormat.make_write_options(self) + ( opts).update(**kwargs) + return opts + def equals(self, ParquetFileFormat other): return ( self.read_options.equals(other.read_options) @@ -1162,6 +1336,12 @@ cdef class ParquetFileFormat(FileFormat): return Fragment.wrap(move(c_fragment)) +cdef class IpcFileWriteOptions(FileWriteOptions): + + def __init__(self): + _forbid_instantiation(self.__class__) + + cdef class IpcFileFormat(FileFormat): def __init__(self): @@ -1187,6 +1367,9 @@ cdef class CsvFileFormat(FileFormat): FileFormat.init(self, sp) self.csv_format = sp.get() + def make_write_options(self): + raise NotImplemented("writing CSV datasets") + @property def parse_options(self): return ParseOptions.wrap(self.csv_format.parse_options) @@ -2116,7 +2299,8 @@ def _get_partition_keys(Expression partition_expression): def _filesystemdataset_write( data, object base_dir, str basename_template, Schema schema not None, FileFormat format not None, FileSystem filesystem not None, - Partitioning partitioning not None, bint use_threads=True, + Partitioning partitioning not None, FileWriteOptions file_options not None, + bint use_threads, ): """ CFileSystemDataset.Write wrapper @@ -2126,11 +2310,8 @@ def _filesystemdataset_write( CFileSystemDatasetWriteOptions c_options shared_ptr[CScanner] c_scanner - FileWriteOptions file_options vector[shared_ptr[CRecordBatch]] c_batches - file_options = format.default_write_options - c_schema = pyarrow_unwrap_schema(schema) c_options.file_write_options = file_options.unwrap() diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index 702a50f1f35..574fad9d325 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -32,13 +32,16 @@ FileSystemDataset, FileSystemDatasetFactory, FileSystemFactoryOptions, + FileWriteOptions, Fragment, HivePartitioning, IpcFileFormat, + IpcFileWriteOptions, ParquetDatasetFactory, ParquetFactoryOptions, ParquetFileFormat, ParquetFileFragment, + ParquetFileWriteOptions, ParquetReadOptions, Partitioning, PartitioningFactory, @@ -696,7 +699,7 @@ def _ensure_write_partitioning(scheme): def write_dataset(data, base_dir, basename_template, format=None, partitioning=None, schema=None, - filesystem=None, use_threads=True): + filesystem=None, file_options=None, use_threads=True): """ Write a dataset to 
a given format and partitioning. @@ -742,6 +745,9 @@ def write_dataset(data, base_dir, basename_template, format=None, ) format = _ensure_format(format) + if file_options is None: + file_options = format.make_write_options() + partitioning = _ensure_write_partitioning(partitioning) if filesystem is None: @@ -752,5 +758,5 @@ def write_dataset(data, base_dir, basename_template, format=None, _filesystemdataset_write( data, base_dir, basename_template, schema, format, - filesystem, partitioning, use_threads, + filesystem, partitioning, file_options, use_threads, ) diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index 5128a8ea0b6..2e5c9c72eb4 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -213,7 +213,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: cdef cppclass CFileWriteOptions \ "arrow::dataset::FileWriteOptions": - pass + c_string type_name() const cdef cppclass CFileFormat "arrow::dataset::FileFormat": c_string type_name() const @@ -243,8 +243,9 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: vector[CRowGroupInfo] FromIdentifiers(vector[int]) cdef cppclass CParquetFileWriteOptions \ - "arrow::dataset::ParquetFileWriteOptions": - pass + "arrow::dataset::ParquetFileWriteOptions"(CFileWriteOptions): + shared_ptr[WriterProperties] writer_properties + shared_ptr[ArrowWriterProperties] arrow_writer_properties cdef cppclass CParquetFileFragment "arrow::dataset::ParquetFileFragment"( CFileFragment): @@ -292,14 +293,16 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: cdef cppclass CParquetFileFormat "arrow::dataset::ParquetFileFormat"( CFileFormat): CParquetFileFormatReaderOptions reader_options - shared_ptr[WriterProperties] writer_properties - shared_ptr[ArrowWriterProperties] arrow_writer_properties CResult[shared_ptr[CFileFragment]] MakeFragment( CFileSource source, shared_ptr[CExpression] partition_expression, vector[CRowGroupInfo] row_groups, shared_ptr[CSchema] physical_schema) + cdef cppclass CIpcFileWriteOptions \ + "arrow::dataset::IpcFileWriteOptions"(CFileWriteOptions): + pass + cdef cppclass CIpcFileFormat "arrow::dataset::IpcFileFormat"( CFileFormat): pass diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index c6081c1087c..a7e46ae4ce1 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -2358,11 +2358,12 @@ def test_write_dataset_parquet(tempdir): result = ds.dataset(base_dir, format="parquet").to_table() assert result.equals(table) - # using custom options / format object - if False: - for version in ["1.0", "2.0"]: - format = ds.ParquetFileFormat(write_options=dict(version=version)) - base_dir = tempdir / 'parquet_dataset_version{0}'.format(version) - ds.write_dataset(table, base_dir, format=format) - meta = pq.read_metadata(base_dir / "dat_0.parquet") - assert meta.format_version == version + # using custom options + for version in ["1.0", "2.0"]: + format = ds.ParquetFileFormat() + opts = format.make_write_options(version=version) + base_dir = tempdir / 'parquet_dataset_version{0}'.format(version) + ds.write_dataset(table, base_dir, basename_template='dat_{i}.parquet', + format=format, file_options=opts) + meta = pq.read_metadata(base_dir / "dat_0.parquet") + assert meta.format_version == version From 2830bd0d746b8d55fce658a7e7c8c229c79321a4 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: 
Thu, 1 Oct 2020 16:27:55 -0400 Subject: [PATCH 08/34] remove unused schema parameter --- cpp/src/arrow/dataset/file_base.cc | 3 +-- cpp/src/arrow/dataset/file_base.h | 3 +-- cpp/src/arrow/dataset/test_util.h | 2 +- python/pyarrow/_dataset.pyx | 6 +----- python/pyarrow/includes/libarrow_dataset.pxd | 1 - r/man/FileWriteOptions.Rd | 9 +++++++++ 6 files changed, 13 insertions(+), 11 deletions(-) create mode 100644 r/man/FileWriteOptions.Rd diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index a36ff9d411b..9c121bca659 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -244,8 +244,7 @@ struct WriterSet { const FileSystemDatasetWriteOptions& write_options_; }; -Status FileSystemDataset::Write(std::shared_ptr schema, - const FileSystemDatasetWriteOptions& write_options, +Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_options, std::shared_ptr scanner) { auto task_group = scanner->context()->TaskGroup(); diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h index 03b96004af6..11f5a608a91 100644 --- a/cpp/src/arrow/dataset/file_base.h +++ b/cpp/src/arrow/dataset/file_base.h @@ -214,8 +214,7 @@ class ARROW_DS_EXPORT FileSystemDataset : public Dataset { std::vector> fragments); /// \brief Write a dataset. - static Status Write(std::shared_ptr schema, - const FileSystemDatasetWriteOptions& write_options, + static Status Write(const FileSystemDatasetWriteOptions& write_options, std::shared_ptr scanner); /// \brief Return the type name of the dataset. diff --git a/cpp/src/arrow/dataset/test_util.h b/cpp/src/arrow/dataset/test_util.h index 7822f52fe82..7251c39bffd 100644 --- a/cpp/src/arrow/dataset/test_util.h +++ b/cpp/src/arrow/dataset/test_util.h @@ -589,7 +589,7 @@ class WriteFileSystemDatasetMixin : public MakeFileSystemDatasetMixin { void DoWrite(std::shared_ptr desired_partitioning) { write_options_.partitioning = desired_partitioning; auto scanner = std::make_shared(dataset_, scan_options_, scan_context_); - ASSERT_OK(FileSystemDataset::Write(source_schema_, write_options_, scanner)); + ASSERT_OK(FileSystemDataset::Write(write_options_, scanner)); // re-discover the written dataset fs::FileSelector s; diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 77cefcd8d46..847b4b0d71e 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -2306,14 +2306,10 @@ def _filesystemdataset_write( CFileSystemDataset.Write wrapper """ cdef: - shared_ptr[CSchema] c_schema CFileSystemDatasetWriteOptions c_options shared_ptr[CScanner] c_scanner - vector[shared_ptr[CRecordBatch]] c_batches - c_schema = pyarrow_unwrap_schema(schema) - c_options.file_write_options = file_options.unwrap() c_options.filesystem = filesystem.unwrap() c_options.base_dir = tobytes(_stringify_path(base_dir)) @@ -2338,4 +2334,4 @@ def _filesystemdataset_write( c_scanner = ( scanner).unwrap() with nogil: - check_status(CFileSystemDataset.Write(c_schema, c_options, c_scanner)) + check_status(CFileSystemDataset.Write(c_options, c_scanner)) diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index 2e5c9c72eb4..3b5b70a37ba 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -274,7 +274,6 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: @staticmethod CStatus Write( - shared_ptr[CSchema] schema, const CFileSystemDatasetWriteOptions& 
write_options, shared_ptr[CScanner] scanner) diff --git a/r/man/FileWriteOptions.Rd b/r/man/FileWriteOptions.Rd new file mode 100644 index 00000000000..5c4d04456b4 --- /dev/null +++ b/r/man/FileWriteOptions.Rd @@ -0,0 +1,9 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataset-write.R +\name{FileWriteOptions} +\alias{FileWriteOptions} +\alias{ParquetFileWriteOptions} +\title{Format-specific write options} +\description{ +A \code{FileWriteOptions} holds write options specific to a \code{FileFormat}. +} From 9b8b8fce4ac4d89e63f69f9996b892f7e6ddfe02 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 1 Oct 2020 16:28:09 -0400 Subject: [PATCH 09/34] repair parquet write options in R --- r/NAMESPACE | 1 + r/R/arrowExports.R | 20 +++++++---- r/R/dataset-format.R | 20 +++++------ r/R/dataset-write.R | 54 ++++++++++++++++++++++++------ r/R/parquet.R | 1 + r/src/arrowExports.cpp | 76 ++++++++++++++++++++++++++++++------------ r/src/dataset.cpp | 47 +++++++++++++++----------- 7 files changed, 150 insertions(+), 69 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index a46f72ea2e6..c54acf7a824 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -159,6 +159,7 @@ export(MessageType) export(MetadataVersion) export(ParquetFileFormat) export(ParquetFileReader) +export(ParquetFileWriteOptions) export(ParquetFileWriter) export(ParquetReaderProperties) export(ParquetVersionType) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index af5b71bd436..932d95396e7 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -396,12 +396,20 @@ dataset___FileFormat__type_name <- function(format){ .Call(`_arrow_dataset___FileFormat__type_name` , format) } -dataset___ParquetFileFormat__MakeRead <- function(use_buffered_stream, buffer_size, dict_columns){ - .Call(`_arrow_dataset___ParquetFileFormat__MakeRead` , use_buffered_stream, buffer_size, dict_columns) +dataset___FileFormat__DefaultWriteOptions <- function(fmt){ + .Call(`_arrow_dataset___FileFormat__DefaultWriteOptions` , fmt) } -dataset___ParquetFileFormat__MakeWrite <- function(writer_props, arrow_props){ - .Call(`_arrow_dataset___ParquetFileFormat__MakeWrite` , writer_props, arrow_props) +dataset___ParquetFileFormat__Make <- function(use_buffered_stream, buffer_size, dict_columns){ + .Call(`_arrow_dataset___ParquetFileFormat__Make` , use_buffered_stream, buffer_size, dict_columns) +} + +dataset___FileWriteOptions__type_name <- function(options){ + .Call(`_arrow_dataset___FileWriteOptions__type_name` , options) +} + +dataset___ParquetFileWriteOptions__update <- function(options, writer_props, arrow_writer_props){ + invisible(.Call(`_arrow_dataset___ParquetFileWriteOptions__update` , options, writer_props, arrow_writer_props)) } dataset___IpcFileFormat__Make <- function(){ @@ -472,8 +480,8 @@ dataset___ScanTask__get_batches <- function(scan_task){ .Call(`_arrow_dataset___ScanTask__get_batches` , scan_task) } -dataset___Dataset__Write <- function(scanner, schema, format, filesystem, path, partitioning){ - invisible(.Call(`_arrow_dataset___Dataset__Write` , scanner, schema, format, filesystem, path, partitioning)) +dataset___Dataset__Write <- function(file_write_options, filesystem, base_dir, partitioning, basename_template, scanner){ + invisible(.Call(`_arrow_dataset___Dataset__Write` , file_write_options, filesystem, base_dir, partitioning, basename_template, scanner)) } shared_ptr_is_null <- function(xp){ diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R index 46e7ec4e649..c98e7b5d44e 100644 --- 
a/r/R/dataset-format.R +++ b/r/R/dataset-format.R @@ -61,6 +61,11 @@ FileFormat <- R6Class("FileFormat", inherit = ArrowObject, } else { self } + }, + make_write_options = function(...) { + options <- shared_ptr(FileWriteOptions, dataset___FileFormat__DefaultWriteOptions(self)) + options$update(...) + options } ), active = list( @@ -91,18 +96,9 @@ FileFormat$create <- function(format, ...) { ParquetFileFormat <- R6Class("ParquetFileFormat", inherit = FileFormat) ParquetFileFormat$create <- function(use_buffered_stream = FALSE, buffer_size = 8196, - dict_columns = character(0), - writer_properties = NULL, - arrow_writer_properties = NULL) { - if (is.null(writer_properties) && is.null(arrow_writer_properties)) { - shared_ptr(ParquetFileFormat, dataset___ParquetFileFormat__MakeRead( - use_buffered_stream, buffer_size, dict_columns)) - } else { - writer_properties = writer_properties %||% ParquetWriterProperties$create() - arrow_writer_properties = arrow_writer_properties %||% ParquetArrowWriterProperties$create() - shared_ptr(ParquetFileFormat, dataset___ParquetFileFormat__MakeWrite( - writer_properties, arrow_writer_properties)) - } + dict_columns = character(0)) { + shared_ptr(ParquetFileFormat, dataset___ParquetFileFormat__Make( + use_buffered_stream, buffer_size, dict_columns)) } #' @usage NULL diff --git a/r/R/dataset-write.R b/r/R/dataset-write.R index b449186a9ea..abc116ef126 100644 --- a/r/R/dataset-write.R +++ b/r/R/dataset-write.R @@ -44,8 +44,9 @@ #' @export write_dataset <- function(dataset, path, - format = dataset$format$type, + format = dataset$format, partitioning = dplyr::group_vars(dataset), + basename_template = paste("dat_{i}", format$type, sep="."), hive_style = TRUE, filesystem = NULL, ...) { @@ -75,14 +76,12 @@ write_dataset <- function(dataset, schema <- scanner$schema if (!inherits(format, "FileFormat")) { - if (identical(format, "parquet")) { - # We have to do some special massaging of properties - writer_props <- ParquetWriterProperties$create(dataset, ...) - arrow_writer_props <- ParquetArrowWriterProperties$create(...) - format <- ParquetFileFormat$create(writer_properties = writer_props, arrow_writer_properties = arrow_writer_props) - } else { - format <- FileFormat$create(format, ...) - } + format <- FileFormat$create(format) + } + if (inherits(format, "ParquetFileFormat")) { + options <- format$make_write_options(table=dataset, ...) + } else { + options <- format$make_write_options(...) } if (!inherits(partitioning, "Partitioning")) { @@ -96,5 +95,40 @@ write_dataset <- function(dataset, } path_and_fs <- get_path_and_filesystem(path, filesystem) - dataset___Dataset__Write(scanner, schema, path = path_and_fs$path, format = format, partitioning = partitioning, filesystem = path_and_fs$fs) + dataset___Dataset__Write(options, path_and_fs$fs, path_and_fs$path, + partitioning, basename_template, scanner) } + +#' Format-specific write options +#' +#' @description +#' A `FileWriteOptions` holds write options specific to a `FileFormat`. +FileWriteOptions <- R6Class("FileWriteOptions", inherit = ArrowObject, + public = list( + ..dispatch = function() { + type <- self$type + if (type == "parquet") { + shared_ptr(ParquetFileWriteOptions, self$pointer()) + } else { + self + } + }, + update = function(...) 
{ + type <- self$type + if (type == "parquet") { + dataset___ParquetFileWriteOptions__update(self, + ParquetWriterProperties$create(...), + ParquetArrowWriterProperties$create(...)) + } + } + ), + active = list( + type = function() dataset___FileWriteOptions__type_name(self) + ) +) + +#' @usage NULL +#' @format NULL +#' @rdname FileWriteOptions +#' @export +ParquetFileWriteOptions <- R6Class("ParquetFileWriteOptions", inherit = FileWriteOptions) diff --git a/r/R/parquet.R b/r/R/parquet.R index c95154e6693..15d87c15bf3 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -336,6 +336,7 @@ ParquetWriterProperties$create <- function(table, parquet___WriterProperties___Builder__create() ) if (!is.null(version)) { + print(version) builder$set_version(version) } if (!is.null(compression)) { diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 2b4d0723879..1c7ab14e816 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1558,34 +1558,66 @@ extern "C" SEXP _arrow_dataset___FileFormat__type_name(SEXP format_sexp){ // dataset.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr dataset___ParquetFileFormat__MakeRead(bool use_buffered_stream, int64_t buffer_size, cpp11::strings dict_columns); -extern "C" SEXP _arrow_dataset___ParquetFileFormat__MakeRead(SEXP use_buffered_stream_sexp, SEXP buffer_size_sexp, SEXP dict_columns_sexp){ +std::shared_ptr dataset___FileFormat__DefaultWriteOptions(const std::shared_ptr& fmt); +extern "C" SEXP _arrow_dataset___FileFormat__DefaultWriteOptions(SEXP fmt_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type fmt(fmt_sexp); + return cpp11::as_sexp(dataset___FileFormat__DefaultWriteOptions(fmt)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_dataset___FileFormat__DefaultWriteOptions(SEXP fmt_sexp){ + Rf_error("Cannot call dataset___FileFormat__DefaultWriteOptions(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr dataset___ParquetFileFormat__Make(bool use_buffered_stream, int64_t buffer_size, cpp11::strings dict_columns); +extern "C" SEXP _arrow_dataset___ParquetFileFormat__Make(SEXP use_buffered_stream_sexp, SEXP buffer_size_sexp, SEXP dict_columns_sexp){ BEGIN_CPP11 arrow::r::Input::type use_buffered_stream(use_buffered_stream_sexp); arrow::r::Input::type buffer_size(buffer_size_sexp); arrow::r::Input::type dict_columns(dict_columns_sexp); - return cpp11::as_sexp(dataset___ParquetFileFormat__MakeRead(use_buffered_stream, buffer_size, dict_columns)); + return cpp11::as_sexp(dataset___ParquetFileFormat__Make(use_buffered_stream, buffer_size, dict_columns)); END_CPP11 } #else -extern "C" SEXP _arrow_dataset___ParquetFileFormat__MakeRead(SEXP use_buffered_stream_sexp, SEXP buffer_size_sexp, SEXP dict_columns_sexp){ - Rf_error("Cannot call dataset___ParquetFileFormat__MakeRead(). Please use arrow::install_arrow() to install required runtime libraries. "); +extern "C" SEXP _arrow_dataset___ParquetFileFormat__Make(SEXP use_buffered_stream_sexp, SEXP buffer_size_sexp, SEXP dict_columns_sexp){ + Rf_error("Cannot call dataset___ParquetFileFormat__Make(). Please use arrow::install_arrow() to install required runtime libraries. 
"); } #endif // dataset.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr dataset___ParquetFileFormat__MakeWrite(const std::shared_ptr& writer_props, const std::shared_ptr& arrow_props); -extern "C" SEXP _arrow_dataset___ParquetFileFormat__MakeWrite(SEXP writer_props_sexp, SEXP arrow_props_sexp){ +std::string dataset___FileWriteOptions__type_name(const std::shared_ptr& options); +extern "C" SEXP _arrow_dataset___FileWriteOptions__type_name(SEXP options_sexp){ BEGIN_CPP11 + arrow::r::Input&>::type options(options_sexp); + return cpp11::as_sexp(dataset___FileWriteOptions__type_name(options)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_dataset___FileWriteOptions__type_name(SEXP options_sexp){ + Rf_error("Cannot call dataset___FileWriteOptions__type_name(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// dataset.cpp +#if defined(ARROW_R_WITH_ARROW) +void dataset___ParquetFileWriteOptions__update(const std::shared_ptr& options, const std::shared_ptr& writer_props, const std::shared_ptr& arrow_writer_props); +extern "C" SEXP _arrow_dataset___ParquetFileWriteOptions__update(SEXP options_sexp, SEXP writer_props_sexp, SEXP arrow_writer_props_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type options(options_sexp); arrow::r::Input&>::type writer_props(writer_props_sexp); - arrow::r::Input&>::type arrow_props(arrow_props_sexp); - return cpp11::as_sexp(dataset___ParquetFileFormat__MakeWrite(writer_props, arrow_props)); + arrow::r::Input&>::type arrow_writer_props(arrow_writer_props_sexp); + dataset___ParquetFileWriteOptions__update(options, writer_props, arrow_writer_props); + return R_NilValue; END_CPP11 } #else -extern "C" SEXP _arrow_dataset___ParquetFileFormat__MakeWrite(SEXP writer_props_sexp, SEXP arrow_props_sexp){ - Rf_error("Cannot call dataset___ParquetFileFormat__MakeWrite(). Please use arrow::install_arrow() to install required runtime libraries. "); +extern "C" SEXP _arrow_dataset___ParquetFileWriteOptions__update(SEXP options_sexp, SEXP writer_props_sexp, SEXP arrow_writer_props_sexp){ + Rf_error("Cannot call dataset___ParquetFileWriteOptions__update(). Please use arrow::install_arrow() to install required runtime libraries. 
"); } #endif @@ -1853,21 +1885,21 @@ extern "C" SEXP _arrow_dataset___ScanTask__get_batches(SEXP scan_task_sexp){ // dataset.cpp #if defined(ARROW_R_WITH_ARROW) -void dataset___Dataset__Write(const std::shared_ptr& scanner, const std::shared_ptr& schema, const std::shared_ptr& format, const std::shared_ptr& filesystem, std::string path, const std::shared_ptr& partitioning); -extern "C" SEXP _arrow_dataset___Dataset__Write(SEXP scanner_sexp, SEXP schema_sexp, SEXP format_sexp, SEXP filesystem_sexp, SEXP path_sexp, SEXP partitioning_sexp){ +void dataset___Dataset__Write(const std::shared_ptr& file_write_options, const std::shared_ptr& filesystem, std::string base_dir, const std::shared_ptr& partitioning, std::string basename_template, const std::shared_ptr& scanner); +extern "C" SEXP _arrow_dataset___Dataset__Write(SEXP file_write_options_sexp, SEXP filesystem_sexp, SEXP base_dir_sexp, SEXP partitioning_sexp, SEXP basename_template_sexp, SEXP scanner_sexp){ BEGIN_CPP11 - arrow::r::Input&>::type scanner(scanner_sexp); - arrow::r::Input&>::type schema(schema_sexp); - arrow::r::Input&>::type format(format_sexp); + arrow::r::Input&>::type file_write_options(file_write_options_sexp); arrow::r::Input&>::type filesystem(filesystem_sexp); - arrow::r::Input::type path(path_sexp); + arrow::r::Input::type base_dir(base_dir_sexp); arrow::r::Input&>::type partitioning(partitioning_sexp); - dataset___Dataset__Write(scanner, schema, format, filesystem, path, partitioning); + arrow::r::Input::type basename_template(basename_template_sexp); + arrow::r::Input&>::type scanner(scanner_sexp); + dataset___Dataset__Write(file_write_options, filesystem, base_dir, partitioning, basename_template, scanner); return R_NilValue; END_CPP11 } #else -extern "C" SEXP _arrow_dataset___Dataset__Write(SEXP scanner_sexp, SEXP schema_sexp, SEXP format_sexp, SEXP filesystem_sexp, SEXP path_sexp, SEXP partitioning_sexp){ +extern "C" SEXP _arrow_dataset___Dataset__Write(SEXP file_write_options_sexp, SEXP filesystem_sexp, SEXP base_dir_sexp, SEXP partitioning_sexp, SEXP basename_template_sexp, SEXP scanner_sexp){ Rf_error("Cannot call dataset___Dataset__Write(). Please use arrow::install_arrow() to install required runtime libraries. 
"); } #endif @@ -6239,8 +6271,10 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_dataset___FileSystemDatasetFactory__Make1", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make1, 3}, { "_arrow_dataset___FileSystemDatasetFactory__Make3", (DL_FUNC) &_arrow_dataset___FileSystemDatasetFactory__Make3, 4}, { "_arrow_dataset___FileFormat__type_name", (DL_FUNC) &_arrow_dataset___FileFormat__type_name, 1}, - { "_arrow_dataset___ParquetFileFormat__MakeRead", (DL_FUNC) &_arrow_dataset___ParquetFileFormat__MakeRead, 3}, - { "_arrow_dataset___ParquetFileFormat__MakeWrite", (DL_FUNC) &_arrow_dataset___ParquetFileFormat__MakeWrite, 2}, + { "_arrow_dataset___FileFormat__DefaultWriteOptions", (DL_FUNC) &_arrow_dataset___FileFormat__DefaultWriteOptions, 1}, + { "_arrow_dataset___ParquetFileFormat__Make", (DL_FUNC) &_arrow_dataset___ParquetFileFormat__Make, 3}, + { "_arrow_dataset___FileWriteOptions__type_name", (DL_FUNC) &_arrow_dataset___FileWriteOptions__type_name, 1}, + { "_arrow_dataset___ParquetFileWriteOptions__update", (DL_FUNC) &_arrow_dataset___ParquetFileWriteOptions__update, 3}, { "_arrow_dataset___IpcFileFormat__Make", (DL_FUNC) &_arrow_dataset___IpcFileFormat__Make, 0}, { "_arrow_dataset___CsvFileFormat__Make", (DL_FUNC) &_arrow_dataset___CsvFileFormat__Make, 1}, { "_arrow_dataset___DirectoryPartitioning", (DL_FUNC) &_arrow_dataset___DirectoryPartitioning, 1}, diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp index 349fde47db2..f4c3ed1d994 100644 --- a/r/src/dataset.cpp +++ b/r/src/dataset.cpp @@ -171,7 +171,13 @@ std::string dataset___FileFormat__type_name( } // [[arrow::export]] -std::shared_ptr dataset___ParquetFileFormat__MakeRead( +std::shared_ptr dataset___FileFormat__DefaultWriteOptions( + const std::shared_ptr& fmt) { + return fmt->DefaultWriteOptions(); +} + +// [[arrow::export]] +std::shared_ptr dataset___ParquetFileFormat__Make( bool use_buffered_stream, int64_t buffer_size, cpp11::strings dict_columns) { auto fmt = std::make_shared(); @@ -187,15 +193,18 @@ std::shared_ptr dataset___ParquetFileFormat__MakeRead( } // [[arrow::export]] -std::shared_ptr dataset___ParquetFileFormat__MakeWrite( - const std::shared_ptr& writer_props, - const std::shared_ptr& arrow_props) { - auto fmt = std::make_shared(); - - // fmt->writer_properties = writer_props; - // fmt->arrow_writer_properties = arrow_props; +std::string dataset___FileWriteOptions__type_name( + const std::shared_ptr& options) { + return options->type_name(); +} - return fmt; +// [[arrow::export]] +void dataset___ParquetFileWriteOptions__update( + const std::shared_ptr& options, + const std::shared_ptr& writer_props, + const std::shared_ptr& arrow_writer_props) { + options->writer_properties = writer_props; + options->arrow_writer_properties = arrow_writer_props; } // [[arrow::export]] @@ -337,20 +346,18 @@ std::vector> dataset___ScanTask__get_batches } // [[arrow::export]] -void dataset___Dataset__Write(const std::shared_ptr& scanner, - const std::shared_ptr& schema, - const std::shared_ptr& format, - const std::shared_ptr& filesystem, - std::string path, - const std::shared_ptr& partitioning) { +void dataset___Dataset__Write( + const std::shared_ptr& file_write_options, + const std::shared_ptr& filesystem, std::string base_dir, + const std::shared_ptr& partitioning, std::string basename_template, + const std::shared_ptr& scanner) { ds::FileSystemDatasetWriteOptions opts; - opts.file_write_options = format->DefaultWriteOptions(); + opts.file_write_options = file_write_options; opts.filesystem = filesystem; - 
opts.base_dir = path; + opts.base_dir = base_dir; opts.partitioning = partitioning; - opts.basename_template = "dat_{i}.feather"; - StopIfNotOk(ds::FileSystemDataset::Write(schema, opts, scanner)); - return; + opts.basename_template = basename_template; + StopIfNotOk(ds::FileSystemDataset::Write(opts, scanner)); } #endif From bf7d392bdb94f2568de262c01561ed816e20e079 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 1 Oct 2020 17:09:54 -0400 Subject: [PATCH 10/34] add a test for writing with a selection --- r/R/dataset-write.R | 3 +-- r/R/parquet.R | 1 - r/tests/testthat/test-dataset.R | 15 +++++++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/r/R/dataset-write.R b/r/R/dataset-write.R index abc116ef126..ed8742e56eb 100644 --- a/r/R/dataset-write.R +++ b/r/R/dataset-write.R @@ -73,7 +73,6 @@ write_dataset <- function(dataset, } scanner <- Scanner$create(dataset) - schema <- scanner$schema if (!inherits(format, "FileFormat")) { format <- FileFormat$create(format) @@ -86,7 +85,7 @@ write_dataset <- function(dataset, if (!inherits(partitioning, "Partitioning")) { # TODO: tidyselect? - partition_schema <- schema[partitioning] + partition_schema <- scanner$schema[partitioning] if (isTRUE(hive_style)) { partitioning <- HivePartitioning$create(partition_schema) } else { diff --git a/r/R/parquet.R b/r/R/parquet.R index 15d87c15bf3..c95154e6693 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -336,7 +336,6 @@ ParquetWriterProperties$create <- function(table, parquet___WriterProperties___Builder__create() ) if (!is.null(version)) { - print(version) builder$set_version(version) } if (!is.null(compression)) { diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 5c18a8c793d..c8c4069b16b 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -898,6 +898,21 @@ test_that("Dataset writing: from data.frame", { ) }) +test_that("Dataset writing: from data.frame with projection", { + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-9651 + dst_dir <- tempfile() + stacked <- rbind(df1, df2) + stacked %>% + select(int, chr, dbl) %>% + group_by(int) %>% + write_dataset(dst_dir, format = "feather") + expect_true(dir.exists(dst_dir)) + expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "="))) + + new_ds <- open_dataset(dst_dir, format = "feather") + expect_identical(names(new_ds), c("chr", "dbl", "int")) +}) + test_that("Dataset writing: from RecordBatch", { skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-9651 dst_dir <- tempfile() From 733af53c8200d8cd3d6423c89b9416aa954525ad Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 1 Oct 2020 17:11:55 -0400 Subject: [PATCH 11/34] lint --- cpp/src/arrow/util/map.h | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/util/map.h b/cpp/src/arrow/util/map.h index 3872e972bd9..66325f5e13e 100644 --- a/cpp/src/arrow/util/map.h +++ b/cpp/src/arrow/util/map.h @@ -61,4 +61,3 @@ auto GetOrInsertGenerated(Map* map, Key&& key, Gen&& gen) } // namespace internal } // namespace arrow - From 0567537035e6d4d10b7ad6dcc2115c8570629c60 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 1 Oct 2020 17:12:59 -0400 Subject: [PATCH 12/34] make doc --- r/man/write_dataset.Rd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/man/write_dataset.Rd b/r/man/write_dataset.Rd index 8133b38479d..417cc057f0f 100644 --- a/r/man/write_dataset.Rd +++ b/r/man/write_dataset.Rd @@ -7,8 +7,9 @@ write_dataset( 
dataset, path, - format = dataset$format$type, + format = dataset$format, partitioning = dplyr::group_vars(dataset), + basename_template = paste("dat_{i}", format$type, sep = "."), hive_style = TRUE, filesystem = NULL, ... From f0da04ad1b38f3321f5a4c307f798453086e2d2c Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 1 Oct 2020 17:15:35 -0400 Subject: [PATCH 13/34] document basename_template parameter --- r/R/dataset-write.R | 3 +++ r/man/write_dataset.Rd | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/r/R/dataset-write.R b/r/R/dataset-write.R index ed8742e56eb..94fe5cf2e40 100644 --- a/r/R/dataset-write.R +++ b/r/R/dataset-write.R @@ -34,6 +34,9 @@ #' @param partitioning `Partitioning` or a character vector of columns to #' use as partition keys (to be written as path segments). Default is to #' use the current `group_by()` columns. +#' @param basename_template `"{i}"` will be replaced with an autoincremented +#' integer to generate basenames of datafiles. For example `"dat_{i}.feather"` +#' will yield `"dat_0.feather", ...`. #' @param hive_style logical: write partition segments as Hive-style #' (`key1=value1/key2=value2/file.ext`) or as just bare values. Default is `TRUE`. #' @param filesystem A [FileSystem] where the dataset should be written if it is a diff --git a/r/man/write_dataset.Rd b/r/man/write_dataset.Rd index 417cc057f0f..3d74f74b621 100644 --- a/r/man/write_dataset.Rd +++ b/r/man/write_dataset.Rd @@ -33,6 +33,10 @@ same format as \code{dataset}.} use as partition keys (to be written as path segments). Default is to use the current \code{group_by()} columns.} +\item{basename_template}{\code{"{i}"} will be replaced with an autoincremented +integer to generate basenames of datafiles. For example \code{"dat_{i}.feather"} +will yield \verb{"dat_0.feather", ...}.} + \item{hive_style}{logical: write partition segments as Hive-style (\code{key1=value1/key2=value2/file.ext}) or as just bare values. Default is \code{TRUE}.} From 18ea1c34ba5230f4428b80e8b565a10d497a1f82 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 2 Oct 2020 09:28:58 -0400 Subject: [PATCH 14/34] enable on-write filtering of written datasets --- r/R/dataset-write.R | 10 +++------- r/man/write_dataset.Rd | 5 +++-- r/tests/testthat/test-dataset.R | 31 ++++++++++++------------------- 3 files changed, 18 insertions(+), 28 deletions(-) diff --git a/r/R/dataset-write.R b/r/R/dataset-write.R index 94fe5cf2e40..d667db5f8d5 100644 --- a/r/R/dataset-write.R +++ b/r/R/dataset-write.R @@ -24,8 +24,9 @@ #' @param dataset [Dataset], [RecordBatch], [Table], `arrow_dplyr_query`, or #' `data.frame`. If an `arrow_dplyr_query` or `grouped_df`, #' `schema` and `partitioning` will be taken from the result of any `select()` -#' and `group_by()` operations done on the dataset. Note that `filter()` queries -#' are not currently supported, and `select`-ed columns may not be renamed. +#' and `group_by()` operations done on the dataset. `filter()` queries will be +#' applied to restrict written rows. +#' Note that `select()`-ed columns may not be renamed. #' @param path string path or URI to a directory to write to (directory will be #' created if it does not exist) #' @param format file format to write the dataset to. Currently supported @@ -54,11 +55,6 @@ write_dataset <- function(dataset, filesystem = NULL, ...) { if (inherits(dataset, "arrow_dplyr_query")) { - # Check for a filter - if (!isTRUE(dataset$filtered_rows)) { - # TODO: - stop("Writing a filtered dataset is not yet supported", call. 
= FALSE) - } # Check for a select if (!identical(dataset$selected_columns, set_names(names(dataset$.data)))) { # We can select a subset of columns but we can't rename them diff --git a/r/man/write_dataset.Rd b/r/man/write_dataset.Rd index 3d74f74b621..218876427ca 100644 --- a/r/man/write_dataset.Rd +++ b/r/man/write_dataset.Rd @@ -19,8 +19,9 @@ write_dataset( \item{dataset}{\link{Dataset}, \link{RecordBatch}, \link{Table}, \code{arrow_dplyr_query}, or \code{data.frame}. If an \code{arrow_dplyr_query} or \code{grouped_df}, \code{schema} and \code{partitioning} will be taken from the result of any \code{select()} -and \code{group_by()} operations done on the dataset. Note that \code{filter()} queries -are not currently supported, and \code{select}-ed columns may not be renamed.} +and \code{group_by()} operations done on the dataset. \code{filter()} queries will be +applied to restrict written rows. +Note that \code{select()}-ed columns may not be renamed.} \item{path}{string path or URI to a directory to write to (directory will be created if it does not exist)} diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index c8c4069b16b..73ea71b3c49 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -853,6 +853,18 @@ test_that("Dataset writing: dplyr methods", { collect(new_ds) %>% arrange(int), rbind(df1[c("chr", "dbl", "int")], df2[c("chr", "dbl", "int")]) ) + + # filter to restrict written rows + dst_dir3 <- tempfile() + ds %>% + filter(int == 4) %>% + write_dataset(dst_dir3, format = "feather") + new_ds <- open_dataset(dst_dir3, format = "feather") + + expect_equivalent( + new_ds %>% select(names(df1)) %>% collect(), + df1 %>% filter(int == 4) + ) }) test_that("Dataset writing: non-hive", { @@ -898,21 +910,6 @@ test_that("Dataset writing: from data.frame", { ) }) -test_that("Dataset writing: from data.frame with projection", { - skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-9651 - dst_dir <- tempfile() - stacked <- rbind(df1, df2) - stacked %>% - select(int, chr, dbl) %>% - group_by(int) %>% - write_dataset(dst_dir, format = "feather") - expect_true(dir.exists(dst_dir)) - expect_identical(dir(dst_dir), sort(paste("int", c(1:10, 101:110), sep = "="))) - - new_ds <- open_dataset(dst_dir, format = "feather") - expect_identical(names(new_ds), c("chr", "dbl", "int")) -}) - test_that("Dataset writing: from RecordBatch", { skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-9651 dst_dir <- tempfile() @@ -984,10 +981,6 @@ test_that("Dataset writing: unsupported features/input validation", { ds <- open_dataset(hive_dir) - expect_error( - filter(ds, int == 4) %>% write_dataset(ds), - "Writing a filtered dataset is not yet supported" - ) expect_error( select(ds, integer = int) %>% write_dataset(ds), "Renaming columns when writing a dataset is not yet supported" From bca876416124692f468f5756a49f865d9f6dc768 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 2 Oct 2020 13:22:12 -0400 Subject: [PATCH 15/34] add LockFreeStack --- cpp/src/arrow/util/CMakeLists.txt | 1 + cpp/src/arrow/util/lock_free.h | 114 ++++++++++++++++ cpp/src/arrow/util/lock_free_test.cc | 177 +++++++++++++++++++++++++ cpp/src/arrow/util/thread_pool_test.cc | 2 +- 4 files changed, 293 insertions(+), 1 deletion(-) create mode 100644 cpp/src/arrow/util/lock_free.h create mode 100644 cpp/src/arrow/util/lock_free_test.cc diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index c968cdd1a43..46b79bd0ec5 
100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -66,6 +66,7 @@ add_arrow_test(utility-test add_arrow_test(threading-utility-test SOURCES future_test + lock_free_test task_group_test thread_pool_test) diff --git a/cpp/src/arrow/util/lock_free.h b/cpp/src/arrow/util/lock_free.h new file mode 100644 index 00000000000..89fd1584776 --- /dev/null +++ b/cpp/src/arrow/util/lock_free.h @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/util/macros.h" + +namespace arrow { +namespace util { + +/// A lock free container with FILO storage order. +/// +/// Two thread safe operations are supported: +/// - Push a value onto the stack +/// - move construct from another stack +/// +/// range-for compatilble iteration is provided for +/// convenience, but it is *not* thread safe. +template +class LockFreeStack { + public: + LockFreeStack() = default; + + ~LockFreeStack() { + Node* next = nodes_.load(); + + while (next != NULLPTR) { + Node* node = next; + next = next->next; + delete node; + } + } + + LockFreeStack(const LockFreeStack&) = delete; + LockFreeStack& operator=(const LockFreeStack&) = delete; + LockFreeStack& operator=(LockFreeStack&& other) = delete; + + LockFreeStack(LockFreeStack&& other) { + Node* other_nodes = other.nodes_.load(); + + do { + } while (!other.CompareExchange(&other_nodes, NULLPTR)); + + nodes_.store(other_nodes); + } + + void Push(T value) { + Node* new_head = new Node{std::move(value), nodes_.load()}; + + do { + } while (!CompareExchange(&new_head->next, new_head)); + } + + struct iterator; + iterator begin(); + iterator end(); + + private: + struct Node { + T value; + Node* next; + }; + + bool CompareExchange(Node** expected, Node* desired) { + return nodes_.compare_exchange_strong(*expected, desired); + } + + std::atomic nodes_{NULLPTR}; +}; + +template +struct LockFreeStack::iterator { + bool operator!=(iterator other) const { return node != other.node; } + + T& operator*() const { return node->value; } + + iterator& operator++() { + node = node->next; + return *this; + } + + Node* node; +}; + +template +typename LockFreeStack::iterator LockFreeStack::begin() { + return {nodes_.load()}; +} + +template +typename LockFreeStack::iterator LockFreeStack::end() { + return {NULLPTR}; +} + +} // namespace util +} // namespace arrow + diff --git a/cpp/src/arrow/util/lock_free_test.cc b/cpp/src/arrow/util/lock_free_test.cc new file mode 100644 index 00000000000..3b027f8adcf --- /dev/null +++ b/cpp/src/arrow/util/lock_free_test.cc @@ -0,0 +1,177 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include +#include + +#include "arrow/status.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/io_util.h" +#include "arrow/util/lock_free.h" +#include "arrow/util/task_group.h" +#include "arrow/util/thread_pool.h" + +namespace arrow { + +using internal::TaskGroup; + +namespace util { + +static constexpr int kLots = 512; +// static constexpr int kLots = 16 * 1024; + +class TestLockFreeStack : public ::testing::Test { + public: + void TearDown() override { + fflush(stdout); + fflush(stderr); + } + + void AppendLots(LockFreeStack* ints) { + for (int j = 0; j < kLots; ++j) { + ints->Push(j); + } + } + + void AppendLots(std::vector* ints) { + for (int j = 0; j < kLots; ++j) { + ints->push_back(j); + } + } + + template + std::vector ToVector(LockFreeStack stack) { + std::vector out; + for (auto&& element : stack) { + out.push_back(std::move(element)); + } + std::reverse(out.begin(), out.end()); + return out; + } + + template + std::vector Flatten(std::vector> nested) { + size_t flat_size = 0; + for (auto&& v : nested) { + flat_size += v.size(); + } + + std::vector flat(flat_size); + auto it = flat.begin(); + for (auto&& v : nested) { + it = std::move(v.begin(), v.end(), it); + } + + EXPECT_EQ(it, flat.end()); + return flat; + } + + std::shared_ptr task_group = + TaskGroup::MakeThreaded(::arrow::internal::GetCpuThreadPool()); +}; + +TEST_F(TestLockFreeStack, AppendOnly) { + LockFreeStack ints; + + for (int i = 0; i < task_group->parallelism(); ++i) { + task_group->Append([&] { + AppendLots(&ints); + return Status::OK(); + }); + } + + ASSERT_OK(task_group->Finish()); + + std::vector actual = ToVector(std::move(ints)), expected; + + for (int i = 0; i < task_group->parallelism(); ++i) { + AppendLots(&expected); + } + + for (auto v : {&actual, &expected}) { + std::sort(v->begin(), v->end()); + } + + EXPECT_THAT(actual, testing::ElementsAreArray(expected)); +} + +TEST_F(TestLockFreeStack, ProduceThenConsumeMoved) { + LockFreeStack ints; + + std::vector> per_thread_actual(task_group->parallelism()); + + for (int i = 0; i < task_group->parallelism(); ++i) { + task_group->Append([&, i] { + AppendLots(&ints); + per_thread_actual[i] = ToVector(std::move(ints)); + return Status::OK(); + }); + } + + ASSERT_OK(task_group->Finish()); + + std::vector actual = Flatten(std::move(per_thread_actual)), expected; + + for (int i = 0; i < task_group->parallelism(); ++i) { + AppendLots(&expected); + } + + for (auto v : {&actual, &expected}) { + std::sort(v->begin(), v->end()); + } + + EXPECT_THAT(actual, testing::ElementsAreArray(expected)); +} + +TEST_F(TestLockFreeStack, AlternatingProduceAndConsumeMoved) { + LockFreeStack ints; + + std::vector> per_thread_actual(task_group->parallelism()); + + for (int i = 0; i < task_group->parallelism(); ++i) { + task_group->Append([&, i] { + for (int j 
= 0; j < kLots; ++j) { + ints.Push(j); + auto local = std::move(ints); + for (int got : local) { + per_thread_actual[i].push_back(got); + } + } + return Status::OK(); + }); + } + + ASSERT_OK(task_group->Finish()); + + std::vector actual = Flatten(std::move(per_thread_actual)), expected; + + for (int i = 0; i < task_group->parallelism(); ++i) { + AppendLots(&expected); + } + + for (auto v : {&actual, &expected}) { + std::sort(v->begin(), v->end()); + } + + EXPECT_THAT(actual, testing::ElementsAreArray(expected)); +} + +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/util/thread_pool_test.cc b/cpp/src/arrow/util/thread_pool_test.cc index ac0eae78165..a54880b21b1 100644 --- a/cpp/src/arrow/util/thread_pool_test.cc +++ b/cpp/src/arrow/util/thread_pool_test.cc @@ -126,7 +126,7 @@ class AddTester { class TestThreadPool : public ::testing::Test { public: - void TearDown() { + void TearDown() override { fflush(stdout); fflush(stderr); } From 1c6db502fa9d53fe169775d2aa5c8a4b4bb8c166 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 2 Oct 2020 14:05:06 -0400 Subject: [PATCH 16/34] extract and unit test string interpolation --- cpp/src/arrow/dataset/file_base.cc | 62 +++++++++++++----------------- cpp/src/arrow/util/string.cc | 10 +++++ cpp/src/arrow/util/string.h | 5 +++ cpp/src/arrow/util/string_test.cc | 14 +++++++ 4 files changed, 56 insertions(+), 35 deletions(-) diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index 9c121bca659..b5a7c98ba16 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -33,6 +33,7 @@ #include "arrow/util/logging.h" #include "arrow/util/map.h" #include "arrow/util/mutex.h" +#include "arrow/util/string.h" #include "arrow/util/task_group.h" namespace arrow { @@ -146,48 +147,32 @@ FragmentIterator FileSystemDataset::GetFragmentsImpl( } Status FileWriter::Write(RecordBatchReader* batches) { - for (std::shared_ptr batch;;) { - RETURN_NOT_OK(batches->ReadNext(&batch)); + while (true) { + ARROW_ASSIGN_OR_RAISE(auto batch, batches->Next()); if (batch == nullptr) break; RETURN_NOT_OK(Write(batch)); } return Status::OK(); } -struct NextBasenameGenerator { - static Result Make(const std::string& basename_template) { - if (basename_template.find(fs::internal::kSep) != std::string::npos) { - return Status::Invalid("basename_template contained '/'"); - } - size_t token_start = basename_template.find(token()); - if (token_start == std::string::npos) { - return Status::Invalid("basename_template did not contain '{i}'"); - } - return NextBasenameGenerator{basename_template, 0, token_start, - token_start + token().size()}; - } +constexpr util::string_view kIntegerToken = "{i}"; - static const std::string& token() { - static const std::string token = "{i}"; - return token; +Status ValidateBasenameTemplate(util::string_view basename_template) { + if (basename_template.find(fs::internal::kSep) != util::string_view::npos) { + return Status::Invalid("basename_template contained '/'"); } - - const std::string& template_; - size_t i_, token_start_, token_end_; - - std::string operator()() { - return template_.substr(0, token_start_) + std::to_string(i_++) + - template_.substr(token_end_); + size_t token_start = basename_template.find(kIntegerToken); + if (token_start == util::string_view::npos) { + return Status::Invalid("basename_template did not contain '", kIntegerToken, "'"); } -}; + return Status::OK(); +} using MutexedWriter = util::Mutexed>; struct WriterSet { - 
WriterSet(NextBasenameGenerator next_basename, - const FileSystemDatasetWriteOptions& write_options) - : next_basename_(std::move(next_basename)), - base_dir_(fs::internal::EnsureTrailingSlash(write_options.base_dir)), + explicit WriterSet(const FileSystemDatasetWriteOptions& write_options) + : base_dir_(fs::internal::EnsureTrailingSlash(write_options.base_dir)), write_options_(write_options) {} Result> Get(const Expression& partition_expression, @@ -208,12 +193,19 @@ struct WriterSet { })->second; if (writer_lock) { - // NB: next_basename_() must be invoked with the set_lock held - auto path = fs::internal::ConcatAbstractPath(dir, next_basename_()); + auto i = i_++; set_lock.Unlock(); RETURN_NOT_OK(write_options_.filesystem->CreateDir(dir)); + auto basename = internal::Replace(write_options_.basename_template, kIntegerToken, + std::to_string(i)); + if (!basename) { + return Status::Invalid("string interpolation of basename template failed"); + } + + auto path = fs::internal::ConcatAbstractPath(dir, *basename); + ARROW_ASSIGN_OR_RAISE(auto destination, write_options_.filesystem->OpenOutputStream(path)); @@ -236,16 +228,18 @@ struct WriterSet { return Status::OK(); } + int i_ = 0; // There should only be a single writer open for each partition directory at a time util::Mutex mutex_; std::unordered_map> dir_to_writer_; - NextBasenameGenerator next_basename_; std::string base_dir_; const FileSystemDatasetWriteOptions& write_options_; }; Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_options, std::shared_ptr scanner) { + RETURN_NOT_OK(ValidateBasenameTemplate(write_options.basename_template)); + auto task_group = scanner->context()->TaskGroup(); // Things we'll un-lazy for the sake of simplicity, with the tradeoff they represent: @@ -278,9 +272,7 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio } } - ARROW_ASSIGN_OR_RAISE(auto next_basename, - NextBasenameGenerator::Make(write_options.basename_template)); - WriterSet writers(std::move(next_basename), write_options); + WriterSet writers(write_options); auto write_task_group = task_group->MakeSubGroup(); auto fragment_for_task_it = fragment_for_task.begin(); diff --git a/cpp/src/arrow/util/string.cc b/cpp/src/arrow/util/string.cc index b3ed2473724..625086c39b2 100644 --- a/cpp/src/arrow/util/string.cc +++ b/cpp/src/arrow/util/string.cc @@ -144,5 +144,15 @@ std::string AsciiToLower(util::string_view value) { return result; } +util::optional Replace(util::string_view s, util::string_view token, + util::string_view replacement) { + size_t token_start = s.find(token); + if (token_start == std::string::npos) { + return util::nullopt; + } + return s.substr(0, token_start).to_string() + replacement.to_string() + + s.substr(token_start + token.size()).to_string(); +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.h index ea445c71ef4..4201bbe2ea8 100644 --- a/cpp/src/arrow/util/string.h +++ b/cpp/src/arrow/util/string.h @@ -20,6 +20,7 @@ #include #include +#include "arrow/util/optional.h" #include "arrow/util/string_view.h" #include "arrow/util/visibility.h" @@ -56,5 +57,9 @@ bool AsciiEqualsCaseInsensitive(util::string_view left, util::string_view right) ARROW_EXPORT std::string AsciiToLower(util::string_view value); +ARROW_EXPORT +util::optional Replace(util::string_view s, util::string_view token, + util::string_view replacement); + } // namespace internal } // namespace arrow diff --git 
a/cpp/src/arrow/util/string_test.cc b/cpp/src/arrow/util/string_test.cc index c56525732b0..d9b3749fdbd 100644 --- a/cpp/src/arrow/util/string_test.cc +++ b/cpp/src/arrow/util/string_test.cc @@ -88,5 +88,19 @@ TEST(ParseHexValue, Invalid) { ASSERT_RAISES(Invalid, ParseHexValue(input.c_str(), &output)); } +TEST(Replace, Basics) { + auto s = Replace("dat_{i}.txt", "{i}", "23"); + EXPECT_TRUE(s); + EXPECT_EQ(*s, "dat_23.txt"); + + // only replace the first occurrence of token + s = Replace("dat_{i}_{i}.txt", "{i}", "23"); + EXPECT_TRUE(s); + EXPECT_EQ(*s, "dat_23_{i}.txt"); + + s = Replace("dat_.txt", "{nope}", "23"); + EXPECT_FALSE(s); +} + } // namespace internal } // namespace arrow From 6eb546e18df9c9c67b4ce8acd46c513fa6f69010 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 2 Oct 2020 15:34:39 -0400 Subject: [PATCH 17/34] refactor ::Write() to use explicit WriteQueues --- cpp/src/arrow/dataset/file_base.cc | 211 ++++++++++++++++----------- cpp/src/arrow/util/lock_free.h | 37 +++-- cpp/src/arrow/util/lock_free_test.cc | 15 +- cpp/src/arrow/util/mutex.cc | 3 +- cpp/src/arrow/util/mutex.h | 25 ---- 5 files changed, 153 insertions(+), 138 deletions(-) diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index b5a7c98ba16..2c42221b192 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -30,7 +30,9 @@ #include "arrow/io/interfaces.h" #include "arrow/io/memory.h" #include "arrow/util/iterator.h" +#include "arrow/util/lock_free.h" #include "arrow/util/logging.h" +#include "arrow/util/make_unique.h" #include "arrow/util/map.h" #include "arrow/util/mutex.h" #include "arrow/util/string.h" @@ -168,73 +170,94 @@ Status ValidateBasenameTemplate(util::string_view basename_template) { return Status::OK(); } -using MutexedWriter = util::Mutexed>; - -struct WriterSet { - explicit WriterSet(const FileSystemDatasetWriteOptions& write_options) - : base_dir_(fs::internal::EnsureTrailingSlash(write_options.base_dir)), - write_options_(write_options) {} +class WriteQueue { + public: + explicit WriteQueue(util::string_view partition_expression) + : partition_expression_(partition_expression) {} + + // Push a batch into the writer's queue of pending writes. + void Push(std::shared_ptr batch) { pending_.Push(std::move(batch)); } + + // Take the writer's lock and flush its queue of pending writes. + // Returns false if the lock could not be acquired. The queue is guaranteed to be + // flushed if every thread which calls Push() subsequently calls TryFlush() until + // it returns true. 
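  // A minimal sketch of that contract from the producer side, using only the
  // Push() and TryFlush() members of this class (illustrative only, not part of
  // this patch; PushAndEnsureFlushed is a hypothetical helper name):
  //
  //   Status PushAndEnsureFlushed(WriteQueue* queue,
  //                               std::shared_ptr<RecordBatch> batch) {
  //     queue->Push(std::move(batch));
  //     bool flushed = false;
  //     while (!flushed) {
  //       // false only means another thread currently holds the writer; keep
  //       // calling until one of our own TryFlush() calls returns true.
  //       ARROW_ASSIGN_OR_RAISE(flushed, queue->TryFlush());
  //     }
  //     return Status::OK();
  //   }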
+ Result TryFlush() { + if (auto lock = writer_mutex_.TryLock()) { + RecordBatchVector pending_vector; + do { + // Move pending to a local variable exclusively viewed by this thread + auto pending = std::move(pending_); + pending_vector = pending.ToVector(); + + for (auto it = pending_vector.rbegin(); it != pending_vector.rend(); ++it) { + RETURN_NOT_OK(writer_->Write(*it)); + } - Result> Get(const Expression& partition_expression, - const std::shared_ptr& schema) { - ARROW_ASSIGN_OR_RAISE(auto part_segments, - write_options_.partitioning->Format(partition_expression)); - std::string dir = base_dir_ + part_segments; + // Since we're holding this writer's lock anyway we may as well check for batches + // added while we were writing + } while (!pending_vector.empty()); + return true; + } + return false; + } - util::Mutex::Guard writer_lock; + util::Mutex::Guard Lock() { return writer_mutex_.Lock(); } - auto set_lock = mutex_.Lock(); + const std::shared_ptr& writer() const { return writer_; } - auto writer = - internal::GetOrInsertGenerated(&dir_to_writer_, dir, [&](const std::string&) { - auto writer = std::make_shared(); - writer_lock = writer->Lock(); - return writer; - })->second; + Status OpenWriter(const FileSystemDatasetWriteOptions& write_options, int index, + std::shared_ptr schema) { + auto dir = fs::internal::EnsureTrailingSlash(write_options.base_dir) + + partition_expression_.to_string(); - if (writer_lock) { - auto i = i_++; - set_lock.Unlock(); + auto basename = internal::Replace(write_options.basename_template, kIntegerToken, + std::to_string(index)); + if (!basename) { + return Status::Invalid("string interpolation of basename template failed"); + } - RETURN_NOT_OK(write_options_.filesystem->CreateDir(dir)); + auto path = fs::internal::ConcatAbstractPath(dir, *basename); - auto basename = internal::Replace(write_options_.basename_template, kIntegerToken, - std::to_string(i)); - if (!basename) { - return Status::Invalid("string interpolation of basename template failed"); - } + RETURN_NOT_OK(write_options.filesystem->CreateDir(dir)); + ARROW_ASSIGN_OR_RAISE(auto destination, + write_options.filesystem->OpenOutputStream(path)); - auto path = fs::internal::ConcatAbstractPath(dir, *basename); + ARROW_ASSIGN_OR_RAISE(writer_, write_options.format()->MakeWriter( + std::move(destination), std::move(schema), + write_options.file_write_options)); + return Status::OK(); + } - ARROW_ASSIGN_OR_RAISE(auto destination, - write_options_.filesystem->OpenOutputStream(path)); + private: + // FileWriters are not required to be thread safe, so they must be guarded with a mutex. 
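  // The pending_ member declared just below is a util::LockFreeStack, whose only
  // thread-safe operations are Push() and move construction. A minimal sketch of
  // the drain idiom TryFlush() above relies on, assuming only those operations
  // plus the single-threaded ToVector() helper (illustrative only; Drain is a
  // hypothetical name):
  //
  //   std::vector<int> Drain(util::LockFreeStack<int>* stack) {
  //     // Move construction atomically steals every node pushed so far;
  //     // concurrent Push() calls simply land in the now-empty source stack.
  //     util::LockFreeStack<int> local(std::move(*stack));
  //     // Iteration is not thread safe, but `local` is owned exclusively by
  //     // this thread, so converting it to a vector here is safe.
  //     return local.ToVector();
  //   }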
+ util::Mutex writer_mutex_; + std::shared_ptr writer_; + // A stack into which batches to write will be pushed + util::LockFreeStack> pending_; - ARROW_ASSIGN_OR_RAISE(**writer, write_options_.format()->MakeWriter( - std::move(destination), schema, - write_options_.file_write_options)); - } + // The (formatted) partition expression to which this queue corresponds + util::string_view partition_expression_; +}; - return writer; +Result> MakeWriter( + const FileSystemDatasetWriteOptions& write_options, std::string part, + int auto_incremented, std::shared_ptr schema) { + auto dir = write_options.base_dir + part; + auto basename = internal::Replace(write_options.basename_template, kIntegerToken, + std::to_string(auto_incremented)); + if (!basename) { + return Status::Invalid("string interpolation of basename template failed"); } + auto path = fs::internal::ConcatAbstractPath(dir, *basename); - Status FinishAll(internal::TaskGroup* task_group) { - for (const auto& dir_writer : dir_to_writer_) { - task_group->Append([&] { - std::shared_ptr writer = **dir_writer.second; - return writer->Finish(); - }); - } - - return Status::OK(); - } + RETURN_NOT_OK(write_options.filesystem->CreateDir(dir)); + ARROW_ASSIGN_OR_RAISE(auto destination, + write_options.filesystem->OpenOutputStream(path)); - int i_ = 0; - // There should only be a single writer open for each partition directory at a time - util::Mutex mutex_; - std::unordered_map> dir_to_writer_; - std::string base_dir_; - const FileSystemDatasetWriteOptions& write_options_; -}; + return write_options.format()->MakeWriter(std::move(destination), std::move(schema), + write_options.file_write_options); +} Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_options, std::shared_ptr scanner) { @@ -272,14 +295,18 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio } } - WriterSet writers(write_options); + // Store a mapping from partitions (represened by their formatted partition expressions) + // to a WriteQueue which flushes batches into that partition's output file. + // `unordered_map` is not thread safe, so a lock on queues_mutex_ must be held whenever + // queues_ is accessed or modified. 
+ util::Mutex queues_mutex_; + std::unordered_map> queues_; - auto write_task_group = task_group->MakeSubGroup(); auto fragment_for_task_it = fragment_for_task.begin(); for (const auto& scan_task : scan_tasks) { const Fragment* fragment = *fragment_for_task_it++; - write_task_group->Append([&, scan_task, fragment] { + task_group->Append([&, scan_task, fragment] { ARROW_ASSIGN_OR_RAISE(auto batches, scan_task->Execute()); for (auto maybe_batch : batches) { @@ -287,49 +314,59 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio ARROW_ASSIGN_OR_RAISE(auto groups, write_options.partitioning->Partition(batch)); batch.reset(); // drop to hopefully conserve memory - struct PendingWrite { - std::shared_ptr writer; - std::shared_ptr batch; - - Result TryWrite() { - if (auto lock = writer->TryLock()) { - RETURN_NOT_OK(writer->get()->Write(batch)); - return true; - } - return false; - } - }; - std::vector pending_writes(groups.batches.size()); - for (size_t i = 0; i < groups.batches.size(); ++i) { AndExpression partition_expression(std::move(groups.expressions[i]), fragment->partition_expression()); - - ARROW_ASSIGN_OR_RAISE(auto writer, writers.Get(partition_expression, - groups.batches[i]->schema())); - - pending_writes[i] = {std::move(writer), std::move(groups.batches[i])}; - } - - for (size_t i = 0; !pending_writes.empty(); ++i) { - if (i >= pending_writes.size()) { - i = 0; + auto batch = std::move(groups.batches[i]); + + ARROW_ASSIGN_OR_RAISE(auto part, + write_options.partitioning->Format(partition_expression)); + + // Now we need to look up the relevant WriteQueue in `queues_` + auto queues_lock = queues_mutex_.Lock(); + util::Mutex::Guard new_queue_lock; + + WriteQueue* queue = internal::GetOrInsertGenerated( + &queues_, std::move(part), + [&](const std::string& part) { + auto queue = internal::make_unique(part); + new_queue_lock = queue->Lock(); + return queue; + }) + ->second.get(); + + if (new_queue_lock) { + size_t queue_index = queues_.size() - 1; + // We have openned a new WriteQueue for `part` and must set up its FileWriter. + // That could be slow, so we'd like to release queues_lock now, allowing other + // threads to look up already-open queues. This is safe since we are mutating + // an element of queues_ in place, which does not conflict with access to any + // other element. However, we do need to keep new_queue_lock while we open the + // FileWriter to prevent other threads from using the writer before it has + // been constructed. 
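        // The same two-lock hand-off in isolation (illustrative sketch, not part
        // of this patch; the names are placeholders):
        //
        //   auto map_lock = map_mutex.Lock();   // guards the map of queues
        //   auto queue = std::make_shared<Queue>();
        //   map.emplace(key, queue);
        //   auto queue_lock = queue->Lock();    // guards only the new queue
        //   map_lock.Unlock();                  // lookups of other keys proceed
        //   ExpensiveInit(queue.get());         // e.g. create the dir, open the writer
        //   queue_lock.Unlock();                // readers may now use the writer,
        //                                       // provided they also take the queue's
        //                                       // lock (as TryFlush does via TryLock)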
+ queues_lock.Unlock(); + RETURN_NOT_OK(queue->OpenWriter(write_options, queue_index, batch->schema())); + new_queue_lock.Unlock(); } - ARROW_ASSIGN_OR_RAISE(auto success, pending_writes[i].TryWrite()); - if (success) { - std::swap(pending_writes.back(), pending_writes[i]); - pending_writes.pop_back(); - } + queue->Push(std::move(batch)); + + bool flushed; + do { + ARROW_ASSIGN_OR_RAISE(flushed, queue->TryFlush()); + } while (!flushed); } } return Status::OK(); }); } - RETURN_NOT_OK(write_task_group->Finish()); + RETURN_NOT_OK(task_group->Finish()); - RETURN_NOT_OK(writers.FinishAll(task_group.get())); + task_group = scanner->context()->TaskGroup(); + for (const auto& part_queue : queues_) { + task_group->Append([&] { return part_queue.second->writer()->Finish(); }); + } return task_group->Finish(); } diff --git a/cpp/src/arrow/util/lock_free.h b/cpp/src/arrow/util/lock_free.h index 89fd1584776..244f751b46b 100644 --- a/cpp/src/arrow/util/lock_free.h +++ b/cpp/src/arrow/util/lock_free.h @@ -19,6 +19,7 @@ #include #include +#include #include "arrow/util/macros.h" @@ -30,23 +31,12 @@ namespace util { /// Two thread safe operations are supported: /// - Push a value onto the stack /// - move construct from another stack -/// -/// range-for compatilble iteration is provided for -/// convenience, but it is *not* thread safe. template class LockFreeStack { public: LockFreeStack() = default; - ~LockFreeStack() { - Node* next = nodes_.load(); - - while (next != NULLPTR) { - Node* node = next; - next = next->next; - delete node; - } - } + ~LockFreeStack() { Delete(); } LockFreeStack(const LockFreeStack&) = delete; LockFreeStack& operator=(const LockFreeStack&) = delete; @@ -68,16 +58,39 @@ class LockFreeStack { } while (!CompareExchange(&new_head->next, new_head)); } + /// range-for compatible iteration interface + /// NB: *not* thread safe struct iterator; iterator begin(); iterator end(); + std::vector ToVector() { + std::vector out; + for (auto&& element : *this) { + out.push_back(std::move(element)); + } + Delete(); + return out; + } + private: struct Node { T value; Node* next; }; + void Delete() { + Node* next = nodes_.load(); + + while (next != NULLPTR) { + Node* node = next; + next = next->next; + delete node; + } + + nodes_.store(NULLPTR); + } + bool CompareExchange(Node** expected, Node* desired) { return nodes_.compare_exchange_strong(*expected, desired); } diff --git a/cpp/src/arrow/util/lock_free_test.cc b/cpp/src/arrow/util/lock_free_test.cc index 3b027f8adcf..6913c2e9721 100644 --- a/cpp/src/arrow/util/lock_free_test.cc +++ b/cpp/src/arrow/util/lock_free_test.cc @@ -56,16 +56,6 @@ class TestLockFreeStack : public ::testing::Test { } } - template - std::vector ToVector(LockFreeStack stack) { - std::vector out; - for (auto&& element : stack) { - out.push_back(std::move(element)); - } - std::reverse(out.begin(), out.end()); - return out; - } - template std::vector Flatten(std::vector> nested) { size_t flat_size = 0; @@ -99,7 +89,7 @@ TEST_F(TestLockFreeStack, AppendOnly) { ASSERT_OK(task_group->Finish()); - std::vector actual = ToVector(std::move(ints)), expected; + std::vector actual = ints.ToVector(), expected; for (int i = 0; i < task_group->parallelism(); ++i) { AppendLots(&expected); @@ -120,7 +110,8 @@ TEST_F(TestLockFreeStack, ProduceThenConsumeMoved) { for (int i = 0; i < task_group->parallelism(); ++i) { task_group->Append([&, i] { AppendLots(&ints); - per_thread_actual[i] = ToVector(std::move(ints)); + auto local = std::move(ints); + per_thread_actual[i] = local.ToVector(); 
return Status::OK(); }); } diff --git a/cpp/src/arrow/util/mutex.cc b/cpp/src/arrow/util/mutex.cc index 9fc7d1c11c1..63ff026f4ee 100644 --- a/cpp/src/arrow/util/mutex.cc +++ b/cpp/src/arrow/util/mutex.cc @@ -34,8 +34,7 @@ Mutex::Guard::Guard(Mutex* locked) void Mutex::Guard::Unlock() { if (locked_) { - auto locked = std::move(locked_); - DCHECK_EQ(locked_, nullptr); + locked_.reset(); } } diff --git a/cpp/src/arrow/util/mutex.h b/cpp/src/arrow/util/mutex.h index 7e72bacc69a..8c9f0d40002 100644 --- a/cpp/src/arrow/util/mutex.h +++ b/cpp/src/arrow/util/mutex.h @@ -60,30 +60,5 @@ class ARROW_EXPORT Mutex { std::unique_ptr impl_; }; -/// A trivial Mutex, T pair -template -class Mutexed : Mutex { - public: - Mutexed() = default; - Mutexed(Mutexed&&) = default; - Mutexed& operator=(Mutexed&&) = default; - explicit Mutexed(T obj) : obj_(std::move(obj)) {} - - using Mutex::Lock; - using Mutex::TryLock; - - T& operator*() { return obj_; } - const T& operator*() const { return obj_; } - - T* operator->() { return &obj_; } - const T* operator->() const { return &obj_; } - - T& get() { return obj_; } - const T& get() const { return obj_; } - - private: - T obj_; -}; - } // namespace util } // namespace arrow From ed5ec52032951331d83c70d15626dbdc65256182 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 2 Oct 2020 17:05:29 -0400 Subject: [PATCH 18/34] cache queue mapping local to each thread --- cpp/src/arrow/dataset/file_base.cc | 144 ++++++++++++++++++----------- cpp/src/arrow/util/map.h | 12 +-- 2 files changed, 94 insertions(+), 62 deletions(-) diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index 2c42221b192..92164c15ec0 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -18,6 +18,8 @@ #include "arrow/dataset/file_base.h" #include +#include +#include #include #include "arrow/dataset/dataset_internal.h" @@ -172,16 +174,18 @@ Status ValidateBasenameTemplate(util::string_view basename_template) { class WriteQueue { public: - explicit WriteQueue(util::string_view partition_expression) - : partition_expression_(partition_expression) {} + WriteQueue(std::string partition_expression, + util::Mutex::Guard* wait_for_opened_writer_lock) + : partition_expression_(std::move(partition_expression)) { + *wait_for_opened_writer_lock = writer_mutex_.Lock(); + } // Push a batch into the writer's queue of pending writes. void Push(std::shared_ptr batch) { pending_.Push(std::move(batch)); } - // Take the writer's lock and flush its queue of pending writes. - // Returns false if the lock could not be acquired. The queue is guaranteed to be - // flushed if every thread which calls Push() subsequently calls TryFlush() until - // it returns true. + // Try to lock a writer and flush its queue of pending writes. Returns false if the + // lock could not be acquired. The queue is guaranteed to be flushed if every thread + // which calls Push() subsequently calls TryFlush() until it returns true. Result TryFlush() { if (auto lock = writer_mutex_.TryLock()) { RecordBatchVector pending_vector; @@ -202,14 +206,29 @@ class WriteQueue { return false; } - util::Mutex::Guard Lock() { return writer_mutex_.Lock(); } + // Flush a set of WriteQueues. If we find any queue to be locked we try flushing all + // the others before retrying to minimize busy waiting. 
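  // Caller-side usage, one FlushSet() call per scanned input batch (illustrative
  // sketch, not part of this patch; LookupQueueForPartition is a hypothetical name):
  //
  //   std::unordered_set<WriteQueue*> need_flushed;
  //   for (size_t i = 0; i < groups.batches.size(); ++i) {
  //     WriteQueue* queue = LookupQueueForPartition(groups.expressions[i]);
  //     queue->Push(std::move(groups.batches[i]));
  //     need_flushed.insert(queue);
  //   }
  //   // Flushing the whole set at once avoids spinning on any single writer
  //   // that another thread happens to be holding at that moment.
  //   RETURN_NOT_OK(WriteQueue::FlushSet(std::move(need_flushed)));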
+ static Status FlushSet(std::unordered_set set) { + while (!set.empty()) { + for (auto it = set.begin(); it != set.end();) { + WriteQueue* queue = *it; + ARROW_ASSIGN_OR_RAISE(bool flushed, queue->TryFlush()); + if (flushed) { + it = set.erase(it); + } else { + ++it; + } + } + } + return Status::OK(); + } const std::shared_ptr& writer() const { return writer_; } - Status OpenWriter(const FileSystemDatasetWriteOptions& write_options, int index, + Status OpenWriter(const FileSystemDatasetWriteOptions& write_options, size_t index, std::shared_ptr schema) { - auto dir = fs::internal::EnsureTrailingSlash(write_options.base_dir) + - partition_expression_.to_string(); + auto dir = + fs::internal::EnsureTrailingSlash(write_options.base_dir) + partition_expression_; auto basename = internal::Replace(write_options.basename_template, kIntegerToken, std::to_string(index)); @@ -237,28 +256,9 @@ class WriteQueue { util::LockFreeStack> pending_; // The (formatted) partition expression to which this queue corresponds - util::string_view partition_expression_; + std::string partition_expression_; }; -Result> MakeWriter( - const FileSystemDatasetWriteOptions& write_options, std::string part, - int auto_incremented, std::shared_ptr schema) { - auto dir = write_options.base_dir + part; - auto basename = internal::Replace(write_options.basename_template, kIntegerToken, - std::to_string(auto_incremented)); - if (!basename) { - return Status::Invalid("string interpolation of basename template failed"); - } - auto path = fs::internal::ConcatAbstractPath(dir, *basename); - - RETURN_NOT_OK(write_options.filesystem->CreateDir(dir)); - ARROW_ASSIGN_OR_RAISE(auto destination, - write_options.filesystem->OpenOutputStream(path)); - - return write_options.format()->MakeWriter(std::move(destination), std::move(schema), - write_options.file_write_options); -} - Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_options, std::shared_ptr scanner) { RETURN_NOT_OK(ValidateBasenameTemplate(write_options.basename_template)); @@ -298,15 +298,29 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio // Store a mapping from partitions (represened by their formatted partition expressions) // to a WriteQueue which flushes batches into that partition's output file. // `unordered_map` is not thread safe, so a lock on queues_mutex_ must be held whenever - // queues_ is accessed or modified. - util::Mutex queues_mutex_; - std::unordered_map> queues_; + // queues is accessed or modified. + util::Mutex queues_mutex; + std::unordered_map> queues; + + // Thread-local caches of partition-to-WriteQueue mapping. 
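      // The two-level lookup this enables, in isolation (illustrative sketch, not
      // part of this patch; the names mirror the locals declared below):
      //
      //   WriteQueue* LookupQueue(const std::string& part) {
      //     auto it = local_queues.find(part);   // hot path: no locking at all
      //     if (it != local_queues.end()) return it->second;
      //     auto lock = queues_mutex.Lock();     // slow path: shared map of queues
      //     WriteQueue* queue = /* find or create the entry in `queues` */;
      //     lock.Unlock();
      //     local_queues.emplace(part, queue);   // remember it for next time
      //     return queue;
      //   }
      //
      // Each scan task checks one such cache out of `caches` when it starts and
      // checks it back in when it finishes, so later batches touching the same
      // partitions never take queues_mutex.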
+ using QueueCache = std::unordered_map; + util::Mutex caches_mutex; + std::vector caches; auto fragment_for_task_it = fragment_for_task.begin(); for (const auto& scan_task : scan_tasks) { const Fragment* fragment = *fragment_for_task_it++; task_group->Append([&, scan_task, fragment] { + // check out a cache of `queues`, if available + auto caches_lock = caches_mutex.Lock(); + QueueCache local_queues; + if (!caches.empty()) { + local_queues = std::move(caches.back()); + caches.pop_back(); + } + caches_lock.Unlock(); + ARROW_ASSIGN_OR_RAISE(auto batches, scan_task->Execute()); for (auto maybe_batch : batches) { @@ -314,6 +328,7 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio ARROW_ASSIGN_OR_RAISE(auto groups, write_options.partitioning->Partition(batch)); batch.reset(); // drop to hopefully conserve memory + std::unordered_set need_flushed; for (size_t i = 0; i < groups.batches.size(); ++i) { AndExpression partition_expression(std::move(groups.expressions[i]), fragment->partition_expression()); @@ -322,49 +337,66 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio ARROW_ASSIGN_OR_RAISE(auto part, write_options.partitioning->Format(partition_expression)); - // Now we need to look up the relevant WriteQueue in `queues_` - auto queues_lock = queues_mutex_.Lock(); - util::Mutex::Guard new_queue_lock; - - WriteQueue* queue = internal::GetOrInsertGenerated( - &queues_, std::move(part), - [&](const std::string& part) { - auto queue = internal::make_unique(part); - new_queue_lock = queue->Lock(); - return queue; - }) - ->second.get(); - - if (new_queue_lock) { - size_t queue_index = queues_.size() - 1; + util::Mutex::Guard queues_lock, wait_for_opened_writer_lock; + + WriteQueue* queue = + // try to lookup the relevant WriteQueue up in our local cache + internal::GetOrInsertGenerated( + &local_queues, part, + [&](const std::string&) { + // local lookup failed, fall back to lookup in `queues` + queues_lock = queues_mutex.Lock(); + return internal::GetOrInsertGenerated( + &queues, std::move(part), + [&](const std::string& part) { + // lookup in `queues` also failed, + // generate a new WriteQueue + return internal::make_unique( + part, &wait_for_opened_writer_lock); + }) + ->second.get(); + }) + ->second; + + if (wait_for_opened_writer_lock) { + size_t queue_index = queues.size() - 1; // We have openned a new WriteQueue for `part` and must set up its FileWriter. // That could be slow, so we'd like to release queues_lock now, allowing other // threads to look up already-open queues. This is safe since we are mutating - // an element of queues_ in place, which does not conflict with access to any + // an element of `queues` in place, which does not conflict with access to any // other element. However, we do need to keep new_queue_lock while we open the // FileWriter to prevent other threads from using the writer before it has // been constructed. 
queues_lock.Unlock(); RETURN_NOT_OK(queue->OpenWriter(write_options, queue_index, batch->schema())); - new_queue_lock.Unlock(); + wait_for_opened_writer_lock.Unlock(); } - queue->Push(std::move(batch)); + if (queues_lock) { + queues_lock.Unlock(); + } - bool flushed; - do { - ARROW_ASSIGN_OR_RAISE(flushed, queue->TryFlush()); - } while (!flushed); + queue->Push(std::move(batch)); + need_flushed.insert(queue); } + + // flush all touched WriteQueues + RETURN_NOT_OK(WriteQueue::FlushSet(std::move(need_flushed))); } + // check cache back in + caches_lock = caches_mutex.Lock(); + auto larger_or_same = std::lower_bound( + caches.begin(), caches.end(), local_queues, + [](const QueueCache& l, const QueueCache& r) { return l.size() < r.size(); }); + caches.insert(larger_or_same, std::move(local_queues)); return Status::OK(); }); } RETURN_NOT_OK(task_group->Finish()); task_group = scanner->context()->TaskGroup(); - for (const auto& part_queue : queues_) { + for (const auto& part_queue : queues) { task_group->Append([&] { return part_queue.second->writer()->Finish(); }); } return task_group->Finish(); diff --git a/cpp/src/arrow/util/map.h b/cpp/src/arrow/util/map.h index 66325f5e13e..2ca1cf4c82d 100644 --- a/cpp/src/arrow/util/map.h +++ b/cpp/src/arrow/util/map.h @@ -29,11 +29,11 @@ namespace internal { /// will be returned. If `key` does not exist in the container, `gen(key)` will be /// invoked and its return value inserted. template -auto GetOrInsertGenerated(Map* map, Key&& key, Gen&& gen) +auto GetOrInsertGenerated(Map* map, Key key, Gen&& gen) -> decltype(map->begin()->second = gen(map->begin()->first), map->begin()) { - decltype(gen(map->begin()->first)) placeholder; + decltype(gen(map->begin()->first)) placeholder{}; - auto it_success = map->emplace(std::forward(key), std::move(placeholder)); + auto it_success = map->emplace(std::move(key), std::move(placeholder)); if (it_success.second) { // insertion of placeholder succeeded, overwrite it with gen() const auto& inserted_key = it_success.first->first; @@ -44,12 +44,12 @@ auto GetOrInsertGenerated(Map* map, Key&& key, Gen&& gen) } template -auto GetOrInsertGenerated(Map* map, Key&& key, Gen&& gen) +auto GetOrInsertGenerated(Map* map, Key key, Gen&& gen) -> Resultbegin()->second = gen(map->begin()->first).ValueOrDie(), map->begin())> { - decltype(gen(map->begin()->first).ValueOrDie()) placeholder; + decltype(gen(map->begin()->first).ValueOrDie()) placeholder{}; - auto it_success = map->emplace(std::forward(key), std::move(placeholder)); + auto it_success = map->emplace(std::move(key), std::move(placeholder)); if (it_success.second) { // insertion of placeholder succeeded, overwrite it with gen() const auto& inserted_key = it_success.first->first; From 1a541d65512601cad6a188afc24a41396bdc320e Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 2 Oct 2020 18:42:05 -0400 Subject: [PATCH 19/34] lint, simplify WriteQueue storage, try workaround for atomic::atomic() --- cpp/src/arrow/dataset/file_base.cc | 35 ++++++++++++++++++++---------- cpp/src/arrow/util/lock_free.h | 5 ++--- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index 92164c15ec0..7e9a25a06c1 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -18,6 +18,8 @@ #include "arrow/dataset/file_base.h" #include +#include +#include #include #include #include @@ -248,6 +250,8 @@ class WriteQueue { return Status::OK(); } + using Set = std::unordered_map; + 
private: // FileWriters are not required to be thread safe, so they must be guarded with a mutex. util::Mutex writer_mutex_; @@ -295,17 +299,25 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio } } + // WriteQueues are stored in this deque until writing is completed and are otherwise + // referenced by non-owning pointers. + std::deque queues_storage; + // Store a mapping from partitions (represened by their formatted partition expressions) // to a WriteQueue which flushes batches into that partition's output file. // `unordered_map` is not thread safe, so a lock on queues_mutex_ must be held whenever // queues is accessed or modified. util::Mutex queues_mutex; - std::unordered_map> queues; + WriteQueue::Set queues; // Thread-local caches of partition-to-WriteQueue mapping. - using QueueCache = std::unordered_map; util::Mutex caches_mutex; - std::vector caches; + struct SortByCacheSize { + bool operator()(const WriteQueue::Set& l, const WriteQueue::Set& r) { + return l.size() < r.size(); + } + }; + std::set caches; auto fragment_for_task_it = fragment_for_task.begin(); for (const auto& scan_task : scan_tasks) { @@ -313,11 +325,12 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio task_group->Append([&, scan_task, fragment] { // check out a cache of `queues`, if available + WriteQueue::Set local_queues; auto caches_lock = caches_mutex.Lock(); - QueueCache local_queues; if (!caches.empty()) { - local_queues = std::move(caches.back()); - caches.pop_back(); + auto back = caches.end(); + local_queues = std::move(*--back); + caches.erase(back); } caches_lock.Unlock(); @@ -351,10 +364,11 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio [&](const std::string& part) { // lookup in `queues` also failed, // generate a new WriteQueue - return internal::make_unique( + queues_storage.emplace_back( part, &wait_for_opened_writer_lock); + return &queues_storage.back(); }) - ->second.get(); + ->second; }) ->second; @@ -386,10 +400,7 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio // check cache back in caches_lock = caches_mutex.Lock(); - auto larger_or_same = std::lower_bound( - caches.begin(), caches.end(), local_queues, - [](const QueueCache& l, const QueueCache& r) { return l.size() < r.size(); }); - caches.insert(larger_or_same, std::move(local_queues)); + caches.insert(std::move(local_queues)); return Status::OK(); }); } diff --git a/cpp/src/arrow/util/lock_free.h b/cpp/src/arrow/util/lock_free.h index 244f751b46b..cd76bd30803 100644 --- a/cpp/src/arrow/util/lock_free.h +++ b/cpp/src/arrow/util/lock_free.h @@ -34,7 +34,7 @@ namespace util { template class LockFreeStack { public: - LockFreeStack() = default; + LockFreeStack() { nodes_.store(NULLPTR); } ~LockFreeStack() { Delete(); } @@ -95,7 +95,7 @@ class LockFreeStack { return nodes_.compare_exchange_strong(*expected, desired); } - std::atomic nodes_{NULLPTR}; + std::atomic nodes_; }; template @@ -124,4 +124,3 @@ typename LockFreeStack::iterator LockFreeStack::end() { } // namespace util } // namespace arrow - From 95e548e05c0bd06ebf3a97793c5f8fb8cb9a7497 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Sat, 3 Oct 2020 09:47:02 -0400 Subject: [PATCH 20/34] comparator must be const --- cpp/src/arrow/dataset/file_base.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index 7e9a25a06c1..5258cd18615 100644 --- 
a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -313,7 +313,7 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio // Thread-local caches of partition-to-WriteQueue mapping. util::Mutex caches_mutex; struct SortByCacheSize { - bool operator()(const WriteQueue::Set& l, const WriteQueue::Set& r) { + bool operator()(const WriteQueue::Set& l, const WriteQueue::Set& r) const { return l.size() < r.size(); } }; From 7fd7185e7a2ad72b56097dffa795fc80f0db27e4 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Sat, 3 Oct 2020 20:32:17 -0400 Subject: [PATCH 21/34] simplify thread local caching --- cpp/src/arrow/dataset/file_base.cc | 87 +++++++++++++----------------- 1 file changed, 38 insertions(+), 49 deletions(-) diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index 5258cd18615..59ed38f2700 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -19,7 +19,6 @@ #include #include -#include #include #include #include @@ -176,9 +175,9 @@ Status ValidateBasenameTemplate(util::string_view basename_template) { class WriteQueue { public: - WriteQueue(std::string partition_expression, + WriteQueue(std::string partition_expression, size_t index, util::Mutex::Guard* wait_for_opened_writer_lock) - : partition_expression_(std::move(partition_expression)) { + : partition_expression_(std::move(partition_expression)), index_(index) { *wait_for_opened_writer_lock = writer_mutex_.Lock(); } @@ -227,13 +226,13 @@ class WriteQueue { const std::shared_ptr& writer() const { return writer_; } - Status OpenWriter(const FileSystemDatasetWriteOptions& write_options, size_t index, + Status OpenWriter(const FileSystemDatasetWriteOptions& write_options, std::shared_ptr schema) { auto dir = fs::internal::EnsureTrailingSlash(write_options.base_dir) + partition_expression_; auto basename = internal::Replace(write_options.basename_template, kIntegerToken, - std::to_string(index)); + std::to_string(index_)); if (!basename) { return Status::Invalid("string interpolation of basename template failed"); } @@ -261,6 +260,8 @@ class WriteQueue { // The (formatted) partition expression to which this queue corresponds std::string partition_expression_; + + size_t index_; }; Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_options, @@ -312,12 +313,7 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio // Thread-local caches of partition-to-WriteQueue mapping. 
util::Mutex caches_mutex; - struct SortByCacheSize { - bool operator()(const WriteQueue::Set& l, const WriteQueue::Set& r) const { - return l.size() < r.size(); - } - }; - std::set caches; + std::vector caches(task_group->parallelism()); auto fragment_for_task_it = fragment_for_task.begin(); for (const auto& scan_task : scan_tasks) { @@ -325,13 +321,9 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio task_group->Append([&, scan_task, fragment] { // check out a cache of `queues`, if available - WriteQueue::Set local_queues; auto caches_lock = caches_mutex.Lock(); - if (!caches.empty()) { - auto back = caches.end(); - local_queues = std::move(*--back); - caches.erase(back); - } + WriteQueue::Set local_queues = std::move(caches.back()); + caches.pop_back(); caches_lock.Unlock(); ARROW_ASSIGN_OR_RAISE(auto batches, scan_task->Execute()); @@ -350,46 +342,43 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio ARROW_ASSIGN_OR_RAISE(auto part, write_options.partitioning->Format(partition_expression)); - util::Mutex::Guard queues_lock, wait_for_opened_writer_lock; + util::Mutex::Guard wait_for_opened_writer_lock; + + using internal::GetOrInsertGenerated; WriteQueue* queue = // try to lookup the relevant WriteQueue up in our local cache - internal::GetOrInsertGenerated( - &local_queues, part, - [&](const std::string&) { - // local lookup failed, fall back to lookup in `queues` - queues_lock = queues_mutex.Lock(); - return internal::GetOrInsertGenerated( - &queues, std::move(part), - [&](const std::string& part) { - // lookup in `queues` also failed, - // generate a new WriteQueue - queues_storage.emplace_back( - part, &wait_for_opened_writer_lock); - return &queues_storage.back(); - }) - ->second; - }) - ->second; + GetOrInsertGenerated(&local_queues, part, [&](const std::string&) { + // local lookup failed, fall back to lookup in `queues` + auto queues_lock = queues_mutex.Lock(); + + return GetOrInsertGenerated(&queues, std::move(part), + [&](const std::string& part) { + // lookup in `queues` also failed, + // generate a new WriteQueue + size_t queue_index = queues.size() - 1; + + queues_storage.emplace_back( + part, queue_index, + &wait_for_opened_writer_lock); + + return &queues_storage.back(); + }) + ->second; + })->second; if (wait_for_opened_writer_lock) { - size_t queue_index = queues.size() - 1; // We have openned a new WriteQueue for `part` and must set up its FileWriter. - // That could be slow, so we'd like to release queues_lock now, allowing other - // threads to look up already-open queues. This is safe since we are mutating - // an element of `queues` in place, which does not conflict with access to any - // other element. However, we do need to keep new_queue_lock while we open the - // FileWriter to prevent other threads from using the writer before it has - // been constructed. - queues_lock.Unlock(); - RETURN_NOT_OK(queue->OpenWriter(write_options, queue_index, batch->schema())); + // That could be slow, so we've already released queues_lock, allowing + // other threads to look up already-open queues. This is safe since we are + // mutating an element of `queues` in place, which does not conflict with + // access to any other element. However, we do need to continue holding + // wait_for_opened_writer_lock while we open the FileWriter to prevent other + // threads from using the writer before it has been constructed. 
+ RETURN_NOT_OK(queue->OpenWriter(write_options, batch->schema())); wait_for_opened_writer_lock.Unlock(); } - if (queues_lock) { - queues_lock.Unlock(); - } - queue->Push(std::move(batch)); need_flushed.insert(queue); } @@ -400,7 +389,7 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio // check cache back in caches_lock = caches_mutex.Lock(); - caches.insert(std::move(local_queues)); + caches.push_back(std::move(local_queues)); return Status::OK(); }); } From dde5eed100427d594a8f8539cc087e0d1031b0b0 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 5 Oct 2020 08:34:48 -0400 Subject: [PATCH 22/34] simplify: revert local queue lookup caching --- cpp/src/arrow/dataset/file_base.cc | 52 +++++++++++------------------- 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index 59ed38f2700..8a29135c476 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -305,27 +305,17 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio std::deque queues_storage; // Store a mapping from partitions (represened by their formatted partition expressions) - // to a WriteQueue which flushes batches into that partition's output file. - // `unordered_map` is not thread safe, so a lock on queues_mutex_ must be held whenever - // queues is accessed or modified. + // to a WriteQueue which flushes batches into that partition's output file. In principle + // any thread could produce a batch for any partition, so each task alternates between + // pushing batches and flushing them to disk. util::Mutex queues_mutex; WriteQueue::Set queues; - // Thread-local caches of partition-to-WriteQueue mapping. 
- util::Mutex caches_mutex; - std::vector caches(task_group->parallelism()); - auto fragment_for_task_it = fragment_for_task.begin(); for (const auto& scan_task : scan_tasks) { const Fragment* fragment = *fragment_for_task_it++; task_group->Append([&, scan_task, fragment] { - // check out a cache of `queues`, if available - auto caches_lock = caches_mutex.Lock(); - WriteQueue::Set local_queues = std::move(caches.back()); - caches.pop_back(); - caches_lock.Unlock(); - ARROW_ASSIGN_OR_RAISE(auto batches, scan_task->Execute()); for (auto maybe_batch : batches) { @@ -346,26 +336,25 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio using internal::GetOrInsertGenerated; - WriteQueue* queue = - // try to lookup the relevant WriteQueue up in our local cache - GetOrInsertGenerated(&local_queues, part, [&](const std::string&) { - // local lookup failed, fall back to lookup in `queues` - auto queues_lock = queues_mutex.Lock(); + WriteQueue* queue; + { + // lookup the queue to which batch should be appended + auto queues_lock = queues_mutex.Lock(); - return GetOrInsertGenerated(&queues, std::move(part), - [&](const std::string& part) { - // lookup in `queues` also failed, - // generate a new WriteQueue - size_t queue_index = queues.size() - 1; + queue = GetOrInsertGenerated(&queues, std::move(part), + [&](const std::string& part) { + // lookup in `queues` also failed, + // generate a new WriteQueue + size_t queue_index = queues.size() - 1; - queues_storage.emplace_back( - part, queue_index, - &wait_for_opened_writer_lock); + queues_storage.emplace_back( + part, queue_index, + &wait_for_opened_writer_lock); - return &queues_storage.back(); - }) - ->second; - })->second; + return &queues_storage.back(); + }) + ->second; + } if (wait_for_opened_writer_lock) { // We have openned a new WriteQueue for `part` and must set up its FileWriter. @@ -387,9 +376,6 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio RETURN_NOT_OK(WriteQueue::FlushSet(std::move(need_flushed))); } - // check cache back in - caches_lock = caches_mutex.Lock(); - caches.push_back(std::move(local_queues)); return Status::OK(); }); } From dfc2291343eec2da7606511e8814a55451726941 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 5 Oct 2020 09:02:04 -0400 Subject: [PATCH 23/34] revert lock_free --- cpp/src/arrow/dataset/file_base.cc | 68 +++++------ cpp/src/arrow/util/CMakeLists.txt | 1 - cpp/src/arrow/util/lock_free.h | 126 -------------------- cpp/src/arrow/util/lock_free_test.cc | 168 --------------------------- 4 files changed, 29 insertions(+), 334 deletions(-) delete mode 100644 cpp/src/arrow/util/lock_free.h delete mode 100644 cpp/src/arrow/util/lock_free_test.cc diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index 8a29135c476..aa9efb0dfa6 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -33,7 +33,6 @@ #include "arrow/io/interfaces.h" #include "arrow/io/memory.h" #include "arrow/util/iterator.h" -#include "arrow/util/lock_free.h" #include "arrow/util/logging.h" #include "arrow/util/make_unique.h" #include "arrow/util/map.h" @@ -173,6 +172,8 @@ Status ValidateBasenameTemplate(util::string_view basename_template) { return Status::OK(); } +/// WriteQueue allows batches to be pushed from multiple threads while another thread +/// flushes some to disk. 
class WriteQueue { public: WriteQueue(std::string partition_expression, size_t index, @@ -182,43 +183,30 @@ class WriteQueue { } // Push a batch into the writer's queue of pending writes. - void Push(std::shared_ptr batch) { pending_.Push(std::move(batch)); } - - // Try to lock a writer and flush its queue of pending writes. Returns false if the - // lock could not be acquired. The queue is guaranteed to be flushed if every thread - // which calls Push() subsequently calls TryFlush() until it returns true. - Result TryFlush() { - if (auto lock = writer_mutex_.TryLock()) { - RecordBatchVector pending_vector; - do { - // Move pending to a local variable exclusively viewed by this thread - auto pending = std::move(pending_); - pending_vector = pending.ToVector(); - - for (auto it = pending_vector.rbegin(); it != pending_vector.rend(); ++it) { - RETURN_NOT_OK(writer_->Write(*it)); - } - - // Since we're holding this writer's lock anyway we may as well check for batches - // added while we were writing - } while (!pending_vector.empty()); - return true; - } - return false; + void Push(std::shared_ptr batch) { + auto push_lock = push_mutex_.Lock(); + pending_.push_back(std::move(batch)); } - // Flush a set of WriteQueues. If we find any queue to be locked we try flushing all - // the others before retrying to minimize busy waiting. - static Status FlushSet(std::unordered_set set) { - while (!set.empty()) { - for (auto it = set.begin(); it != set.end();) { - WriteQueue* queue = *it; - ARROW_ASSIGN_OR_RAISE(bool flushed, queue->TryFlush()); - if (flushed) { - it = set.erase(it); - } else { - ++it; + // Flush all pending batches, or return immediately if another thread is already + // flushing this queue. + Status Flush() { + if (auto writer_lock = writer_mutex_.TryLock()) { + while (true) { + std::shared_ptr batch; + { + auto push_lock = push_mutex_.Lock(); + if (pending_.empty()) { + // Ensure the writer_lock is released before the push_lock. Otherwise another + // thread might successfully Push() a batch but then fail to Flush() it since + // the writer_lock is still held, leaving an unflushed batch in pending_. + writer_lock.Unlock(); + break; + } + batch = std::move(pending_.front()); + pending_.pop_front(); } + RETURN_NOT_OK(writer_->Write(batch)); } } return Status::OK(); @@ -252,11 +240,11 @@ class WriteQueue { using Set = std::unordered_map; private: - // FileWriters are not required to be thread safe, so they must be guarded with a mutex. 
util::Mutex writer_mutex_; std::shared_ptr writer_; - // A stack into which batches to write will be pushed - util::LockFreeStack> pending_; + + util::Mutex push_mutex_; + std::deque> pending_; // The (formatted) partition expression to which this queue corresponds std::string partition_expression_; @@ -373,7 +361,9 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio } // flush all touched WriteQueues - RETURN_NOT_OK(WriteQueue::FlushSet(std::move(need_flushed))); + for (auto queue : need_flushed) { + RETURN_NOT_OK(queue->Flush()); + } } return Status::OK(); diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 46b79bd0ec5..c968cdd1a43 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -66,7 +66,6 @@ add_arrow_test(utility-test add_arrow_test(threading-utility-test SOURCES future_test - lock_free_test task_group_test thread_pool_test) diff --git a/cpp/src/arrow/util/lock_free.h b/cpp/src/arrow/util/lock_free.h deleted file mode 100644 index cd76bd30803..00000000000 --- a/cpp/src/arrow/util/lock_free.h +++ /dev/null @@ -1,126 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "arrow/util/macros.h" - -namespace arrow { -namespace util { - -/// A lock free container with FILO storage order. 
-/// -/// Two thread safe operations are supported: -/// - Push a value onto the stack -/// - move construct from another stack -template -class LockFreeStack { - public: - LockFreeStack() { nodes_.store(NULLPTR); } - - ~LockFreeStack() { Delete(); } - - LockFreeStack(const LockFreeStack&) = delete; - LockFreeStack& operator=(const LockFreeStack&) = delete; - LockFreeStack& operator=(LockFreeStack&& other) = delete; - - LockFreeStack(LockFreeStack&& other) { - Node* other_nodes = other.nodes_.load(); - - do { - } while (!other.CompareExchange(&other_nodes, NULLPTR)); - - nodes_.store(other_nodes); - } - - void Push(T value) { - Node* new_head = new Node{std::move(value), nodes_.load()}; - - do { - } while (!CompareExchange(&new_head->next, new_head)); - } - - /// range-for compatible iteration interface - /// NB: *not* thread safe - struct iterator; - iterator begin(); - iterator end(); - - std::vector ToVector() { - std::vector out; - for (auto&& element : *this) { - out.push_back(std::move(element)); - } - Delete(); - return out; - } - - private: - struct Node { - T value; - Node* next; - }; - - void Delete() { - Node* next = nodes_.load(); - - while (next != NULLPTR) { - Node* node = next; - next = next->next; - delete node; - } - - nodes_.store(NULLPTR); - } - - bool CompareExchange(Node** expected, Node* desired) { - return nodes_.compare_exchange_strong(*expected, desired); - } - - std::atomic nodes_; -}; - -template -struct LockFreeStack::iterator { - bool operator!=(iterator other) const { return node != other.node; } - - T& operator*() const { return node->value; } - - iterator& operator++() { - node = node->next; - return *this; - } - - Node* node; -}; - -template -typename LockFreeStack::iterator LockFreeStack::begin() { - return {nodes_.load()}; -} - -template -typename LockFreeStack::iterator LockFreeStack::end() { - return {NULLPTR}; -} - -} // namespace util -} // namespace arrow diff --git a/cpp/src/arrow/util/lock_free_test.cc b/cpp/src/arrow/util/lock_free_test.cc deleted file mode 100644 index 6913c2e9721..00000000000 --- a/cpp/src/arrow/util/lock_free_test.cc +++ /dev/null @@ -1,168 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include -#include - -#include -#include - -#include "arrow/status.h" -#include "arrow/testing/gtest_util.h" -#include "arrow/util/io_util.h" -#include "arrow/util/lock_free.h" -#include "arrow/util/task_group.h" -#include "arrow/util/thread_pool.h" - -namespace arrow { - -using internal::TaskGroup; - -namespace util { - -static constexpr int kLots = 512; -// static constexpr int kLots = 16 * 1024; - -class TestLockFreeStack : public ::testing::Test { - public: - void TearDown() override { - fflush(stdout); - fflush(stderr); - } - - void AppendLots(LockFreeStack* ints) { - for (int j = 0; j < kLots; ++j) { - ints->Push(j); - } - } - - void AppendLots(std::vector* ints) { - for (int j = 0; j < kLots; ++j) { - ints->push_back(j); - } - } - - template - std::vector Flatten(std::vector> nested) { - size_t flat_size = 0; - for (auto&& v : nested) { - flat_size += v.size(); - } - - std::vector flat(flat_size); - auto it = flat.begin(); - for (auto&& v : nested) { - it = std::move(v.begin(), v.end(), it); - } - - EXPECT_EQ(it, flat.end()); - return flat; - } - - std::shared_ptr task_group = - TaskGroup::MakeThreaded(::arrow::internal::GetCpuThreadPool()); -}; - -TEST_F(TestLockFreeStack, AppendOnly) { - LockFreeStack ints; - - for (int i = 0; i < task_group->parallelism(); ++i) { - task_group->Append([&] { - AppendLots(&ints); - return Status::OK(); - }); - } - - ASSERT_OK(task_group->Finish()); - - std::vector actual = ints.ToVector(), expected; - - for (int i = 0; i < task_group->parallelism(); ++i) { - AppendLots(&expected); - } - - for (auto v : {&actual, &expected}) { - std::sort(v->begin(), v->end()); - } - - EXPECT_THAT(actual, testing::ElementsAreArray(expected)); -} - -TEST_F(TestLockFreeStack, ProduceThenConsumeMoved) { - LockFreeStack ints; - - std::vector> per_thread_actual(task_group->parallelism()); - - for (int i = 0; i < task_group->parallelism(); ++i) { - task_group->Append([&, i] { - AppendLots(&ints); - auto local = std::move(ints); - per_thread_actual[i] = local.ToVector(); - return Status::OK(); - }); - } - - ASSERT_OK(task_group->Finish()); - - std::vector actual = Flatten(std::move(per_thread_actual)), expected; - - for (int i = 0; i < task_group->parallelism(); ++i) { - AppendLots(&expected); - } - - for (auto v : {&actual, &expected}) { - std::sort(v->begin(), v->end()); - } - - EXPECT_THAT(actual, testing::ElementsAreArray(expected)); -} - -TEST_F(TestLockFreeStack, AlternatingProduceAndConsumeMoved) { - LockFreeStack ints; - - std::vector> per_thread_actual(task_group->parallelism()); - - for (int i = 0; i < task_group->parallelism(); ++i) { - task_group->Append([&, i] { - for (int j = 0; j < kLots; ++j) { - ints.Push(j); - auto local = std::move(ints); - for (int got : local) { - per_thread_actual[i].push_back(got); - } - } - return Status::OK(); - }); - } - - ASSERT_OK(task_group->Finish()); - - std::vector actual = Flatten(std::move(per_thread_actual)), expected; - - for (int i = 0; i < task_group->parallelism(); ++i) { - AppendLots(&expected); - } - - for (auto v : {&actual, &expected}) { - std::sort(v->begin(), v->end()); - } - - EXPECT_THAT(actual, testing::ElementsAreArray(expected)); -} - -} // namespace util -} // namespace arrow From a3454d96209c11dc9ce3994abdfe5c2a5fdaa555 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 5 Oct 2020 09:07:07 -0400 Subject: [PATCH 24/34] more exact typing in GetOrInsertGenerated --- cpp/src/arrow/dataset/file_base.cc | 22 ++++++++++------------ cpp/src/arrow/util/map.h | 8 ++++---- 2 files changed, 14 
insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index aa9efb0dfa6..c930d73d391 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -322,25 +322,23 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio util::Mutex::Guard wait_for_opened_writer_lock; - using internal::GetOrInsertGenerated; - WriteQueue* queue; { // lookup the queue to which batch should be appended auto queues_lock = queues_mutex.Lock(); - queue = GetOrInsertGenerated(&queues, std::move(part), - [&](const std::string& part) { - // lookup in `queues` also failed, - // generate a new WriteQueue - size_t queue_index = queues.size() - 1; + queue = internal::GetOrInsertGenerated( + &queues, std::move(part), + [&](const std::string& emplaced_part) { + // lookup in `queues` also failed, + // generate a new WriteQueue + size_t queue_index = queues.size() - 1; - queues_storage.emplace_back( - part, queue_index, - &wait_for_opened_writer_lock); + queues_storage.emplace_back(emplaced_part, queue_index, + &wait_for_opened_writer_lock); - return &queues_storage.back(); - }) + return &queues_storage.back(); + }) ->second; } diff --git a/cpp/src/arrow/util/map.h b/cpp/src/arrow/util/map.h index 2ca1cf4c82d..5523909061d 100644 --- a/cpp/src/arrow/util/map.h +++ b/cpp/src/arrow/util/map.h @@ -28,8 +28,8 @@ namespace internal { /// std::unordered_map. If `key` exists in the container, an iterator to that pair /// will be returned. If `key` does not exist in the container, `gen(key)` will be /// invoked and its return value inserted. -template -auto GetOrInsertGenerated(Map* map, Key key, Gen&& gen) +template +auto GetOrInsertGenerated(Map* map, typename Map::key_type key, Gen&& gen) -> decltype(map->begin()->second = gen(map->begin()->first), map->begin()) { decltype(gen(map->begin()->first)) placeholder{}; @@ -43,8 +43,8 @@ auto GetOrInsertGenerated(Map* map, Key key, Gen&& gen) return it_success.first; } -template -auto GetOrInsertGenerated(Map* map, Key key, Gen&& gen) +template +auto GetOrInsertGenerated(Map* map, typename Map::key_type key, Gen&& gen) -> Resultbegin()->second = gen(map->begin()->first).ValueOrDie(), map->begin())> { decltype(gen(map->begin()->first).ValueOrDie()) placeholder{}; From 448e04e9d291e793d7347f1ccb6be0db501a42ae Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 5 Oct 2020 17:01:05 -0400 Subject: [PATCH 25/34] move lazy initialization locking into Flush() --- cpp/src/arrow/dataset/file_base.cc | 61 +++++++++++------------------- 1 file changed, 23 insertions(+), 38 deletions(-) diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index c930d73d391..79083f73199 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -177,10 +177,10 @@ Status ValidateBasenameTemplate(util::string_view basename_template) { class WriteQueue { public: WriteQueue(std::string partition_expression, size_t index, - util::Mutex::Guard* wait_for_opened_writer_lock) - : partition_expression_(std::move(partition_expression)), index_(index) { - *wait_for_opened_writer_lock = writer_mutex_.Lock(); - } + std::shared_ptr schema) + : partition_expression_(std::move(partition_expression)), + index_(index), + schema_(std::move(schema)) {} // Push a batch into the writer's queue of pending writes. 
void Push(std::shared_ptr batch) { @@ -190,8 +190,14 @@ class WriteQueue { // Flush all pending batches, or return immediately if another thread is already // flushing this queue. - Status Flush() { + Status Flush(const FileSystemDatasetWriteOptions& write_options) { if (auto writer_lock = writer_mutex_.TryLock()) { + if (writer_ == nullptr) { + // FileWriters are opened lazily to avoid blocking while holding the lock on the + // scan-wide queue set + RETURN_NOT_OK(OpenWriter(write_options)); + } + while (true) { std::shared_ptr batch; { @@ -214,8 +220,8 @@ class WriteQueue { const std::shared_ptr& writer() const { return writer_; } - Status OpenWriter(const FileSystemDatasetWriteOptions& write_options, - std::shared_ptr schema) { + private: + Status OpenWriter(const FileSystemDatasetWriteOptions& write_options) { auto dir = fs::internal::EnsureTrailingSlash(write_options.base_dir) + partition_expression_; @@ -231,15 +237,12 @@ class WriteQueue { ARROW_ASSIGN_OR_RAISE(auto destination, write_options.filesystem->OpenOutputStream(path)); - ARROW_ASSIGN_OR_RAISE(writer_, write_options.format()->MakeWriter( - std::move(destination), std::move(schema), - write_options.file_write_options)); + ARROW_ASSIGN_OR_RAISE( + writer_, write_options.format()->MakeWriter(std::move(destination), schema_, + write_options.file_write_options)); return Status::OK(); } - using Set = std::unordered_map; - - private: util::Mutex writer_mutex_; std::shared_ptr writer_; @@ -250,6 +253,8 @@ class WriteQueue { std::string partition_expression_; size_t index_; + + std::shared_ptr schema_; }; Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_options, @@ -288,16 +293,12 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio } } - // WriteQueues are stored in this deque until writing is completed and are otherwise - // referenced by non-owning pointers. - std::deque queues_storage; - // Store a mapping from partitions (represened by their formatted partition expressions) // to a WriteQueue which flushes batches into that partition's output file. In principle // any thread could produce a batch for any partition, so each task alternates between // pushing batches and flushing them to disk. util::Mutex queues_mutex; - WriteQueue::Set queues; + std::unordered_map> queues; auto fragment_for_task_it = fragment_for_task.begin(); for (const auto& scan_task : scan_tasks) { @@ -320,8 +321,6 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio ARROW_ASSIGN_OR_RAISE(auto part, write_options.partitioning->Format(partition_expression)); - util::Mutex::Guard wait_for_opened_writer_lock; - WriteQueue* queue; { // lookup the queue to which batch should be appended @@ -334,24 +333,10 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio // generate a new WriteQueue size_t queue_index = queues.size() - 1; - queues_storage.emplace_back(emplaced_part, queue_index, - &wait_for_opened_writer_lock); - - return &queues_storage.back(); + return internal::make_unique( + emplaced_part, queue_index, batch->schema()); }) - ->second; - } - - if (wait_for_opened_writer_lock) { - // We have openned a new WriteQueue for `part` and must set up its FileWriter. - // That could be slow, so we've already released queues_lock, allowing - // other threads to look up already-open queues. This is safe since we are - // mutating an element of `queues` in place, which does not conflict with - // access to any other element. 
However, we do need to continue holding - // wait_for_opened_writer_lock while we open the FileWriter to prevent other - // threads from using the writer before it has been constructed. - RETURN_NOT_OK(queue->OpenWriter(write_options, batch->schema())); - wait_for_opened_writer_lock.Unlock(); + ->second.get(); } queue->Push(std::move(batch)); @@ -360,7 +345,7 @@ Status FileSystemDataset::Write(const FileSystemDatasetWriteOptions& write_optio // flush all touched WriteQueues for (auto queue : need_flushed) { - RETURN_NOT_OK(queue->Flush()); + RETURN_NOT_OK(queue->Flush(write_options)); } } From 87d863a78bde186a0542b3b0fb9095a2fc778066 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 5 Oct 2020 17:05:05 -0400 Subject: [PATCH 26/34] fix comment --- cpp/src/arrow/dataset/file_base.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/dataset/file_base.cc b/cpp/src/arrow/dataset/file_base.cc index 79083f73199..ef044ea3a03 100644 --- a/cpp/src/arrow/dataset/file_base.cc +++ b/cpp/src/arrow/dataset/file_base.cc @@ -193,8 +193,7 @@ class WriteQueue { Status Flush(const FileSystemDatasetWriteOptions& write_options) { if (auto writer_lock = writer_mutex_.TryLock()) { if (writer_ == nullptr) { - // FileWriters are opened lazily to avoid blocking while holding the lock on the - // scan-wide queue set + // FileWriters are opened lazily to avoid blocking access to a scan-wide queue set RETURN_NOT_OK(OpenWriter(write_options)); } From 7db8bf33aa8393bfc7f31517aea0681e1963fdc1 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 6 Oct 2020 09:54:57 -0400 Subject: [PATCH 27/34] address review comments --- cpp/src/arrow/dataset/file_base.h | 1 - cpp/src/arrow/dataset/file_ipc.cc | 6 +++--- cpp/src/arrow/dataset/file_parquet.cc | 24 ++++++++++++------------ cpp/src/arrow/util/mutex.cc | 18 +++++++----------- cpp/src/arrow/util/mutex.h | 2 +- cpp/src/arrow/util/string.h | 2 ++ 6 files changed, 25 insertions(+), 28 deletions(-) diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h index 11f5a608a91..ee0df11b32b 100644 --- a/cpp/src/arrow/dataset/file_base.h +++ b/cpp/src/arrow/dataset/file_base.h @@ -129,7 +129,6 @@ class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_this IsSupported(const FileSource& source) const = 0; diff --git a/cpp/src/arrow/dataset/file_ipc.cc b/cpp/src/arrow/dataset/file_ipc.cc index 9c7d643f9ac..e15de84b9bf 100644 --- a/cpp/src/arrow/dataset/file_ipc.cc +++ b/cpp/src/arrow/dataset/file_ipc.cc @@ -163,9 +163,9 @@ Result IpcFileFormat::ScanFile(std::shared_ptr op fragment->source()); } -/// -/// IpcFileWriter, IpcFileWriteOptions -/// +// +// IpcFileWriter, IpcFileWriteOptions +// std::shared_ptr IpcFileFormat::DefaultWriteOptions() { std::shared_ptr options( diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index ced75c652c4..fe2f64ad526 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -427,9 +427,9 @@ Result> ParquetFileFormat::MakeFragment( std::move(physical_schema), {})); } -/// -/// ParquetFileWriter, ParquetFileWriteOptions -/// +// +// ParquetFileWriter, ParquetFileWriteOptions +// std::shared_ptr ParquetFileFormat::DefaultWriteOptions() { std::shared_ptr options( @@ -469,9 +469,9 @@ Status ParquetFileWriter::Write(const std::shared_ptr& batch) { Status ParquetFileWriter::Finish() { return parquet_writer_->Close(); } -/// -/// RowGroupInfo -/// +// +// RowGroupInfo +// std::vector 
RowGroupInfo::FromIdentifiers(const std::vector ids) { std::vector results; @@ -526,9 +526,9 @@ bool RowGroupInfo::Satisfy(const Expression& predicate) const { return !HasStatistics() || predicate.IsSatisfiableWith(statistics_expression_); } -/// -/// ParquetFileFragment -/// +// +// ParquetFileFragment +// ParquetFileFragment::ParquetFileFragment(FileSource source, std::shared_ptr format, @@ -631,9 +631,9 @@ Result> ParquetFileFragment::FilterRowGroups( return row_groups; } -/// -/// ParquetDatasetFactory -/// +// +// ParquetDatasetFactory +// ParquetDatasetFactory::ParquetDatasetFactory( std::shared_ptr filesystem, std::shared_ptr format, diff --git a/cpp/src/arrow/util/mutex.cc b/cpp/src/arrow/util/mutex.cc index 63ff026f4ee..7456d7889d8 100644 --- a/cpp/src/arrow/util/mutex.cc +++ b/cpp/src/arrow/util/mutex.cc @@ -24,23 +24,19 @@ namespace arrow { namespace util { -struct Mutex::Impl : std::mutex {}; +struct Mutex::Impl { + std::mutex mutex_; +}; Mutex::Guard::Guard(Mutex* locked) : locked_(locked, [](Mutex* locked) { - DCHECK(!locked->impl_->try_lock()); - locked->impl_->unlock(); + DCHECK(!locked->impl_->mutex_.try_lock()); + locked->impl_->mutex_.unlock(); }) {} -void Mutex::Guard::Unlock() { - if (locked_) { - locked_.reset(); - } -} - Mutex::Guard Mutex::TryLock() { DCHECK_NE(impl_, nullptr); - if (impl_->try_lock()) { + if (impl_->mutex_.try_lock()) { return Guard{this}; } return Guard{}; @@ -48,7 +44,7 @@ Mutex::Guard Mutex::TryLock() { Mutex::Guard Mutex::Lock() { DCHECK_NE(impl_, nullptr); - impl_->lock(); + impl_->mutex_.lock(); return Guard{this}; } diff --git a/cpp/src/arrow/util/mutex.h b/cpp/src/arrow/util/mutex.h index 8c9f0d40002..f4fc64181fb 100644 --- a/cpp/src/arrow/util/mutex.h +++ b/cpp/src/arrow/util/mutex.h @@ -43,7 +43,7 @@ class ARROW_EXPORT Mutex { explicit operator bool() const { return bool(locked_); } - void Unlock(); + void Unlock() { locked_.reset(); } private: explicit Guard(Mutex* locked); diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.h index 4201bbe2ea8..56a02dfb20b 100644 --- a/cpp/src/arrow/util/string.h +++ b/cpp/src/arrow/util/string.h @@ -57,6 +57,8 @@ bool AsciiEqualsCaseInsensitive(util::string_view left, util::string_view right) ARROW_EXPORT std::string AsciiToLower(util::string_view value); +/// \brief Search for the first instance of a token and replace it or return nullopt if +/// the token is not found. 
ARROW_EXPORT util::optional Replace(util::string_view s, util::string_view token, util::string_view replacement); From 33257a6fcbefdd9de54a6a0faa8b863ed7566bf1 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 6 Oct 2020 10:12:16 -0400 Subject: [PATCH 28/34] add default basename_template for python --- python/pyarrow/_dataset.pyx | 16 +++++++--- python/pyarrow/dataset.py | 18 ++++++----- python/pyarrow/tests/test_dataset.py | 46 ++++++++++++---------------- 3 files changed, 42 insertions(+), 38 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 847b4b0d71e..bfee82ba362 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -699,6 +699,10 @@ cdef class FileFormat(_Weakrefable): def make_write_options(self): return FileWriteOptions.wrap(self.format.DefaultWriteOptions()) + @property + def default_extname(self): + return frombytes(self.format.type_name()) + def __eq__(self, other): try: return self.equals(other) @@ -1350,6 +1354,10 @@ cdef class IpcFileFormat(FileFormat): def equals(self, IpcFileFormat other): return True + @property + def default_extname(self): + return "feather" + def __reduce__(self): return IpcFileFormat, tuple() @@ -2297,10 +2305,10 @@ def _get_partition_keys(Expression partition_expression): def _filesystemdataset_write( - data, object base_dir, str basename_template, Schema schema not None, - FileFormat format not None, FileSystem filesystem not None, - Partitioning partitioning not None, FileWriteOptions file_options not None, - bint use_threads, + data not None, object base_dir not None, str basename_template not None, + Schema schema not None, FileFormat format not None, + FileSystem filesystem not None, Partitioning partitioning not None, + FileWriteOptions file_options not None, bint use_threads, ): """ CFileSystemDataset.Write wrapper diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index 574fad9d325..5cadd69dc6f 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -697,7 +697,7 @@ def _ensure_write_partitioning(scheme): return scheme -def write_dataset(data, base_dir, basename_template, format=None, +def write_dataset(data, base_dir, basename_template=None, format=None, partitioning=None, schema=None, filesystem=None, file_options=None, use_threads=True): """ @@ -710,16 +710,17 @@ def write_dataset(data, base_dir, basename_template, format=None, in-memory Arrow data. base_dir : str The root directory where to write the dataset. - basename_template : str + basename_template : str, optional A template string used to generate basenames of written data files. The token '{i}' will be replaced with an automatically incremented - integer. + integer. If not specified, it defaults to + "dat_{i}." + format.default_extname format : FileFormat or str The format in which to write the dataset. Currently supported: - "ipc"/"feather". If a FileSystemDataset is being written and `format` - is not specified, it defaults to the same format as the specified - FileSystemDataset. When writing a Table or RecordBatch, this keyword - is required. + "parquet", "ipc"/"feather". If a FileSystemDataset is being written + and `format` is not specified, it defaults to the same format as the + specified FileSystemDataset. When writing a Table or RecordBatch, this + keyword is required. partitioning : Partitioning, optional The partitioning scheme specified with the ``partitioning()`` function. 
@@ -745,6 +746,9 @@ def write_dataset(data, base_dir, basename_template, format=None, ) format = _ensure_format(format) + if basename_template is None: + basename_template = "dat_{i}." + format.default_extname + if file_options is None: file_options = format.make_write_options() diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index a7e46ae4ce1..3df92fb8944 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -2151,10 +2151,8 @@ def _check_dataset_roundtrip(dataset, base_dir, expected_files, base_dir_path=None, partitioning=None): base_dir_path = base_dir_path or base_dir - ds.write_dataset( - dataset, base_dir, basename_template='dat_{i}.feather', - format="feather", partitioning=partitioning, use_threads=False - ) + ds.write_dataset(dataset, base_dir, format="feather", + partitioning=partitioning, use_threads=False) # check that all files are present file_paths = list(base_dir_path.rglob("*")) @@ -2247,13 +2245,13 @@ def test_write_dataset_use_threads(tempdir): target1 = tempdir / 'partitioned1' ds.write_dataset( - dataset, target1, basename_template='dat_{i}.feather', - format="feather", partitioning=partitioning, use_threads=True + dataset, target1, format="feather", partitioning=partitioning, + use_threads=True ) target2 = tempdir / 'partitioned2' ds.write_dataset( - dataset, target2, basename_template='dat_{i}.feather', - format="feather", partitioning=partitioning, use_threads=False + dataset, target2, format="feather", partitioning=partitioning, + use_threads=False ) # check that reading in gives same result @@ -2270,10 +2268,10 @@ def test_write_table(tempdir): base_dir = tempdir / 'single' ds.write_dataset(table, base_dir, - basename_template='dat_{i}.feather', format="feather") + basename_template='dat_{i}.arrow', format="feather") # check that all files are present file_paths = list(base_dir.rglob("*")) - expected_paths = [base_dir / "dat_0.feather"] + expected_paths = [base_dir / "dat_0.arrow"] assert set(file_paths) == set(expected_paths) # check Table roundtrip result = ds.dataset(base_dir, format="ipc").to_table() @@ -2283,13 +2281,13 @@ def test_write_table(tempdir): base_dir = tempdir / 'partitioned' partitioning = ds.partitioning( pa.schema([("part", pa.string())]), flavor="hive") - ds.write_dataset(table, base_dir, - basename_template='dat_{i}.feather', - format="feather", partitioning=partitioning) + ds.write_dataset(table, base_dir, format="feather", + basename_template='dat_{i}.arrow', + partitioning=partitioning) file_paths = list(base_dir.rglob("*")) expected_paths = [ - base_dir / "part=a", base_dir / "part=a" / "dat_0.feather", - base_dir / "part=b", base_dir / "part=b" / "dat_1.feather" + base_dir / "part=a", base_dir / "part=a" / "dat_0.arrow", + base_dir / "part=b", base_dir / "part=b" / "dat_1.arrow" ] assert set(file_paths) == set(expected_paths) result = ds.dataset(base_dir, format="ipc", partitioning=partitioning) @@ -2305,30 +2303,26 @@ def test_write_table_multiple_fragments(tempdir): # Table with multiple batches written as single Fragment by default base_dir = tempdir / 'single' - ds.write_dataset(table, base_dir, basename_template='dat_{i}.feather', - format="feather") + ds.write_dataset(table, base_dir, format="feather") assert set(base_dir.rglob("*")) == set([base_dir / "dat_0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Same for single-element list of Table base_dir = tempdir / 'single-list' - ds.write_dataset([table], base_dir, 
basename_template='dat_{i}.feather', - format="feather") + ds.write_dataset([table], base_dir, format="feather") assert set(base_dir.rglob("*")) == set([base_dir / "dat_0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Provide list of batches to write multiple fragments base_dir = tempdir / 'multiple' - ds.write_dataset(table.to_batches(), base_dir, - basename_template='dat_{i}.feather', format="feather") + ds.write_dataset(table.to_batches(), base_dir, format="feather") assert set(base_dir.rglob("*")) == set( [base_dir / "dat_0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Provide list of tables to write multiple fragments base_dir = tempdir / 'multiple-table' - ds.write_dataset([table, table], base_dir, - basename_template='dat_{i}.feather', format="feather") + ds.write_dataset([table, table], base_dir, format="feather") assert set(base_dir.rglob("*")) == set( [base_dir / "dat_0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals( @@ -2348,8 +2342,7 @@ def test_write_dataset_parquet(tempdir): # using default "parquet" format string base_dir = tempdir / 'parquet_dataset' - ds.write_dataset(table, base_dir, - basename_template='dat_{i}.parquet', format="parquet") + ds.write_dataset(table, base_dir, format="parquet") # check that all files are present file_paths = list(base_dir.rglob("*")) expected_paths = [base_dir / "dat_0.parquet"] @@ -2363,7 +2356,6 @@ def test_write_dataset_parquet(tempdir): format = ds.ParquetFileFormat() opts = format.make_write_options(version=version) base_dir = tempdir / 'parquet_dataset_version{0}'.format(version) - ds.write_dataset(table, base_dir, basename_template='dat_{i}.parquet', - format=format, file_options=opts) + ds.write_dataset(table, base_dir, format=format, file_options=opts) meta = pq.read_metadata(base_dir / "dat_0.parquet") assert meta.format_version == version From d46c1af09401f6208607c278581b2166383c986d Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 5 Oct 2020 14:23:24 -0700 Subject: [PATCH 29/34] R code/doc polishing --- r/NAMESPACE | 3 +- r/R/dataset-format.R | 39 ++++++++++++++++--- r/R/dataset-scan.R | 3 ++ r/R/dataset-write.R | 69 ++++++--------------------------- r/man/FileWriteOptions.Rd | 3 +- r/man/write_dataset.Rd | 7 ++-- r/tests/testthat/test-dataset.R | 16 +++++++- 7 files changed, 70 insertions(+), 70 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index c54acf7a824..f33d39244d4 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -22,6 +22,7 @@ S3method(Ops,array_expression) S3method(all,equal.ArrowObject) S3method(as.character,Array) S3method(as.character,ChunkedArray) +S3method(as.character,FileFormat) S3method(as.character,Scalar) S3method(as.data.frame,RecordBatch) S3method(as.data.frame,Table) @@ -74,6 +75,7 @@ S3method(min,ChunkedArray) S3method(names,Dataset) S3method(names,FeatherReader) S3method(names,RecordBatch) +S3method(names,Scanner) S3method(names,ScannerBuilder) S3method(names,Schema) S3method(names,Table) @@ -159,7 +161,6 @@ export(MessageType) export(MetadataVersion) export(ParquetFileFormat) export(ParquetFileReader) -export(ParquetFileWriteOptions) export(ParquetFileWriter) export(ParquetReaderProperties) export(ParquetVersionType) diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R index c98e7b5d44e..ec3ac36f9e5 100644 --- a/r/R/dataset-format.R +++ b/r/R/dataset-format.R @@ -61,11 +61,6 @@ FileFormat <- R6Class("FileFormat", inherit = ArrowObject, } else { self } - }, - make_write_options = function(...) 
{ - options <- shared_ptr(FileWriteOptions, dataset___FileFormat__DefaultWriteOptions(self)) - options$update(...) - options } ), active = list( @@ -89,6 +84,13 @@ FileFormat$create <- function(format, ...) { } } +#' @export +as.character.FileFormat <- function(x, ...) { + out <- x$type + # Slight hack: special case IPC -> feather, otherwise is just the type_name + ifelse(out == "ipc", "feather", out) +} + #' @usage NULL #' @format NULL #' @rdname FileFormat @@ -125,3 +127,30 @@ csv_file_format_parse_options <- function(...) { CsvParseOptions$create(...) } } + +#' Format-specific write options +#' +#' @description +#' A `FileWriteOptions` holds write options specific to a `FileFormat`. +FileWriteOptions <- R6Class("FileWriteOptions", inherit = ArrowObject, + public = list( + update = function(...) { + if (self$type == "parquet") { + dataset___ParquetFileWriteOptions__update(self, + ParquetWriterProperties$create(...), + ParquetArrowWriterProperties$create(...)) + } + invisible(self) + } + ), + active = list( + type = function() dataset___FileWriteOptions__type_name(self) + ) +) +FileWriteOptions$create <- function(format, ...) { + if (!inherits(format, "FileFormat")) { + format <- FileFormat$create(format) + } + options <- shared_ptr(FileWriteOptions, dataset___FileFormat__DefaultWriteOptions(format)) + options$update(...) +} diff --git a/r/R/dataset-scan.R b/r/R/dataset-scan.R index ab542fee8f9..e9017825782 100644 --- a/r/R/dataset-scan.R +++ b/r/R/dataset-scan.R @@ -98,6 +98,9 @@ Scanner$create <- function(dataset, scanner_builder$Finish() } +#' @export +names.Scanner <- function(x) names(x$schema) + ScanTask <- R6Class("ScanTask", inherit = ArrowObject, public = list( Execute = function() map(dataset___ScanTask__get_batches(self), shared_ptr, class = RecordBatch) diff --git a/r/R/dataset-write.R b/r/R/dataset-write.R index d667db5f8d5..d56126f05b2 100644 --- a/r/R/dataset-write.R +++ b/r/R/dataset-write.R @@ -35,8 +35,9 @@ #' @param partitioning `Partitioning` or a character vector of columns to #' use as partition keys (to be written as path segments). Default is to #' use the current `group_by()` columns. -#' @param basename_template `"{i}"` will be replaced with an autoincremented -#' integer to generate basenames of datafiles. For example `"dat_{i}.feather"` +#' @param basename_template string template for the names of files to be written. +#' Must contain `"{i}"`, which will be replaced with an autoincremented +#' integer to generate basenames of datafiles. For example, `"dat_{i}.feather"` #' will yield `"dat_0.feather", ...`. #' @param hive_style logical: write partition segments as Hive-style #' (`key1=value1/key2=value2/file.ext`) or as just bare values. Default is `TRUE`. @@ -50,21 +51,18 @@ write_dataset <- function(dataset, path, format = dataset$format, partitioning = dplyr::group_vars(dataset), - basename_template = paste("dat_{i}", format$type, sep="."), + basename_template = paste0("dat_{i}.", as.character(format)), hive_style = TRUE, filesystem = NULL, ...) { if (inherits(dataset, "arrow_dplyr_query")) { - # Check for a select - if (!identical(dataset$selected_columns, set_names(names(dataset$.data)))) { - # We can select a subset of columns but we can't rename them - if (!setequal(dataset$selected_columns, names(dataset$selected_columns))) { - stop("Renaming columns when writing a dataset is not yet supported", call. 
= FALSE) - } - dataset <- ensure_group_vars(dataset) + # We can select a subset of columns but we can't rename them + if (!all(dataset$selected_columns == names(dataset$selected_columns))) { + stop("Renaming columns when writing a dataset is not yet supported", call. = FALSE) } - } - if (inherits(dataset, "grouped_df")) { + # partitioning vars need to be in the `select` schema + dataset <- ensure_group_vars(dataset) + } else if (inherits(dataset, "grouped_df")) { force(partitioning) # Drop the grouping metadata before writing; we've already consumed it # now to construct `partitioning` and don't want it in the metadata$r @@ -72,18 +70,7 @@ write_dataset <- function(dataset, } scanner <- Scanner$create(dataset) - - if (!inherits(format, "FileFormat")) { - format <- FileFormat$create(format) - } - if (inherits(format, "ParquetFileFormat")) { - options <- format$make_write_options(table=dataset, ...) - } else { - options <- format$make_write_options(...) - } - if (!inherits(partitioning, "Partitioning")) { - # TODO: tidyselect? partition_schema <- scanner$schema[partitioning] if (isTRUE(hive_style)) { partitioning <- HivePartitioning$create(partition_schema) @@ -91,42 +78,10 @@ write_dataset <- function(dataset, partitioning <- DirectoryPartitioning$create(partition_schema) } } + path_and_fs <- get_path_and_filesystem(path, filesystem) + options <- FileWriteOptions$create(format, table = scanner, ...) dataset___Dataset__Write(options, path_and_fs$fs, path_and_fs$path, partitioning, basename_template, scanner) } - -#' Format-specific write options -#' -#' @description -#' A `FileWriteOptions` holds write options specific to a `FileFormat`. -FileWriteOptions <- R6Class("FileWriteOptions", inherit = ArrowObject, - public = list( - ..dispatch = function() { - type <- self$type - if (type == "parquet") { - shared_ptr(ParquetFileWriteOptions, self$pointer()) - } else { - self - } - }, - update = function(...) { - type <- self$type - if (type == "parquet") { - dataset___ParquetFileWriteOptions__update(self, - ParquetWriterProperties$create(...), - ParquetArrowWriterProperties$create(...)) - } - } - ), - active = list( - type = function() dataset___FileWriteOptions__type_name(self) - ) -) - -#' @usage NULL -#' @format NULL -#' @rdname FileWriteOptions -#' @export -ParquetFileWriteOptions <- R6Class("ParquetFileWriteOptions", inherit = FileWriteOptions) diff --git a/r/man/FileWriteOptions.Rd b/r/man/FileWriteOptions.Rd index 5c4d04456b4..661393c8e0d 100644 --- a/r/man/FileWriteOptions.Rd +++ b/r/man/FileWriteOptions.Rd @@ -1,8 +1,7 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dataset-write.R +% Please edit documentation in R/dataset-format.R \name{FileWriteOptions} \alias{FileWriteOptions} -\alias{ParquetFileWriteOptions} \title{Format-specific write options} \description{ A \code{FileWriteOptions} holds write options specific to a \code{FileFormat}. diff --git a/r/man/write_dataset.Rd b/r/man/write_dataset.Rd index 218876427ca..ee51d244a44 100644 --- a/r/man/write_dataset.Rd +++ b/r/man/write_dataset.Rd @@ -9,7 +9,7 @@ write_dataset( path, format = dataset$format, partitioning = dplyr::group_vars(dataset), - basename_template = paste("dat_{i}", format$type, sep = "."), + basename_template = paste0("dat_{i}.", as.character(format)), hive_style = TRUE, filesystem = NULL, ... @@ -34,8 +34,9 @@ same format as \code{dataset}.} use as partition keys (to be written as path segments). 
Default is to use the current \code{group_by()} columns.} -\item{basename_template}{\code{"{i}"} will be replaced with an autoincremented -integer to generate basenames of datafiles. For example \code{"dat_{i}.feather"} +\item{basename_template}{string template for the names of files to be written. +Must contain \code{"{i}"}, which will be replaced with an autoincremented +integer to generate basenames of datafiles. For example, \code{"dat_{i}.feather"} will yield \verb{"dat_0.feather", ...}.} \item{hive_style}{logical: write partition segments as Hive-style diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 73ea71b3c49..f33f8ad1def 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -759,6 +759,14 @@ test_that("Writing a dataset: CSV->IPC", { filter(integer > 6) %>% summarize(mean = mean(integer)) ) + + # Check whether "int" is present in the files or just in the dirs + first <- read_feather( + dir(dst_dir, pattern = ".feather$", recursive = TRUE, full.names = TRUE)[1], + as_data_frame = FALSE + ) + # It shouldn't be there + expect_false("int" %in% names(first)) }) test_that("Writing a dataset: Parquet->IPC", { @@ -980,14 +988,18 @@ test_that("Dataset writing: unsupported features/input validation", { expect_error(write_dataset(4), 'dataset must be a "Dataset"') ds <- open_dataset(hive_dir) - expect_error( select(ds, integer = int) %>% write_dataset(ds), "Renaming columns when writing a dataset is not yet supported" ) - expect_error( write_dataset(ds, partitioning = c("int", "NOTACOLUMN"), format = "ipc"), 'Invalid field name: "NOTACOLUMN"' ) + expect_error( + write_dataset(ds, tempfile(), basename_template = "something_without_i") + ) + expect_error( + write_dataset(ds, tempfile(), basename_template = NULL) + ) }) From 7f1255e9383366202b1dcaadf1dfee519d35a49b Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 5 Oct 2020 14:27:26 -0700 Subject: [PATCH 30/34] Update vignette now that you can filter when writing --- r/vignettes/dataset.Rmd | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/r/vignettes/dataset.Rmd b/r/vignettes/dataset.Rmd index 488f7f4f43b..df460c2682a 100644 --- a/r/vignettes/dataset.Rmd +++ b/r/vignettes/dataset.Rmd @@ -350,6 +350,16 @@ partitions are because they can be read from the file paths. (To instead write bare values for partition segments, i.e. `1` rather than `payment_type=1`, call `write_dataset()` with `hive_style = FALSE`.) +Perhaps, though, `payment_type == 1` is the only data we care about for our +current work, and we just want to drop the rest and have a smaller working set. +For this, we can `filter()` them out when writing: + +```r +ds %>% + filter(payment_type == 1) %>% + write_dataset("nyc-taxi/feather", format = "feather") +``` + The other thing we can do when writing datasets is select a subset of and/or reorder columns. Suppose we never care about `vendor_id`, and being a string column, it can take up a lot of space when we read it in, so let's drop it: @@ -361,5 +371,5 @@ ds %>% write_dataset("nyc-taxi/feather", format = "feather") ``` -Note that you cannot currently rename columns when writing, nor can you `filter()` -and write a subset of the data. +Note that while you can select a subset of columns, +you cannot currently rename columns when writing. 
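Taken together, patches 29 and 30 let a single dplyr pipeline choose rows, partition keys, and file names when writing a dataset from R. Below is a minimal sketch of the intended usage; the paths and column names are assumptions borrowed from the taxi example in the vignette, not part of this diff:

```r
library(arrow)
library(dplyr)

# Open an existing (hypothetical) CSV dataset
ds <- open_dataset("nyc-taxi/csv", format = "csv")

ds %>%
  filter(total_amount > 0) %>%              # rows can now be filtered while writing
  write_dataset(
    "nyc-taxi/feather",
    format = "feather",
    partitioning = "payment_type",          # column values written as path segments
    basename_template = "dat_{i}.feather",  # must contain "{i}"
    hive_style = FALSE                      # bare values instead of key=value segments
  )
```

Because `hive_style = FALSE` selects `DirectoryPartitioning`, the partition values appear as bare path segments (e.g. `1/dat_0.feather`) rather than `payment_type=1/dat_0.feather`.
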
From 16d9d53bbda0ff29df4ea6c5c1d68808b2caede9 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 6 Oct 2020 10:49:18 -0400 Subject: [PATCH 31/34] lint fix --- python/pyarrow/tests/test_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 3df92fb8944..fbb06564803 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -2152,7 +2152,7 @@ def _check_dataset_roundtrip(dataset, base_dir, expected_files, base_dir_path = base_dir_path or base_dir ds.write_dataset(dataset, base_dir, format="feather", - partitioning=partitioning, use_threads=False) + partitioning=partitioning, use_threads=False) # check that all files are present file_paths = list(base_dir_path.rglob("*")) From 086f59d8a3799181609c48813b7299028efcde10 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 6 Oct 2020 11:25:55 -0400 Subject: [PATCH 32/34] writing without partitioning will create a single file --- r/tests/testthat/test-s3-minio.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R index 63902aac40a..5d9213f4f21 100644 --- a/r/tests/testthat/test-s3-minio.R +++ b/r/tests/testthat/test-s3-minio.R @@ -119,7 +119,7 @@ if (arrow_with_s3() && process_is_running("minio server")) { test_that("write_dataset with fs", { ds <- open_dataset(minio_path("hive_dir"), filesystem = fs) write_dataset(ds, minio_path("new_dataset_dir"), filesystem = fs) - expect_length(fs$GetFileInfo(FileSelector$create(minio_path("new_dataset_dir"))), 2) + expect_length(fs$GetFileInfo(FileSelector$create(minio_path("new_dataset_dir"))), 1) }) test_that("S3FileSystem input validation", { From 998d760ef0c1ac21145b426075f6ffba0d1ae675 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 7 Oct 2020 13:12:40 -0400 Subject: [PATCH 33/34] address review comments --- python/pyarrow/_dataset.pyx | 132 ++++--------------- python/pyarrow/dataset.py | 24 +++- python/pyarrow/includes/libarrow_dataset.pxd | 1 + python/pyarrow/tests/test_dataset.py | 28 ++-- r/R/dataset-write.R | 6 +- 5 files changed, 59 insertions(+), 132 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index bfee82ba362..0aec214831b 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -636,6 +636,10 @@ cdef class FileWriteOptions(_Weakrefable): self.init(sp) return self + @property + def format(self): + return FileFormat.wrap(self.options.format()) + cdef inline shared_ptr[CFileWriteOptions] unwrap(self): return self.wrapped @@ -1113,7 +1117,8 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): update = False update_arrow = False for name, value in kwargs.items(): - assert name in self._properties + if name not in self._properties: + raise TypeError("unexpected parquet write option: " + name) self._properties[name] = value if name in arrow_fields: update_arrow = True @@ -1122,119 +1127,30 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): if update: opts.writer_properties = _create_writer_properties( - use_dictionary=self.use_dictionary, - compression=self.compression, - version=self.version, - write_statistics=self.write_statistics, - data_page_size=self.data_page_size, - compression_level=self.compression_level, - use_byte_stream_split=self.use_byte_stream_split, - data_page_version=self.data_page_version, + use_dictionary=self._properties["use_dictionary"], + 
compression=self._properties["compression"], + version=self._properties["version"], + write_statistics=self._properties["write_statistics"], + data_page_size=self._properties["data_page_size"], + compression_level=self._properties["compression_level"], + use_byte_stream_split=( + self._properties["use_byte_stream_split"] + ), + data_page_version=self._properties["data_page_version"], ) if update_arrow: opts.arrow_writer_properties = _create_arrow_writer_properties( use_deprecated_int96_timestamps=( - self.use_deprecated_int96_timestamps + self._properties["use_deprecated_int96_timestamps"] + ), + coerce_timestamps=self._properties["coerce_timestamps"], + allow_truncated_timestamps=( + self._properties["allow_truncated_timestamps"] ), - coerce_timestamps=self.coerce_timestamps, - allow_truncated_timestamps=self.allow_truncated_timestamps, - writer_engine_version=self.writer_engine_version, + writer_engine_version='V2' ) - @property - def use_dictionary(self): - return self._properties['use_dictionary'] - - @use_dictionary.setter - def use_dictionary(self, use_dictionary): - self.update(use_dictionary=use_dictionary) - - @property - def compression(self): - return self._properties['compression'] - - @compression.setter - def compression(self, compression): - self.update(compression=compression) - - @property - def version(self): - return self._properties['version'] - - @version.setter - def version(self, version): - self.update(version=version) - - @property - def write_statistics(self): - return self._properties['write_statistics'] - - @write_statistics.setter - def write_statistics(self, write_statistics): - self.update(write_statistics=write_statistics) - - @property - def data_page_size(self): - return self._properties['data_page_size'] - - @data_page_size.setter - def data_page_size(self, data_page_size): - self.update(data_page_size=data_page_size) - - @property - def compression_level(self): - return self._properties['compression_level'] - - @compression_level.setter - def compression_level(self, compression_level): - self.update(compression_level=compression_level) - - @property - def use_byte_stream_split(self): - return self._properties['use_byte_stream_split'] - - @use_byte_stream_split.setter - def use_byte_stream_split(self, use_byte_stream_split): - self.update(use_byte_stream_split=use_byte_stream_split) - - @property - def data_page_version(self): - return self._properties['data_page_version'] - - @data_page_version.setter - def data_page_version(self, data_page_version): - self.update(data_page_version=data_page_version) - - @property - def use_deprecated_int96_timestamps(self): - return self._properties['use_deprecated_int96_timestamps'] - - @use_deprecated_int96_timestamps.setter - def use_deprecated_int96_timestamps(self, use_deprecated_int96_timestamps): - self.update( - use_deprecated_int96_timestamps=use_deprecated_int96_timestamps) - - @property - def coerce_timestamps(self): - return self._properties['coerce_timestamps'] - - @coerce_timestamps.setter - def coerce_timestamps(self, coerce_timestamps): - self.update(coerce_timestamps=coerce_timestamps) - - @property - def allow_truncated_timestamps(self): - return self._properties['allow_truncated_timestamps'] - - @allow_truncated_timestamps.setter - def allow_truncated_timestamps(self, allow_truncated_timestamps): - self.update(allow_truncated_timestamps=allow_truncated_timestamps) - - @property - def writer_engine_version(self): - return 'V2' - cdef void init(self, const shared_ptr[CFileWriteOptions]& sp): 
FileWriteOptions.init(self, sp) self.parquet_options = sp.get() @@ -2306,8 +2222,8 @@ def _get_partition_keys(Expression partition_expression): def _filesystemdataset_write( data not None, object base_dir not None, str basename_template not None, - Schema schema not None, FileFormat format not None, - FileSystem filesystem not None, Partitioning partitioning not None, + Schema schema not None, FileSystem filesystem not None, + Partitioning partitioning not None, FileWriteOptions file_options not None, bint use_threads, ): """ diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index 5cadd69dc6f..f278e05c007 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -714,7 +714,7 @@ def write_dataset(data, base_dir, basename_template=None, format=None, A template string used to generate basenames of written data files. The token '{i}' will be replaced with an automatically incremented integer. If not specified, it defaults to - "dat_{i}." + format.default_extname + "part-{i}." + format.default_extname format : FileFormat or str The format in which to write the dataset. Currently supported: "parquet", "ipc"/"feather". If a FileSystemDataset is being written @@ -726,14 +726,15 @@ def write_dataset(data, base_dir, basename_template=None, format=None, function. schema : Schema, optional filesystem : FileSystem, optional + file_options : FileWriteOptions, optional + FileFormat specific write options, created using the + ``FileFormat.make_write_options()`` function. use_threads : bool, default True Write files in parallel. If enabled, then maximum parallelism will be used determined by the number of available CPU cores. """ if isinstance(data, Dataset): schema = schema or data.schema - if isinstance(data, FileSystemDataset): - format = format or data.format elif isinstance(data, (pa.Table, pa.RecordBatch)): schema = schema or data.schema data = [data] @@ -745,13 +746,22 @@ def write_dataset(data, base_dir, basename_template=None, format=None, "objects are supported." ) - format = _ensure_format(format) - if basename_template is None: - basename_template = "dat_{i}." + format.default_extname + if format is None and isinstance(data, FileSystemDataset): + format = data.format + else: + format = _ensure_format(format) if file_options is None: file_options = format.make_write_options() + if format != file_options.format: + raise TypeError("Supplied FileWriteOptions have format {}, " + "which doesn't match supplied FileFormat {}".format( + format, file_options)) + + if basename_template is None: + basename_template = "part-{i}." 
+ format.default_extname + partitioning = _ensure_write_partitioning(partitioning) if filesystem is None: @@ -761,6 +771,6 @@ def write_dataset(data, base_dir, basename_template=None, format=None, filesystem, _ = _ensure_fs(filesystem) _filesystemdataset_write( - data, base_dir, basename_template, schema, format, + data, base_dir, basename_template, schema, filesystem, partitioning, file_options, use_threads, ) diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd index 3b5b70a37ba..8bf8b78b07b 100644 --- a/python/pyarrow/includes/libarrow_dataset.pxd +++ b/python/pyarrow/includes/libarrow_dataset.pxd @@ -213,6 +213,7 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: cdef cppclass CFileWriteOptions \ "arrow::dataset::FileWriteOptions": + const shared_ptr[CFileFormat]& format() const c_string type_name() const cdef cppclass CFileFormat "arrow::dataset::FileFormat": diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index fbb06564803..8c24c66ea99 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -2174,18 +2174,18 @@ def test_write_dataset(tempdir): # full string path target = tempdir / 'single-file-target' - expected_files = [target / "dat_0.feather"] + expected_files = [target / "part-0.feather"] _check_dataset_roundtrip(dataset, str(target), expected_files, target) # pathlib path object target = tempdir / 'single-file-target2' - expected_files = [target / "dat_0.feather"] + expected_files = [target / "part-0.feather"] _check_dataset_roundtrip(dataset, target, expected_files, target) # TODO # # relative path # target = tempdir / 'single-file-target3' - # expected_files = [target / "dat_0.ipc"] + # expected_files = [target / "part-0.ipc"] # _check_dataset_roundtrip( # dataset, './single-file-target3', expected_files, target) @@ -2196,7 +2196,7 @@ def test_write_dataset(tempdir): dataset = ds.dataset(directory) target = tempdir / 'single-directory-target' - expected_files = [target / "dat_0.feather"] + expected_files = [target / "part-0.feather"] _check_dataset_roundtrip(dataset, str(target), expected_files, target) @@ -2211,8 +2211,8 @@ def test_write_dataset_partitioned(tempdir): # hive partitioning target = tempdir / 'partitioned-hive-target' expected_paths = [ - target / "part=a", target / "part=a" / "dat_0.feather", - target / "part=b", target / "part=b" / "dat_1.feather" + target / "part=a", target / "part=a" / "part-0.feather", + target / "part=b", target / "part=b" / "part-1.feather" ] partitioning_schema = ds.partitioning( pa.schema([("part", pa.string())]), flavor="hive") @@ -2223,8 +2223,8 @@ def test_write_dataset_partitioned(tempdir): # directory partitioning target = tempdir / 'partitioned-dir-target' expected_paths = [ - target / "a", target / "a" / "dat_0.feather", - target / "b", target / "b" / "dat_1.feather" + target / "a", target / "a" / "part-0.feather", + target / "b", target / "b" / "part-1.feather" ] partitioning_schema = ds.partitioning( pa.schema([("part", pa.string())])) @@ -2304,27 +2304,27 @@ def test_write_table_multiple_fragments(tempdir): # Table with multiple batches written as single Fragment by default base_dir = tempdir / 'single' ds.write_dataset(table, base_dir, format="feather") - assert set(base_dir.rglob("*")) == set([base_dir / "dat_0.feather"]) + assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Same for 
single-element list of Table base_dir = tempdir / 'single-list' ds.write_dataset([table], base_dir, format="feather") - assert set(base_dir.rglob("*")) == set([base_dir / "dat_0.feather"]) + assert set(base_dir.rglob("*")) == set([base_dir / "part-0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Provide list of batches to write multiple fragments base_dir = tempdir / 'multiple' ds.write_dataset(table.to_batches(), base_dir, format="feather") assert set(base_dir.rglob("*")) == set( - [base_dir / "dat_0.feather"]) + [base_dir / "part-0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals(table) # Provide list of tables to write multiple fragments base_dir = tempdir / 'multiple-table' ds.write_dataset([table, table], base_dir, format="feather") assert set(base_dir.rglob("*")) == set( - [base_dir / "dat_0.feather"]) + [base_dir / "part-0.feather"]) assert ds.dataset(base_dir, format="ipc").to_table().equals( pa.concat_tables([table]*2) ) @@ -2345,7 +2345,7 @@ def test_write_dataset_parquet(tempdir): ds.write_dataset(table, base_dir, format="parquet") # check that all files are present file_paths = list(base_dir.rglob("*")) - expected_paths = [base_dir / "dat_0.parquet"] + expected_paths = [base_dir / "part-0.parquet"] assert set(file_paths) == set(expected_paths) # check Table roundtrip result = ds.dataset(base_dir, format="parquet").to_table() @@ -2357,5 +2357,5 @@ def test_write_dataset_parquet(tempdir): opts = format.make_write_options(version=version) base_dir = tempdir / 'parquet_dataset_version{0}'.format(version) ds.write_dataset(table, base_dir, format=format, file_options=opts) - meta = pq.read_metadata(base_dir / "dat_0.parquet") + meta = pq.read_metadata(base_dir / "part-0.parquet") assert meta.format_version == version diff --git a/r/R/dataset-write.R b/r/R/dataset-write.R index d56126f05b2..abeb0ce4393 100644 --- a/r/R/dataset-write.R +++ b/r/R/dataset-write.R @@ -37,8 +37,8 @@ #' use the current `group_by()` columns. #' @param basename_template string template for the names of files to be written. #' Must contain `"{i}"`, which will be replaced with an autoincremented -#' integer to generate basenames of datafiles. For example, `"dat_{i}.feather"` -#' will yield `"dat_0.feather", ...`. +#' integer to generate basenames of datafiles. For example, `"part-{i}.feather"` +#' will yield `"part-0.feather", ...`. #' @param hive_style logical: write partition segments as Hive-style #' (`key1=value1/key2=value2/file.ext`) or as just bare values. Default is `TRUE`. #' @param filesystem A [FileSystem] where the dataset should be written if it is a @@ -51,7 +51,7 @@ write_dataset <- function(dataset, path, format = dataset$format, partitioning = dplyr::group_vars(dataset), - basename_template = paste0("dat_{i}.", as.character(format)), + basename_template = paste0("part-{i}.", as.character(format)), hive_style = TRUE, filesystem = NULL, ...) 
{ From 5602aa843368a70ed130c12f0f80cab8fc7fe7e5 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 7 Oct 2020 14:44:19 -0400 Subject: [PATCH 34/34] correct R doc after dat_{i} -> part-{i} --- r/man/write_dataset.Rd | 6 +++--- r/vignettes/dataset.Rmd | 11 +++-------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/r/man/write_dataset.Rd b/r/man/write_dataset.Rd index ee51d244a44..e12c4287266 100644 --- a/r/man/write_dataset.Rd +++ b/r/man/write_dataset.Rd @@ -9,7 +9,7 @@ write_dataset( path, format = dataset$format, partitioning = dplyr::group_vars(dataset), - basename_template = paste0("dat_{i}.", as.character(format)), + basename_template = paste0("part-{i}.", as.character(format)), hive_style = TRUE, filesystem = NULL, ... @@ -36,8 +36,8 @@ use the current \code{group_by()} columns.} \item{basename_template}{string template for the names of files to be written. Must contain \code{"{i}"}, which will be replaced with an autoincremented -integer to generate basenames of datafiles. For example, \code{"dat_{i}.feather"} -will yield \verb{"dat_0.feather", ...}.} +integer to generate basenames of datafiles. For example, \code{"part-{i}.feather"} +will yield \verb{"part-0.feather", ...}.} \item{hive_style}{logical: write partition segments as Hive-style (\code{key1=value1/key2=value2/file.ext}) or as just bare values. Default is \code{TRUE}.} diff --git a/r/vignettes/dataset.Rmd b/r/vignettes/dataset.Rmd index df460c2682a..d9c90261f82 100644 --- a/r/vignettes/dataset.Rmd +++ b/r/vignettes/dataset.Rmd @@ -328,17 +328,12 @@ system("tree nyc-taxi/feather") # feather # ├── payment_type=1 -# │ ├── dat_0.ipc -# │ ├── dat_1.ipc -# │ ├── dat_2.ipc -# │ ├── dat_3.ipc -# │ ├── dat_4.ipc -# │ └── dat_5.ipc +# │ └── part-5.feather # ├── payment_type=2 -# │ ├── dat_0.ipc +# │ └── part-0.feather # ... # └── payment_type=5 -# └── dat_2.ipc +# └── part-2.feather # # 5 directories, 25 files ```
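
After this final rename, the Python and R defaults agree on `part-{i}` basenames, and extra arguments to the R `write_dataset()` are forwarded through `FileWriteOptions$create()` for the chosen format. A short sketch of the finished R interface follows, again with hypothetical paths, relying on the behaviour noted in patch 32 that an unpartitioned write produces a single file:

```r
library(arrow)

ds <- open_dataset("nyc-taxi/feather", format = "feather")

# No partitioning and no basename_template: a single file is written using
# the new default template, e.g. nyc-taxi/parquet/part-0.parquet
write_dataset(ds, "nyc-taxi/parquet", format = "parquet")

# A basename_template without "{i}" is rejected, per the validation exercised
# in test-dataset.R above:
# write_dataset(ds, "nyc-taxi/parquet", format = "parquet",
#               basename_template = "data.parquet")   # error
```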