From df5f805b2231a86f7c716a366f9f03ead40b9a8f Mon Sep 17 00:00:00 2001 From: Alvin Chunga Date: Fri, 19 Aug 2022 02:23:52 -0500 Subject: [PATCH 1/2] Add defaults to FileSystemDatasetWriteOptions file_write_options, filesystem, partitioning, basename_template --- cpp/src/arrow/dataset/file_base.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h index 7b0f5ffcf2e..9c706cc7b0d 100644 --- a/cpp/src/arrow/dataset/file_base.h +++ b/cpp/src/arrow/dataset/file_base.h @@ -32,6 +32,7 @@ #include "arrow/dataset/type_fwd.h" #include "arrow/dataset/visibility.h" #include "arrow/filesystem/filesystem.h" +#include "arrow/filesystem/localfs.h" #include "arrow/io/file.h" #include "arrow/util/compression.h" @@ -350,23 +351,25 @@ class ARROW_DS_EXPORT FileWriter { /// \brief Options for writing a dataset. struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions { /// Options for individual fragment writing. - std::shared_ptr<FileWriteOptions> file_write_options; + std::shared_ptr<FileWriteOptions> file_write_options = + CsvFileFormat().DefaultWriteOptions(); /// FileSystem into which a dataset will be written. - std::shared_ptr<fs::FileSystem> filesystem; + std::shared_ptr<fs::FileSystem> filesystem = + std::make_shared<arrow::fs::LocalFileSystem>(); /// Root directory into which the dataset will be written. std::string base_dir; /// Partitioning used to generate fragment paths. - std::shared_ptr<Partitioning> partitioning; + std::shared_ptr<Partitioning> partitioning = Partitioning::Default(); /// Maximum number of partitions any batch may be written into, default is 1K. int max_partitions = 1024; /// Template string used to generate fragment basenames. /// {i} will be replaced by an auto incremented integer. - std::string basename_template; + std::string basename_template = "data_{i}.arrow"; /// If greater than 0 then this will limit the maximum number of files that can be left /// open.
If an attempt is made to open too many files then the least recently used file @@ -386,7 +389,7 @@ struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions { /// and only write the row groups to the disk when sufficient rows have accumulated. /// The final row group size may be less than this value and other options such as /// `max_open_files` or `max_rows_per_file` lead to smaller row group sizes. - uint64_t min_rows_per_group = 0; + uint64_t min_rows_per_group = 10; /// If greater than 0 then the dataset writer may split up large incoming batches into /// multiple row groups. If this value is set then min_rows_per_group should also be From 4ecf17e14fc090f1bfaae542568e422e1bff9014 Mon Sep 17 00:00:00 2001 From: Alvin Chunga Date: Fri, 26 Aug 2022 02:41:36 -0500 Subject: [PATCH 2/2] Change default format to ParquetFileFormat, basename_template and min_rows_per_group to higher value --- cpp/src/arrow/dataset/file_base.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h index 9c706cc7b0d..e21dd3e6538 100644 --- a/cpp/src/arrow/dataset/file_base.h +++ b/cpp/src/arrow/dataset/file_base.h @@ -352,7 +352,7 @@ class ARROW_DS_EXPORT FileWriter { struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions { /// Options for individual fragment writing. std::shared_ptr<FileWriteOptions> file_write_options = - CsvFileFormat().DefaultWriteOptions(); + ParquetFileFormat::DefaultWriteOptions(); /// FileSystem into which a dataset will be written. std::shared_ptr<fs::FileSystem> filesystem = @@ -369,7 +369,7 @@ struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions { /// Template string used to generate fragment basenames. /// {i} will be replaced by an auto incremented integer. - std::string basename_template = "data_{i}.arrow"; + std::string basename_template = "part-{i}." + format()->type_name(); /// If greater than 0 then this will limit the maximum number of files that can be left /// open.
If an attempt is made to open too many files then the least recently used file @@ -389,7 +389,7 @@ struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions { /// and only write the row groups to the disk when sufficient rows have accumulated. /// The final row group size may be less than this value and other options such as /// `max_open_files` or `max_rows_per_file` lead to smaller row group sizes. - uint64_t min_rows_per_group = 10; + uint64_t min_rows_per_group = 1 << 18; /// If greater than 0 then the dataset writer may split up large incoming batches into /// multiple row groups. If this value is set then min_rows_per_group should also be