diff --git a/cpp/src/arrow/dataset/file_base.h b/cpp/src/arrow/dataset/file_base.h
index 7b0f5ffcf2e..e21dd3e6538 100644
--- a/cpp/src/arrow/dataset/file_base.h
+++ b/cpp/src/arrow/dataset/file_base.h
@@ -32,6 +32,7 @@
 #include "arrow/dataset/type_fwd.h"
 #include "arrow/dataset/visibility.h"
 #include "arrow/filesystem/filesystem.h"
+#include "arrow/filesystem/localfs.h"
 #include "arrow/io/file.h"
 #include "arrow/util/compression.h"
 
@@ -350,23 +351,25 @@ class ARROW_DS_EXPORT FileWriter {
 /// \brief Options for writing a dataset.
 struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions {
   /// Options for individual fragment writing.
-  std::shared_ptr<FileWriteOptions> file_write_options;
+  std::shared_ptr<FileWriteOptions> file_write_options =
+      ParquetFileFormat::DefaultWriteOptions();
 
   /// FileSystem into which a dataset will be written.
-  std::shared_ptr<fs::FileSystem> filesystem;
+  std::shared_ptr<fs::FileSystem> filesystem =
+      std::make_shared<fs::LocalFileSystem>();
 
   /// Root directory into which the dataset will be written.
   std::string base_dir;
 
   /// Partitioning used to generate fragment paths.
-  std::shared_ptr<Partitioning> partitioning;
+  std::shared_ptr<Partitioning> partitioning = Partitioning::Default();
 
   /// Maximum number of partitions any batch may be written into, default is 1K.
   int max_partitions = 1024;
 
   /// Template string used to generate fragment basenames.
   /// {i} will be replaced by an auto incremented integer.
-  std::string basename_template;
+  std::string basename_template = "part-{i}." + format()->type_name();
 
   /// If greater than 0 then this will limit the maximum number of files that can be left
   /// open. If an attempt is made to open too many files then the least recently used file
@@ -386,7 +389,7 @@ struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions {
   /// and only write the row groups to the disk when sufficient rows have accumulated.
   /// The final row group size may be less than this value and other options such as
   /// `max_open_files` or `max_rows_per_file` lead to smaller row group sizes.
-  uint64_t min_rows_per_group = 0;
+  uint64_t min_rows_per_group = 1 << 18;
 
   /// If greater than 0 then the dataset writer may split up large incoming batches into
   /// multiple row groups. If this value is set then min_rows_per_group should also be
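
A minimal usage sketch, not part of the patch, of how a caller might rely on the defaults introduced above. The helper name WriteToLocalDisk and the destination path are made up for illustration, and the Scanner is assumed to be built elsewhere; only base_dir still needs to be set explicitly once the defaults apply.

// Illustrative sketch only; assumes the defaults added in this diff.
#include <memory>
#include <utility>

#include "arrow/dataset/file_base.h"
#include "arrow/dataset/scanner.h"

arrow::Status WriteToLocalDisk(std::shared_ptr<arrow::dataset::Scanner> scanner) {
  arrow::dataset::FileSystemDatasetWriteOptions write_options;
  // file_write_options, filesystem, partitioning and basename_template now carry
  // usable defaults, so only the destination directory is set here.
  write_options.base_dir = "/tmp/example_dataset";  // hypothetical path
  return arrow::dataset::FileSystemDataset::Write(write_options, std::move(scanner));
}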