From 4ff2d44fa1e6f7fe0c929d860d18dcf433e2981b Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 15 Jan 2023 22:33:55 +0900 Subject: [PATCH 1/5] GH-15256: [C++][Dataset] Add support for writing with Partitioning::Default() It writes all data into one directory. --- cpp/src/arrow/dataset/partition.cc | 3 +- cpp/src/arrow/dataset/partition_test.cc | 38 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index e3d390703f2..1b5230dfbcd 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -90,8 +90,7 @@ std::shared_ptr Partitioning::Default() { } Result Format(const compute::Expression& expr) const override { - return Status::NotImplemented("formatting paths from ", type_name(), - " Partitioning"); + return PartitionPathFormat{"", ""}; } Result Partition( diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 38dbb2d750c..0a5929cd9ee 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -189,6 +189,44 @@ TEST_F(TestPartitioning, Partition) { expected_expressions); } +TEST_F(TestPartitioning, DefaultPartitioningPartition) { + auto dataset_schema = + schema({field("a", int32()), field("b", utf8()), field("c", uint32())}); + + auto partitioning = Partitioning::Default(); + std::string json = R"([{"a": 3, "b": "x", "c": 0}, + {"a": 3, "b": "x", "c": 1}, + {"a": 1, "b": null, "c": 2}, + {"a": null, "b": null, "c": 3}, + {"a": null, "b": "z", "c": 4}, + {"a": null, "b": null, "c": 5} + ])"; + std::vector expected_batches = {json}; + std::vector expected_expressions = {literal(true)}; + AssertPartition(partitioning, dataset_schema, json, dataset_schema, expected_batches, + expected_expressions); +} + +TEST_F(TestPartitioning, DefaultPartitioningParse) { + partitioning_ = Partitioning::Default(); + + AssertParse("/hello", literal(true)); + AssertParse("", literal(true)); // no segments to parse +} + +TEST_F(TestPartitioning, DefaultPartitioningFormat) { + partitioning_ = Partitioning::Default(); + + written_schema_ = partitioning_->schema(); + + AssertFormat(and_(equal(field_ref("alpha"), literal(0)), + equal(field_ref("beta"), literal("hello"))), + ""); + AssertFormat(and_(equal(field_ref("alpha"), literal(0)), is_null(field_ref("beta"))), + ""); + AssertFormat(literal(true), ""); +} + TEST_F(TestPartitioning, DirectoryPartitioning) { partitioning_ = std::make_shared( schema({field("alpha", int32()), field("beta", utf8())})); From 316272c4ac1758fa40981c3a6342cece2c9f0686 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 1 Apr 2023 07:43:59 +0900 Subject: [PATCH 2/5] Use DictionaryPartitioning with an empty schema --- cpp/src/arrow/dataset/partition.cc | 26 +---------------- cpp/src/arrow/dataset/partition.h | 3 +- cpp/src/arrow/dataset/partition_test.cc | 38 ++----------------------- 3 files changed, 6 insertions(+), 61 deletions(-) diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 1b5230dfbcd..bf7145ca936 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -75,31 +75,7 @@ std::string StripNonPrefix(const std::string& path) { } // namespace std::shared_ptr Partitioning::Default() { - class DefaultPartitioning : public Partitioning { - public: - DefaultPartitioning() : Partitioning(::arrow::schema({})) {} - - std::string type_name() const override { return "default"; } - - bool Equals(const Partitioning& other) const override { - return type_name() == other.type_name(); - } - - Result Parse(const std::string& path) const override { - return compute::literal(true); - } - - Result Format(const compute::Expression& expr) const override { - return PartitionPathFormat{"", ""}; - } - - Result Partition( - const std::shared_ptr& batch) const override { - return PartitionedBatches{{batch}, {compute::literal(true)}}; - } - }; - - return std::make_shared(); + return std::make_shared(arrow::schema({})); } static Result ApplyGroupings( diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index 7a2c0aa1745..b122047c07b 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -90,7 +90,8 @@ class ARROW_DS_EXPORT Partitioning : public util::EqualityComparable Format(const compute::Expression& expr) const = 0; - /// \brief A default Partitioning which always yields scalar(true) + /// \brief A default Partitioning which is a DirectoryPartitioning + /// with an empty schema. static std::shared_ptr Default(); /// \brief The partition schema. diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 0a5929cd9ee..9b6e2e2b311 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -189,42 +189,10 @@ TEST_F(TestPartitioning, Partition) { expected_expressions); } -TEST_F(TestPartitioning, DefaultPartitioningPartition) { - auto dataset_schema = - schema({field("a", int32()), field("b", utf8()), field("c", uint32())}); - +TEST_F(TestPartitioning, DefaultPartitioningIsDirectoryPartitioning) { auto partitioning = Partitioning::Default(); - std::string json = R"([{"a": 3, "b": "x", "c": 0}, - {"a": 3, "b": "x", "c": 1}, - {"a": 1, "b": null, "c": 2}, - {"a": null, "b": null, "c": 3}, - {"a": null, "b": "z", "c": 4}, - {"a": null, "b": null, "c": 5} - ])"; - std::vector expected_batches = {json}; - std::vector expected_expressions = {literal(true)}; - AssertPartition(partitioning, dataset_schema, json, dataset_schema, expected_batches, - expected_expressions); -} - -TEST_F(TestPartitioning, DefaultPartitioningParse) { - partitioning_ = Partitioning::Default(); - - AssertParse("/hello", literal(true)); - AssertParse("", literal(true)); // no segments to parse -} - -TEST_F(TestPartitioning, DefaultPartitioningFormat) { - partitioning_ = Partitioning::Default(); - - written_schema_ = partitioning_->schema(); - - AssertFormat(and_(equal(field_ref("alpha"), literal(0)), - equal(field_ref("beta"), literal("hello"))), - ""); - AssertFormat(and_(equal(field_ref("alpha"), literal(0)), is_null(field_ref("beta"))), - ""); - AssertFormat(literal(true), ""); + ASSERT_EQ(partitioning->type_name(), "directory"); + AssertSchemaEqual(partitioning->schema(), schema({})); } TEST_F(TestPartitioning, DirectoryPartitioning) { From 510a695b7b6a5c80984b6175617516da5adb7eb3 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 1 Apr 2023 13:47:47 +0900 Subject: [PATCH 3/5] Remove GADatasetDefaultPartitioning --- c_glib/arrow-dataset-glib/partitioning.cpp | 37 +++++----------------- c_glib/arrow-dataset-glib/partitioning.h | 18 ++--------- c_glib/test/dataset/test-partitioning.rb | 3 +- 3 files changed, 13 insertions(+), 45 deletions(-) diff --git a/c_glib/arrow-dataset-glib/partitioning.cpp b/c_glib/arrow-dataset-glib/partitioning.cpp index 0a9481f2bec..b22289b5d01 100644 --- a/c_glib/arrow-dataset-glib/partitioning.cpp +++ b/c_glib/arrow-dataset-glib/partitioning.cpp @@ -38,9 +38,6 @@ G_BEGIN_DECLS * #GADatasetPartitioning is a base class for partitioning classes * such as #GADatasetDirectoryPartitioning. * - * #GADatasetDefaultPartitioning is a class for partitioning that - * doesn't partition. - * * #GADatasetKeyValuePartitioningOptions is a class for key-value * partitioning options. * @@ -345,35 +342,19 @@ gadataset_partitioning_get_type_name(GADatasetPartitioning *partitioning) } -G_DEFINE_TYPE(GADatasetDefaultPartitioning, - gadataset_default_partitioning, - GADATASET_TYPE_PARTITIONING) - -static void -gadataset_default_partitioning_init(GADatasetDefaultPartitioning *object) -{ -} - -static void -gadataset_default_partitioning_class_init( - GADatasetDefaultPartitioningClass *klass) -{ -} - /** - * gadataset_default_partitioning_new: + * gadataset_partitioning_create_default: * - * Returns: The newly created #GADatasetDefaultPartitioning that - * doesn't partition. + * Returns: (transfer full): The newly created #GADatasetPartitioning + * that doesn't partition. * - * Since: 11.0.0 + * Since: 12.0.0 */ -GADatasetDefaultPartitioning * -gadataset_default_partitioning_new(void) +GADatasetPartitioning * +gadataset_partitioning_create_default(void) { auto arrow_partitioning = arrow::dataset::Partitioning::Default(); - return GADATASET_DEFAULT_PARTITIONING( - gadataset_partitioning_new_raw(&arrow_partitioning)); + return gadataset_partitioning_new_raw(&arrow_partitioning); } @@ -813,9 +794,7 @@ gadataset_partitioning_new_raw( { GType type = GADATASET_TYPE_PARTITIONING; const auto arrow_type_name = (*arrow_partitioning)->type_name(); - if (arrow_type_name == "default") { - type = GADATASET_TYPE_DEFAULT_PARTITIONING; - } else if (arrow_type_name == "directory") { + if (arrow_type_name == "directory") { type = GADATASET_TYPE_DIRECTORY_PARTITIONING; } else if (arrow_type_name == "hive") { type = GADATASET_TYPE_HIVE_PARTITIONING; diff --git a/c_glib/arrow-dataset-glib/partitioning.h b/c_glib/arrow-dataset-glib/partitioning.h index 5872735d202..ca347b895b9 100644 --- a/c_glib/arrow-dataset-glib/partitioning.h +++ b/c_glib/arrow-dataset-glib/partitioning.h @@ -71,21 +71,9 @@ gchar * gadataset_partitioning_get_type_name(GADatasetPartitioning *partitioning); -#define GADATASET_TYPE_DEFAULT_PARTITIONING \ - (gadataset_default_partitioning_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADatasetDefaultPartitioning, - gadataset_default_partitioning, - GADATASET, - DEFAULT_PARTITIONING, - GADatasetPartitioning) -struct _GADatasetDefaultPartitioningClass -{ - GADatasetPartitioningClass parent_class; -}; - -GARROW_AVAILABLE_IN_11_0 -GADatasetDefaultPartitioning * -gadataset_default_partitioning_new(void); +GARROW_AVAILABLE_IN_12_0 +GADatasetPartitioning * +gadataset_partitioning_create_default(void); #define GADATASET_TYPE_KEY_VALUE_PARTITIONING_OPTIONS \ diff --git a/c_glib/test/dataset/test-partitioning.rb b/c_glib/test/dataset/test-partitioning.rb index a74a9bb7273..1e6fcae482f 100644 --- a/c_glib/test/dataset/test-partitioning.rb +++ b/c_glib/test/dataset/test-partitioning.rb @@ -23,7 +23,8 @@ def setup end def test_default - assert_equal("default", ArrowDataset::DefaultPartitioning.new.type_name) + assert_equal("directory", + ArrowDataset::Partitioning.create_default.type_name) end def test_directory From 1ced09bf0af312ca7ad12fc3f5e307a8da76bca0 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 1 Apr 2023 21:52:22 +0900 Subject: [PATCH 4/5] Follow default partitioning change --- python/pyarrow/tests/test_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index ebacd023591..67df26dbd11 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -3640,12 +3640,12 @@ def test_dataset_preserved_partitioning(tempdir): # through discovery, but without partitioning _, path = _create_single_file(tempdir) dataset = ds.dataset(path) - assert dataset.partitioning is None + assert isinstance(dataset.partitioning, ds.DirectoryPartitioning) # through discovery, with hive partitioning but not specified full_table, path = _create_partitioned_dataset(tempdir) dataset = ds.dataset(path) - assert dataset.partitioning is None + assert isinstance(dataset.partitioning, ds.DirectoryPartitioning) # through discovery, with hive partitioning (from a partitioning factory) dataset = ds.dataset(path, partitioning="hive") From 87408a4247acb534c5cc275eb2f13fe1e6bb763d Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 4 Apr 2023 14:12:21 +0900 Subject: [PATCH 5/5] Add empty directory partitioning test Co-Authored-By: Weston Pace --- cpp/src/arrow/dataset/partition_test.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 9b6e2e2b311..1e932948676 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -215,6 +215,18 @@ TEST_F(TestPartitioning, DirectoryPartitioning) { equal(field_ref("beta"), literal("foo")))); } +TEST_F(TestPartitioning, DirectoryPartitioningEmpty) { + partitioning_ = std::make_shared(schema({})); + written_schema_ = partitioning_->schema(); + + // No partitioning info + AssertParse("", literal(true)); + // Files can be in subdirectories + AssertParse("/foo/", literal(true)); + // Partitioning info is discarded on write + AssertFormat(equal(field_ref("alpha"), literal(7)), ""); +} + TEST_F(TestPartitioning, DirectoryPartitioningEquals) { auto part = std::make_shared( schema({field("alpha", int32()), field("beta", utf8())}));