diff --git a/c_glib/arrow-dataset-glib/partitioning.cpp b/c_glib/arrow-dataset-glib/partitioning.cpp index 0a9481f2bec..b22289b5d01 100644 --- a/c_glib/arrow-dataset-glib/partitioning.cpp +++ b/c_glib/arrow-dataset-glib/partitioning.cpp @@ -38,9 +38,6 @@ G_BEGIN_DECLS * #GADatasetPartitioning is a base class for partitioning classes * such as #GADatasetDirectoryPartitioning. * - * #GADatasetDefaultPartitioning is a class for partitioning that - * doesn't partition. - * * #GADatasetKeyValuePartitioningOptions is a class for key-value * partitioning options. * @@ -345,35 +342,19 @@ gadataset_partitioning_get_type_name(GADatasetPartitioning *partitioning) } -G_DEFINE_TYPE(GADatasetDefaultPartitioning, - gadataset_default_partitioning, - GADATASET_TYPE_PARTITIONING) - -static void -gadataset_default_partitioning_init(GADatasetDefaultPartitioning *object) -{ -} - -static void -gadataset_default_partitioning_class_init( - GADatasetDefaultPartitioningClass *klass) -{ -} - /** - * gadataset_default_partitioning_new: + * gadataset_partitioning_create_default: * - * Returns: The newly created #GADatasetDefaultPartitioning that - * doesn't partition. + * Returns: (transfer full): The newly created #GADatasetPartitioning + * that doesn't partition. * - * Since: 11.0.0 + * Since: 12.0.0 */ -GADatasetDefaultPartitioning * -gadataset_default_partitioning_new(void) +GADatasetPartitioning * +gadataset_partitioning_create_default(void) { auto arrow_partitioning = arrow::dataset::Partitioning::Default(); - return GADATASET_DEFAULT_PARTITIONING( - gadataset_partitioning_new_raw(&arrow_partitioning)); + return gadataset_partitioning_new_raw(&arrow_partitioning); } @@ -813,9 +794,7 @@ gadataset_partitioning_new_raw( { GType type = GADATASET_TYPE_PARTITIONING; const auto arrow_type_name = (*arrow_partitioning)->type_name(); - if (arrow_type_name == "default") { - type = GADATASET_TYPE_DEFAULT_PARTITIONING; - } else if (arrow_type_name == "directory") { + if (arrow_type_name == "directory") { type = GADATASET_TYPE_DIRECTORY_PARTITIONING; } else if (arrow_type_name == "hive") { type = GADATASET_TYPE_HIVE_PARTITIONING; diff --git a/c_glib/arrow-dataset-glib/partitioning.h b/c_glib/arrow-dataset-glib/partitioning.h index 5872735d202..ca347b895b9 100644 --- a/c_glib/arrow-dataset-glib/partitioning.h +++ b/c_glib/arrow-dataset-glib/partitioning.h @@ -71,21 +71,9 @@ gchar * gadataset_partitioning_get_type_name(GADatasetPartitioning *partitioning); -#define GADATASET_TYPE_DEFAULT_PARTITIONING \ - (gadataset_default_partitioning_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADatasetDefaultPartitioning, - gadataset_default_partitioning, - GADATASET, - DEFAULT_PARTITIONING, - GADatasetPartitioning) -struct _GADatasetDefaultPartitioningClass -{ - GADatasetPartitioningClass parent_class; -}; - -GARROW_AVAILABLE_IN_11_0 -GADatasetDefaultPartitioning * -gadataset_default_partitioning_new(void); +GARROW_AVAILABLE_IN_12_0 +GADatasetPartitioning * +gadataset_partitioning_create_default(void); #define GADATASET_TYPE_KEY_VALUE_PARTITIONING_OPTIONS \ diff --git a/c_glib/test/dataset/test-partitioning.rb b/c_glib/test/dataset/test-partitioning.rb index a74a9bb7273..1e6fcae482f 100644 --- a/c_glib/test/dataset/test-partitioning.rb +++ b/c_glib/test/dataset/test-partitioning.rb @@ -23,7 +23,8 @@ def setup end def test_default - assert_equal("default", ArrowDataset::DefaultPartitioning.new.type_name) + assert_equal("directory", + ArrowDataset::Partitioning.create_default.type_name) end def test_directory diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index e3d390703f2..bf7145ca936 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -75,32 +75,7 @@ std::string StripNonPrefix(const std::string& path) { } // namespace std::shared_ptr Partitioning::Default() { - class DefaultPartitioning : public Partitioning { - public: - DefaultPartitioning() : Partitioning(::arrow::schema({})) {} - - std::string type_name() const override { return "default"; } - - bool Equals(const Partitioning& other) const override { - return type_name() == other.type_name(); - } - - Result Parse(const std::string& path) const override { - return compute::literal(true); - } - - Result Format(const compute::Expression& expr) const override { - return Status::NotImplemented("formatting paths from ", type_name(), - " Partitioning"); - } - - Result Partition( - const std::shared_ptr& batch) const override { - return PartitionedBatches{{batch}, {compute::literal(true)}}; - } - }; - - return std::make_shared(); + return std::make_shared(arrow::schema({})); } static Result ApplyGroupings( diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index 7a2c0aa1745..b122047c07b 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -90,7 +90,8 @@ class ARROW_DS_EXPORT Partitioning : public util::EqualityComparable Format(const compute::Expression& expr) const = 0; - /// \brief A default Partitioning which always yields scalar(true) + /// \brief A default Partitioning which is a DirectoryPartitioning + /// with an empty schema. static std::shared_ptr Default(); /// \brief The partition schema. diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 38dbb2d750c..1e932948676 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -189,6 +189,12 @@ TEST_F(TestPartitioning, Partition) { expected_expressions); } +TEST_F(TestPartitioning, DefaultPartitioningIsDirectoryPartitioning) { + auto partitioning = Partitioning::Default(); + ASSERT_EQ(partitioning->type_name(), "directory"); + AssertSchemaEqual(partitioning->schema(), schema({})); +} + TEST_F(TestPartitioning, DirectoryPartitioning) { partitioning_ = std::make_shared( schema({field("alpha", int32()), field("beta", utf8())})); @@ -209,6 +215,18 @@ TEST_F(TestPartitioning, DirectoryPartitioning) { equal(field_ref("beta"), literal("foo")))); } +TEST_F(TestPartitioning, DirectoryPartitioningEmpty) { + partitioning_ = std::make_shared(schema({})); + written_schema_ = partitioning_->schema(); + + // No partitioning info + AssertParse("", literal(true)); + // Files can be in subdirectories + AssertParse("/foo/", literal(true)); + // Partitioning info is discarded on write + AssertFormat(equal(field_ref("alpha"), literal(7)), ""); +} + TEST_F(TestPartitioning, DirectoryPartitioningEquals) { auto part = std::make_shared( schema({field("alpha", int32()), field("beta", utf8())})); diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index ebacd023591..67df26dbd11 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -3640,12 +3640,12 @@ def test_dataset_preserved_partitioning(tempdir): # through discovery, but without partitioning _, path = _create_single_file(tempdir) dataset = ds.dataset(path) - assert dataset.partitioning is None + assert isinstance(dataset.partitioning, ds.DirectoryPartitioning) # through discovery, with hive partitioning but not specified full_table, path = _create_partitioned_dataset(tempdir) dataset = ds.dataset(path) - assert dataset.partitioning is None + assert isinstance(dataset.partitioning, ds.DirectoryPartitioning) # through discovery, with hive partitioning (from a partitioning factory) dataset = ds.dataset(path, partitioning="hive")