Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 8 additions & 29 deletions c_glib/arrow-dataset-glib/partitioning.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,6 @@ G_BEGIN_DECLS
* #GADatasetPartitioning is a base class for partitioning classes
* such as #GADatasetDirectoryPartitioning.
*
* #GADatasetDefaultPartitioning is a class for partitioning that
* doesn't partition.
*
* #GADatasetKeyValuePartitioningOptions is a class for key-value
* partitioning options.
*
Expand Down Expand Up @@ -345,35 +342,19 @@ gadataset_partitioning_get_type_name(GADatasetPartitioning *partitioning)
}


/* Defines the GADatasetDefaultPartitioning GObject type, using
 * GADATASET_TYPE_PARTITIONING as its parent type. The macro expects the
 * gadataset_default_partitioning_init() and
 * gadataset_default_partitioning_class_init() functions defined below. */
G_DEFINE_TYPE(GADatasetDefaultPartitioning,
gadataset_default_partitioning,
GADATASET_TYPE_PARTITIONING)

/* Instance initializer for GADatasetDefaultPartitioning.
 * The body is empty: nothing is set up per instance here. */
static void
gadataset_default_partitioning_init(GADatasetDefaultPartitioning *object)
{
}

/* Class initializer for GADatasetDefaultPartitioning.
 * The body is empty: nothing is installed or overridden on the class here. */
static void
gadataset_default_partitioning_class_init(
GADatasetDefaultPartitioningClass *klass)
{
}

/**
* gadataset_default_partitioning_new:
* gadataset_partitioning_create_default:
*
* Returns: The newly created #GADatasetDefaultPartitioning that
* doesn't partition.
* Returns: (transfer full): The newly created #GADatasetPartitioning
* that doesn't partition.
*
* Since: 11.0.0
* Since: 12.0.0
*/
GADatasetDefaultPartitioning *
gadataset_default_partitioning_new(void)
GADatasetPartitioning *
gadataset_partitioning_create_default(void)
{
auto arrow_partitioning = arrow::dataset::Partitioning::Default();
return GADATASET_DEFAULT_PARTITIONING(
gadataset_partitioning_new_raw(&arrow_partitioning));
return gadataset_partitioning_new_raw(&arrow_partitioning);
}


Expand Down Expand Up @@ -813,9 +794,7 @@ gadataset_partitioning_new_raw(
{
GType type = GADATASET_TYPE_PARTITIONING;
const auto arrow_type_name = (*arrow_partitioning)->type_name();
if (arrow_type_name == "default") {
type = GADATASET_TYPE_DEFAULT_PARTITIONING;
} else if (arrow_type_name == "directory") {
if (arrow_type_name == "directory") {
type = GADATASET_TYPE_DIRECTORY_PARTITIONING;
} else if (arrow_type_name == "hive") {
type = GADATASET_TYPE_HIVE_PARTITIONING;
Expand Down
18 changes: 3 additions & 15 deletions c_glib/arrow-dataset-glib/partitioning.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,21 +71,9 @@ gchar *
gadataset_partitioning_get_type_name(GADatasetPartitioning *partitioning);


/* GType accessor for GADatasetDefaultPartitioning; expands to a call to
 * gadataset_default_partitioning_get_type(). */
#define GADATASET_TYPE_DEFAULT_PARTITIONING \
(gadataset_default_partitioning_get_type())
/* Declares GADatasetDefaultPartitioning as a derivable GObject type whose
 * parent is GADatasetPartitioning. */
G_DECLARE_DERIVABLE_TYPE(GADatasetDefaultPartitioning,
gadataset_default_partitioning,
GADATASET,
DEFAULT_PARTITIONING,
GADatasetPartitioning)
/* Class structure for GADatasetDefaultPartitioning. It contains only the
 * parent class structure; no additional members are declared. */
struct _GADatasetDefaultPartitioningClass
{
GADatasetPartitioningClass parent_class;
};

GARROW_AVAILABLE_IN_11_0
GADatasetDefaultPartitioning *
gadataset_default_partitioning_new(void);
GARROW_AVAILABLE_IN_12_0
GADatasetPartitioning *
gadataset_partitioning_create_default(void);


#define GADATASET_TYPE_KEY_VALUE_PARTITIONING_OPTIONS \
Expand Down
3 changes: 2 additions & 1 deletion c_glib/test/dataset/test-partitioning.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ def setup
end

def test_default
assert_equal("default", ArrowDataset::DefaultPartitioning.new.type_name)
assert_equal("directory",
ArrowDataset::Partitioning.create_default.type_name)
end

def test_directory
Expand Down
27 changes: 1 addition & 26 deletions cpp/src/arrow/dataset/partition.cc
Original file line number Diff line number Diff line change
Expand Up @@ -75,32 +75,7 @@ std::string StripNonPrefix(const std::string& path) {
} // namespace

// Returns a shared pointer to the default Partitioning.
//
// NOTE(review): this span appears to show both sides of a diff at once: the
// function-local DefaultPartitioning class with its make_shared return, and a
// one-line return of a DirectoryPartitioning with an empty schema. As written
// here the second return statement is unreachable -- confirm which side is
// the intended implementation before relying on this text.
std::shared_ptr<Partitioning> Partitioning::Default() {
// Function-local Partitioning subclass constructed with an empty schema.
class DefaultPartitioning : public Partitioning {
public:
DefaultPartitioning() : Partitioning(::arrow::schema({})) {}

// Identifies this partitioning by the type name "default".
std::string type_name() const override { return "default"; }

// Two partitionings compare equal when their type names match; nothing
// else is compared.
bool Equals(const Partitioning& other) const override {
return type_name() == other.type_name();
}

// The path is ignored: every path parses to literal(true).
Result<compute::Expression> Parse(const std::string& path) const override {
return compute::literal(true);
}

// Formatting is not supported; always returns a NotImplemented status.
Result<PartitionPathFormat> Format(const compute::Expression& expr) const override {
return Status::NotImplemented("formatting paths from ", type_name(),
" Partitioning");
}

// Returns the input batch as the only partition, paired with literal(true).
Result<PartitionedBatches> Partition(
const std::shared_ptr<RecordBatch>& batch) const override {
return PartitionedBatches{{batch}, {compute::literal(true)}};
}
};

return std::make_shared<DefaultPartitioning>();
return std::make_shared<DirectoryPartitioning>(arrow::schema({}));
}

static Result<RecordBatchVector> ApplyGroupings(
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/arrow/dataset/partition.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ class ARROW_DS_EXPORT Partitioning : public util::EqualityComparable<Partitionin

virtual Result<PartitionPathFormat> Format(const compute::Expression& expr) const = 0;

/// \brief A default Partitioning which always yields scalar(true)
/// \brief A default Partitioning which is a DirectoryPartitioning
/// with an empty schema.
static std::shared_ptr<Partitioning> Default();

/// \brief The partition schema.
Expand Down
18 changes: 18 additions & 0 deletions cpp/src/arrow/dataset/partition_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,12 @@ TEST_F(TestPartitioning, Partition) {
expected_expressions);
}

TEST_F(TestPartitioning, DefaultPartitioningIsDirectoryPartitioning) {
  // Partitioning::Default() must report the "directory" type name and carry
  // a schema with no fields.
  const std::shared_ptr<Partitioning> default_partitioning = Partitioning::Default();

  ASSERT_EQ(default_partitioning->type_name(), "directory");

  const auto empty_schema = schema({});
  AssertSchemaEqual(default_partitioning->schema(), empty_schema);
}

TEST_F(TestPartitioning, DirectoryPartitioning) {
partitioning_ = std::make_shared<DirectoryPartitioning>(
schema({field("alpha", int32()), field("beta", utf8())}));
Expand All @@ -209,6 +215,18 @@ TEST_F(TestPartitioning, DirectoryPartitioning) {
equal(field_ref("beta"), literal("foo"))));
}

TEST_F(TestPartitioning, DirectoryPartitioningEmpty) {
  // Directory partitioning built from a schema that has no fields.
  partitioning_ = std::make_shared<DirectoryPartitioning>(schema({}));
  written_schema_ = partitioning_->schema();

  const auto always_true = literal(true);

  // An empty path carries no partition information.
  AssertParse("", always_true);

  // Files may live in subdirectories; the directory name is not treated as a
  // partition key.
  AssertParse("/foo/", always_true);

  // On write, partition information in the expression is dropped and the
  // formatted path is empty.
  const auto alpha_is_seven = equal(field_ref("alpha"), literal(7));
  AssertFormat(alpha_is_seven, "");
}

TEST_F(TestPartitioning, DirectoryPartitioningEquals) {
auto part = std::make_shared<DirectoryPartitioning>(
schema({field("alpha", int32()), field("beta", utf8())}));
Expand Down
4 changes: 2 additions & 2 deletions python/pyarrow/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3640,12 +3640,12 @@ def test_dataset_preserved_partitioning(tempdir):
# through discovery, but without partitioning
_, path = _create_single_file(tempdir)
dataset = ds.dataset(path)
assert dataset.partitioning is None
assert isinstance(dataset.partitioning, ds.DirectoryPartitioning)

# through discovery, with hive partitioning but not specified
full_table, path = _create_partitioned_dataset(tempdir)
dataset = ds.dataset(path)
assert dataset.partitioning is None
assert isinstance(dataset.partitioning, ds.DirectoryPartitioning)

# through discovery, with hive partitioning (from a partitioning factory)
dataset = ds.dataset(path, partitioning="hive")
Expand Down