Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions python/pyarrow/parquet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3125,7 +3125,6 @@ def file_visitor(written_file):
"implementation."
)
metadata_collector = kwargs.pop('metadata_collector', None)
file_visitor = None
if metadata_collector is not None:
def file_visitor(written_file):
metadata_collector.append(written_file.metadata)
Expand All @@ -3140,15 +3139,15 @@ def file_visitor(written_file):
if filesystem is not None:
filesystem = _ensure_filesystem(filesystem)

partitioning = None
if partition_cols:
part_schema = table.select(partition_cols).schema
partitioning = ds.partitioning(part_schema, flavor="hive")

if basename_template is None:
basename_template = guid() + '-{i}.parquet'
if existing_data_behavior is None:
existing_data_behavior = 'overwrite_or_ignore'

if existing_data_behavior is None:
existing_data_behavior = 'overwrite_or_ignore'

ds.write_dataset(
table, root_path, filesystem=filesystem,
Expand Down
27 changes: 27 additions & 0 deletions python/pyarrow/tests/parquet/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import datetime
import os
import pathlib

import numpy as np
import pytest
Expand Down Expand Up @@ -1782,3 +1783,29 @@ def test_parquet_write_to_dataset_unsupported_keywards_in_legacy(tempdir):
with pytest.raises(ValueError, match="existing_data_behavior"):
pq.write_to_dataset(table, path, use_legacy_dataset=True,
existing_data_behavior='error')


@pytest.mark.dataset
def test_parquet_write_to_dataset_exposed_keywords(tempdir):
    """Keywords forwarded to ds.write_dataset (file_visitor,
    basename_template, partitioning) are honored by write_to_dataset."""
    table = pa.table({'a': [1, 2, 3]})
    path = tempdir / 'partitioning'

    visited_paths = []

    def file_visitor(written_file):
        # Record each file the dataset writer produces.
        visited_paths.append(written_file.path)

    pq.write_to_dataset(table, path, partitioning=["a"],
                        file_visitor=file_visitor,
                        basename_template='part-{i}.parquet',
                        use_legacy_dataset=False)

    # One hive-style directory per distinct value of column 'a',
    # each holding a single file named via the basename template.
    expected_paths = {path / str(value) / 'part-0.parquet'
                      for value in (1, 2, 3)}
    assert {pathlib.Path(p) for p in visited_paths} == expected_paths