diff --git a/python/pyarrow/parquet/__init__.py b/python/pyarrow/parquet/__init__.py
index 63b3e9f1a56..3b10dd3b23d 100644
--- a/python/pyarrow/parquet/__init__.py
+++ b/python/pyarrow/parquet/__init__.py
@@ -3109,6 +3109,12 @@ def file_visitor(written_file):
         # extract non-file format options
         schema = kwargs.pop("schema", None)
         use_threads = kwargs.pop("use_threads", True)
+        chunk_size = kwargs.pop("chunk_size", None)
+        row_group_size = kwargs.pop("row_group_size", None)
+
+        row_group_size = (
+            row_group_size if row_group_size is not None else chunk_size
+        )
 
         # raise for unsupported keywords
         msg = (
@@ -3147,7 +3153,8 @@ def file_visitor(written_file):
             partitioning=partitioning,
             use_threads=use_threads, file_visitor=file_visitor,
             basename_template=basename_template,
-            existing_data_behavior=existing_data_behavior)
+            existing_data_behavior=existing_data_behavior,
+            max_rows_per_group=row_group_size)
         return
 
     # warnings and errors when using legacy implementation
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 19448d36870..c27430ab151 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -971,7 +971,7 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None):
     path = str(tempdir / "test_parquet_dataset")
 
     # write_to_dataset currently requires pandas
-    pq.write_to_dataset(table, path, use_legacy_dataset=True,
+    pq.write_to_dataset(table, path,
                         partition_cols=["part"], chunk_size=chunk_size)
     dataset = ds.dataset(
         path, format="parquet", partitioning="hive", filesystem=filesystem
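
For context, a minimal usage sketch (not part of the patch) of what the change enables: a `row_group_size` keyword, or its `chunk_size` alias, passed to `pq.write_to_dataset` is now forwarded to the new dataset writer as `max_rows_per_group` instead of being rejected as an unsupported keyword. The table contents and the output location below are illustrative assumptions, not taken from the PR.

import glob
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

# Illustrative table: two partition values, ten rows each.
table = pa.table({
    "part": ["a"] * 10 + ["b"] * 10,
    "value": list(range(20)),
})

out_dir = tempfile.mkdtemp()  # hypothetical output location

# With the patch, row_group_size=5 reaches ds.write_dataset as
# max_rows_per_group=5, capping each row group at 5 rows.
pq.write_to_dataset(
    table, out_dir,
    partition_cols=["part"],
    row_group_size=5,
)

# Inspect one written file: every row group should hold at most 5 rows.
a_file = glob.glob(os.path.join(out_dir, "part=a", "*.parquet"))[0]
md = pq.ParquetFile(a_file).metadata
print([md.row_group(i).num_rows for i in range(md.num_row_groups)])

Accepting `chunk_size` alongside `row_group_size` appears intended to keep backward compatibility with the legacy write path (exercised in the updated test), where `chunk_size` was the keyword callers already used.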