Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/source/python/parquet.rst
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,12 @@ option was enabled on write).
the partition keys.
- General performance improvement and bug fixes.

It also has the following changes in behaviour:

- When reading a subset of the columns, the partition keys must be
  explicitly listed in the ``columns`` keyword for them to be included
  in the result.

In the future, this will be turned on by default. The new implementation
does not yet cover all existing ParquetDataset features (e.g. specifying
the ``metadata``, or the ``pieces`` property API). Feedback is very welcome.
Expand Down
6 changes: 3 additions & 3 deletions python/pyarrow/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,8 +722,6 @@ def read(self, columns=None, use_threads=True, partitions=None,
# value as indicated. The distinct categories of the partition have
# been computed in the ParquetManifest
for i, (name, index) in enumerate(self.partition_keys):
if columns is not None and name not in columns:
continue
# The partition code is the same for all values in this piece
indices = np.full(len(table), index, dtype='i4')

Expand Down Expand Up @@ -1418,7 +1416,9 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False):
Parameters
----------
columns : List[str]
Names of columns to read from the dataset.
Names of columns to read from the dataset. The partition fields
are not automatically included (in contrast to when setting
``use_legacy_dataset=True``).
use_threads : bool, default True
Perform multi-threaded column reads.
use_pandas_metadata : bool, default False
Expand Down
8 changes: 7 additions & 1 deletion python/pyarrow/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1710,7 +1710,13 @@ def test_read_partitioned_columns_selection(tempdir, use_legacy_dataset):
dataset = pq.ParquetDataset(
base_path, use_legacy_dataset=use_legacy_dataset)
result = dataset.read(columns=["values"])
assert result.column_names == ["values"]
if use_legacy_dataset:
# ParquetDataset implementation always includes the partition columns
# automatically, and we can't easily "fix" this since dask relies on
# this behaviour (ARROW-8644)
assert result.column_names == ["values", "foo", "bar"]
else:
assert result.column_names == ["values"]


@pytest.mark.pandas
Expand Down