Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/source/python/parquet.rst
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,12 @@ option was enabled on write).
the partition keys.
- General performance improvement and bug fixes.

It also has the following changes in behaviour:

- When reading a subset of the columns, the partition keys must be
  explicitly listed in the ``columns`` keyword for them to be included
  in the result.

In the future, this will be turned on by default. The new implementation
does not yet cover all existing ParquetDataset features (e.g. specifying
the ``metadata``, or the ``pieces`` property API). Feedback is very welcome.
Expand Down
6 changes: 3 additions & 3 deletions python/pyarrow/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,8 +722,6 @@ def read(self, columns=None, use_threads=True, partitions=None,
# value as indicated. The distinct categories of the partition have
# been computed in the ParquetManifest
for i, (name, index) in enumerate(self.partition_keys):
if columns is not None and name not in columns:
continue
# The partition code is the same for all values in this piece
indices = np.full(len(table), index, dtype='i4')

Expand Down Expand Up @@ -1418,7 +1416,9 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False):
Parameters
----------
columns : List[str]
Names of columns to read from the dataset.
Names of columns to read from the dataset. The partition fields
are not automatically included (in contrast to when setting
``use_legacy_dataset=True``).
use_threads : bool, default True
Perform multi-threaded column reads.
use_pandas_metadata : bool, default False
Expand Down
8 changes: 7 additions & 1 deletion python/pyarrow/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1710,7 +1710,13 @@ def test_read_partitioned_columns_selection(tempdir, use_legacy_dataset):
dataset = pq.ParquetDataset(
base_path, use_legacy_dataset=use_legacy_dataset)
result = dataset.read(columns=["values"])
assert result.column_names == ["values"]
if use_legacy_dataset:
# ParquetDataset implementation always includes the partition columns
# automatically, and we can't easily "fix" this since dask relies on
# this behaviour (ARROW-8644)
assert result.column_names == ["values", "foo", "bar"]
else:
assert result.column_names == ["values"]


@pytest.mark.pandas
Expand Down