diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index fcd7454eee3..45441a975ad 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -722,6 +722,8 @@ def read(self, columns=None, use_threads=True, partitions=None,
         # value as indicated. The distinct categories of the partition have
         # been computed in the ParquetManifest
         for i, (name, index) in enumerate(self.partition_keys):
+            if columns is not None and name not in columns:
+                continue
             # The partition code is the same for all values in this piece
             indices = np.full(len(table), index, dtype='i4')
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index c7d04c43b89..a387115f0a6 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -1697,6 +1697,21 @@ def test_create_parquet_dataset_multi_threaded(tempdir):
     assert len(partitions.levels) == len(manifest.partitions.levels)
 
 
+@pytest.mark.pandas
+@parametrize_legacy_dataset
+def test_read_partitioned_columns_selection(tempdir, use_legacy_dataset):
+    # ARROW-3861 - do not include partition columns in resulting table when
+    # `columns` keyword was passed without those columns
+    fs = LocalFileSystem.get_instance()
+    base_path = tempdir
+    _partition_test_for_filesystem(fs, base_path)
+
+    dataset = pq.ParquetDataset(
+        base_path, use_legacy_dataset=use_legacy_dataset)
+    result = dataset.read(columns=["values"])
+    assert result.column_names == ["values"]
+
+
 @pytest.mark.pandas
 @parametrize_legacy_dataset
 def test_equivalency(tempdir, use_legacy_dataset):