From c83b3700d562b14c48488c2ba972cc2feb3617cd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 9 Oct 2020 16:18:31 +0200 Subject: [PATCH 1/3] ARROW-7957: [Python] Handle new FileSystem in ParquetDataset by automatically using new implementation --- python/pyarrow/parquet.py | 10 +++++++++- python/pyarrow/tests/test_parquet.py | 13 +++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index a10b9d03c6a1..f41f4183ba1d 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -1126,7 +1126,15 @@ def __new__(cls, path_or_paths=None, filesystem=None, schema=None, metadata=None, split_row_groups=False, validate_schema=True, filters=None, metadata_nthreads=1, read_dictionary=None, memory_map=False, buffer_size=0, partitioning="hive", - use_legacy_dataset=True): + use_legacy_dataset=None): + if use_legacy_dataset is None: + # if a new filesystem is passed -> default to new implementation + if isinstance(filesystem, FileSystem): + use_legacy_dataset = False + # otherwise the default is still True + else: + use_legacy_dataset = True + if not use_legacy_dataset: return _ParquetDatasetV2(path_or_paths, filesystem=filesystem, filters=filters, diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 57e60f002b2e..145f151caa0a 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -4296,3 +4296,16 @@ def test_dataset_partitioning(tempdir): with pytest.raises(ValueError): pq.ParquetDataset( str(root_path), partitioning=part, use_legacy_dataset=True) + + +def test_parquet_dataset_new_filesystem(tempdir): + # Ensure we can pass new FileSystem object to ParquetDataset + # (use new implementation automatically without specifying + # use_legacy_dataset=False) + table = pa.table({'a': [1, 2, 3]}) + pq.write_table(table, tempdir / 'data.parquet') + # don't use simple LocalFileSystem (as that gets mapped to legacy one) + filesystem = fs.SubTreeFileSystem(str(tempdir), fs.LocalFileSystem()) + dataset = pq.ParquetDataset('.', filesystem=filesystem) + result = dataset.read() + assert result.equals(table) From 4abcf3af6550d78ddcc5115f40e6fef23e9513d7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 10 Oct 2020 14:12:29 +0200 Subject: [PATCH 2/3] add correct mark --- python/pyarrow/tests/test_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 145f151caa0a..f57cf78111c3 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -4298,6 +4298,7 @@ def test_dataset_partitioning(tempdir): str(root_path), partitioning=part, use_legacy_dataset=True) +@pytest.mark.dataset def test_parquet_dataset_new_filesystem(tempdir): # Ensure we can pass new FileSystem object to ParquetDataset # (use new implementation automatically without specifying From 42b91adf39eb63b27e113d36090f65e4f3be80ca Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 10 Oct 2020 15:13:55 +0200 Subject: [PATCH 3/3] another mark --- python/pyarrow/tests/test_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index f57cf78111c3..12adda5cc901 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -497,6 +497,7 @@ def test_multiple_path_types(tempdir, use_legacy_dataset): tm.assert_frame_equal(df, df_read) +@pytest.mark.dataset @parametrize_legacy_dataset @pytest.mark.parametrize("filesystem", [ None, fs.LocalFileSystem(), LocalFileSystem._get_instance()