From 1241630ed5dbdddc50b6dbc613c3a849e1ae94e8 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Tue, 28 Apr 2020 10:06:24 +0200
Subject: [PATCH 1/4] test for ARROW-5572

---
 python/pyarrow/tests/test_parquet.py | 40 ++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 8a81265598d..e32dc36f0c0 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -64,6 +64,8 @@ def datadir(datadir):
     "use_legacy_dataset", [True, pytest.param(False, marks=pytest.mark.skip)])
 parametrize_legacy_dataset_skip_buffer = pytest.mark.parametrize(
     "use_legacy_dataset", [True, pytest.param(False, marks=pytest.mark.skip)])
+parametrize_legacy_dataset_fixed = pytest.mark.parametrize(
+    "use_legacy_dataset", [pytest.param(True, marks=pytest.mark.xfail), False])
 
 
 def deterministic_row_order(use_legacy_dataset, chunk_size=-1):
@@ -1715,7 +1717,7 @@ def test_read_partitioned_columns_selection(tempdir, use_legacy_dataset):
 
 @pytest.mark.pandas
 @parametrize_legacy_dataset
-def test_equivalency(tempdir, use_legacy_dataset):
+def test_filters_equivalency(tempdir, use_legacy_dataset):
     fs = LocalFileSystem.get_instance()
     base_path = tempdir
 
@@ -1803,7 +1805,7 @@ def test_equivalency(tempdir, use_legacy_dataset):
 
 @pytest.mark.pandas
 @parametrize_legacy_dataset
-def test_cutoff_exclusive_integer(tempdir, use_legacy_dataset):
+def test_filters_cutoff_exclusive_integer(tempdir, use_legacy_dataset):
     fs = LocalFileSystem.get_instance()
     base_path = tempdir
 
@@ -1845,7 +1847,7 @@ def test_cutoff_exclusive_integer(tempdir, use_legacy_dataset):
     raises=(TypeError, AssertionError),
     reason='Loss of type information in creation of categoricals.'
 )
-def test_cutoff_exclusive_datetime(tempdir, use_legacy_dataset):
+def test_filters_cutoff_exclusive_datetime(tempdir, use_legacy_dataset):
     fs = LocalFileSystem.get_instance()
     base_path = tempdir
 
@@ -1890,7 +1892,7 @@ def test_cutoff_exclusive_datetime(tempdir, use_legacy_dataset):
 
 @pytest.mark.pandas
 @parametrize_legacy_dataset
-def test_inclusive_integer(tempdir, use_legacy_dataset):
+def test_filters_inclusive_integer(tempdir, use_legacy_dataset):
     fs = LocalFileSystem.get_instance()
     base_path = tempdir
 
@@ -1926,7 +1928,7 @@ def test_inclusive_integer(tempdir, use_legacy_dataset):
 
 @pytest.mark.pandas
 @parametrize_legacy_dataset
-def test_inclusive_set(tempdir, use_legacy_dataset):
+def test_filters_inclusive_set(tempdir, use_legacy_dataset):
     fs = LocalFileSystem.get_instance()
     base_path = tempdir
 
@@ -1964,7 +1966,7 @@ def test_inclusive_set(tempdir, use_legacy_dataset):
 
 @pytest.mark.pandas
 @parametrize_legacy_dataset
-def test_invalid_pred_op(tempdir, use_legacy_dataset):
+def test_filters_invalid_pred_op(tempdir, use_legacy_dataset):
     fs = LocalFileSystem.get_instance()
     base_path = tempdir
 
@@ -2009,6 +2011,32 @@ def test_invalid_pred_op(tempdir, use_legacy_dataset):
                           use_legacy_dataset=use_legacy_dataset)
 
 
+@pytest.mark.pandas
+@parametrize_legacy_dataset_fixed
+def test_filters_invalid_column(tempdir, use_legacy_dataset):
+    # ARROW-5572 - raise error on invalid name in filter specification
+    # works with new dataset / xfail with legacy implementation
+    fs = LocalFileSystem.get_instance()
+    base_path = tempdir
+
+    integer_keys = [0, 1, 2, 3, 4]
+    partition_spec = [['integers', integer_keys]]
+    N = 5
+
+    df = pd.DataFrame({
+        'index': np.arange(N),
+        'integers': np.array(integer_keys, dtype='i4'),
+    }, columns=['index', 'integers'])
+
+    _generate_partition_directories(fs, base_path, partition_spec, df)
+
+    msg = "Field named 'non_existent_column' not found"
+    with pytest.raises(ValueError, match=msg):
+        pq.ParquetDataset(base_path, filesystem=fs,
+                          filters=[('non_existent_column', '<', 3), ],
+                          use_legacy_dataset=use_legacy_dataset).read()
+
+
 @pytest.mark.pandas
 def test_filters_read_table(tempdir):
     # test that filters keyword is passed through in read_table

From 9fe4656f5222407c880cdfffcd195e47f22b6179 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Tue, 28 Apr 2020 10:33:46 +0200
Subject: [PATCH 2/4] test for ARROW-5310

---
 python/pyarrow/tests/test_parquet.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index e32dc36f0c0..7788bd7540c 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -2614,6 +2614,20 @@ def test_ignore_no_private_directories_path_list(
     _assert_dataset_paths(dataset, paths, use_legacy_dataset)
 
 
+@parametrize_legacy_dataset_fixed
+def test_empty_directory(tempdir, use_legacy_dataset):
+    # ARROW-5310 - reading empty directory
+    # fails with legacy implementation
+    empty_dir = tempdir / 'dataset'
+    empty_dir.mkdir()
+
+    dataset = pq.ParquetDataset(
+        empty_dir, use_legacy_dataset=use_legacy_dataset)
+    result = dataset.read()
+    assert result.num_rows == 0
+    assert result.num_columns == 0
+
+
 @pytest.mark.pandas
 @parametrize_legacy_dataset
 def test_multiindex_duplicate_values(tempdir, use_legacy_dataset):

From e5ce8656df2fee611c960e9e04da3a1fb7230588 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Tue, 28 Apr 2020 10:49:50 +0200
Subject: [PATCH 3/4] test for ARROW-5666

---
 python/pyarrow/tests/test_parquet.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 7788bd7540c..97842b8dd39 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -2069,6 +2069,33 @@ def test_filters_read_table(tempdir):
     assert table.num_rows == 3
 
 
+@pytest.mark.pandas
+@parametrize_legacy_dataset_fixed
+def test_partition_keys_with_underscores(tempdir, use_legacy_dataset):
+    # ARROW-5666 - partition field values with underscores preserve underscores
+    # xfail with legacy dataset -> they get interpreted as integers
+    fs = LocalFileSystem.get_instance()
+    base_path = tempdir
+
+    string_keys = ["2019_2", "2019_3"]
+    partition_spec = [
+        ['year_week', string_keys],
+    ]
+    N = 2
+
+    df = pd.DataFrame({
+        'index': np.arange(N),
+        'year_week': np.array(string_keys, dtype='object'),
+    }, columns=['index', 'year_week'])
+
+    _generate_partition_directories(fs, base_path, partition_spec, df)
+
+    dataset = pq.ParquetDataset(
+        base_path, use_legacy_dataset=use_legacy_dataset)
+    result = dataset.read()
+    assert result.column("year_week").to_pylist() == string_keys
+
+
 @pytest.fixture
 def s3_bucket(request, s3_connection, s3_server):
     boto3 = pytest.importorskip('boto3')

From 6e15657ef9fe04b41f7bcd3079872eda77f01758 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Tue, 28 Apr 2020 15:35:49 +0200
Subject: [PATCH 4/4] fix mark

---
 python/pyarrow/tests/test_parquet.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 97842b8dd39..cc2f83d6dca 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -65,7 +65,8 @@ def datadir(datadir):
 parametrize_legacy_dataset_skip_buffer = pytest.mark.parametrize(
     "use_legacy_dataset", [True, pytest.param(False, marks=pytest.mark.skip)])
 parametrize_legacy_dataset_fixed = pytest.mark.parametrize(
-    "use_legacy_dataset", [pytest.param(True, marks=pytest.mark.xfail), False])
+    "use_legacy_dataset", [pytest.param(True, marks=pytest.mark.xfail),
+                           pytest.param(False, marks=pytest.mark.dataset)])
 
 
 def deterministic_row_order(use_legacy_dataset, chunk_size=-1):