From 620b3b8588bcd46e66f3e98115594c5cc64176d8 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 16 Aug 2019 13:52:29 -0500 Subject: [PATCH 1/4] Add unit test for ARROW-5480 --- python/pyarrow/tests/test_parquet.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 12288bbda68..756cd7f5329 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -3015,7 +3015,6 @@ def test_dictionary_array_automatically_read(): assert result.schema.metadata is None -@pytest.mark.pandas def test_pandas_categorical_na_type_row_groups(): # ARROW-5085 df = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100}) @@ -3033,6 +3032,20 @@ def test_pandas_categorical_na_type_row_groups(): assert result[1].equals(table[1]) +def test_categorical_roundtrip(): + # ARROW-5480, this was enabled by ARROW-3246 + from io import BytesIO + df = pd.DataFrame({'x': pd.Categorical(['a', 'a', 'b', 'b'])}) + + buf = BytesIO() + df.to_parquet(buf) + + # This reads back object, but I expected category + result = pd.read_parquet(BytesIO(buf.getvalue())) + assert result['x'].dtype == 'category' + tm.assert_frame_equal(result, df) + + @pytest.mark.pandas def test_multi_dataset_metadata(tempdir): filenames = ["ARROW-1983-dataset.0", "ARROW-1983-dataset.1"] From 9e984043a72f7b79035c90d77185ed80c1269e4a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 16 Aug 2019 14:45:20 -0500 Subject: [PATCH 2/4] Improve unit test for out-of-order values, nulls, unobserved category values --- python/pyarrow/tests/test_parquet.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 756cd7f5329..92e770e15fe 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -3035,14 +3035,19 @@ def test_pandas_categorical_na_type_row_groups(): def test_categorical_roundtrip(): # ARROW-5480, this was enabled by ARROW-3246 from io import BytesIO - df = pd.DataFrame({'x': pd.Categorical(['a', 'a', 'b', 'b'])}) + + # Have one of the categories unobserved + codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32') + categories = ['foo', 'bar', 'baz'] + df = pd.DataFrame({'x': pd.Categorical.from_codes( + codes, categories=categories)}) buf = BytesIO() df.to_parquet(buf) - # This reads back object, but I expected category result = pd.read_parquet(BytesIO(buf.getvalue())) - assert result['x'].dtype == 'category' + assert result.x.dtype == 'category' + assert (result.x.cat.categories == categories).all() tm.assert_frame_equal(result, df) From f1f80823599fa387219711a24608f290fc96bd60 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 17 Aug 2019 15:35:10 -0500 Subject: [PATCH 3/4] Don't use pandas's Parquet functions since they don't work in CI for some reason --- python/pyarrow/tests/test_parquet.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 92e770e15fe..c8488471ce7 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -3034,18 +3034,17 @@ def test_pandas_categorical_na_type_row_groups(): def test_categorical_roundtrip(): # ARROW-5480, this was enabled by ARROW-3246 - from io import BytesIO - # Have one of the categories unobserved + # Have one of the categories unobserved and include a null (-1) codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32') categories = ['foo', 'bar', 'baz'] df = pd.DataFrame({'x': pd.Categorical.from_codes( codes, categories=categories)}) - buf = BytesIO() - df.to_parquet(buf) + buf = pa.BufferOutputStream() + pq.write_table(pa.table(df), buf) - result = pd.read_parquet(BytesIO(buf.getvalue())) + result = pq.read_table(buf.getvalue()).to_pandas() assert result.x.dtype == 'category' assert (result.x.cat.categories == categories).all() tm.assert_frame_equal(result, df) From a161b2432fa013ea2803476bd940b510a5a7afc8 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 19 Aug 2019 10:27:26 -0500 Subject: [PATCH 4/4] Add missing pandas marks --- python/pyarrow/tests/test_parquet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index c8488471ce7..2a9850858ec 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -3015,6 +3015,7 @@ def test_dictionary_array_automatically_read(): assert result.schema.metadata is None +@pytest.mark.pandas def test_pandas_categorical_na_type_row_groups(): # ARROW-5085 df = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100}) @@ -3032,7 +3033,8 @@ def test_pandas_categorical_na_type_row_groups(): assert result[1].equals(table[1]) -def test_categorical_roundtrip(): +@pytest.mark.pandas +def test_pandas_categorical_roundtrip(): # ARROW-5480, this was enabled by ARROW-3246 # Have one of the categories unobserved and include a null (-1)