From 620b3b8588bcd46e66f3e98115594c5cc64176d8 Mon Sep 17 00:00:00 2001
From: Wes McKinney <wesm+git@apache.org>
Date: Fri, 16 Aug 2019 13:52:29 -0500
Subject: [PATCH 1/4] Add unit test for ARROW-5480

---
 python/pyarrow/tests/test_parquet.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 12288bbda68..756cd7f5329 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -3015,7 +3015,6 @@ def test_dictionary_array_automatically_read():
     assert result.schema.metadata is None
 
 
-@pytest.mark.pandas
 def test_pandas_categorical_na_type_row_groups():
     # ARROW-5085
     df = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100})
@@ -3033,6 +3032,20 @@ def test_pandas_categorical_na_type_row_groups():
     assert result[1].equals(table[1])
 
 
+def test_categorical_roundtrip():
+    # ARROW-5480, this was enabled by ARROW-3246
+    from io import BytesIO
+    df = pd.DataFrame({'x': pd.Categorical(['a', 'a', 'b', 'b'])})
+
+    buf = BytesIO()
+    df.to_parquet(buf)
+
+    # This reads back object, but I expected category
+    result = pd.read_parquet(BytesIO(buf.getvalue()))
+    assert result['x'].dtype == 'category'
+    tm.assert_frame_equal(result, df)
+
+
 @pytest.mark.pandas
 def test_multi_dataset_metadata(tempdir):
     filenames = ["ARROW-1983-dataset.0", "ARROW-1983-dataset.1"]

From 9e984043a72f7b79035c90d77185ed80c1269e4a Mon Sep 17 00:00:00 2001
From: Wes McKinney <wesm+git@apache.org>
Date: Fri, 16 Aug 2019 14:45:20 -0500
Subject: [PATCH 2/4] Improve unit test for out-of-order values, nulls,
 unobserved category values

---
 python/pyarrow/tests/test_parquet.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 756cd7f5329..92e770e15fe 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -3035,14 +3035,19 @@ def test_pandas_categorical_na_type_row_groups():
 def test_categorical_roundtrip():
     # ARROW-5480, this was enabled by ARROW-3246
     from io import BytesIO
-    df = pd.DataFrame({'x': pd.Categorical(['a', 'a', 'b', 'b'])})
+
+    # Have one of the categories unobserved
+    codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32')
+    categories = ['foo', 'bar', 'baz']
+    df = pd.DataFrame({'x': pd.Categorical.from_codes(
+        codes, categories=categories)})
 
     buf = BytesIO()
     df.to_parquet(buf)
 
-    # This reads back object, but I expected category
     result = pd.read_parquet(BytesIO(buf.getvalue()))
-    assert result['x'].dtype == 'category'
+    assert result.x.dtype == 'category'
+    assert (result.x.cat.categories == categories).all()
     tm.assert_frame_equal(result, df)
 
 

From f1f80823599fa387219711a24608f290fc96bd60 Mon Sep 17 00:00:00 2001
From: Wes McKinney <wesm+git@apache.org>
Date: Sat, 17 Aug 2019 15:35:10 -0500
Subject: [PATCH 3/4] Don't use pandas's Parquet functions since they don't
 work in CI for some reason

---
 python/pyarrow/tests/test_parquet.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 92e770e15fe..c8488471ce7 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -3034,18 +3034,17 @@ def test_pandas_categorical_na_type_row_groups():
 
 def test_categorical_roundtrip():
     # ARROW-5480, this was enabled by ARROW-3246
-    from io import BytesIO
 
-    # Have one of the categories unobserved
+    # Have one of the categories unobserved and include a null (-1)
     codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32')
     categories = ['foo', 'bar', 'baz']
     df = pd.DataFrame({'x': pd.Categorical.from_codes(
         codes, categories=categories)})
 
-    buf = BytesIO()
-    df.to_parquet(buf)
+    buf = pa.BufferOutputStream()
+    pq.write_table(pa.table(df), buf)
 
-    result = pd.read_parquet(BytesIO(buf.getvalue()))
+    result = pq.read_table(buf.getvalue()).to_pandas()
     assert result.x.dtype == 'category'
     assert (result.x.cat.categories == categories).all()
     tm.assert_frame_equal(result, df)

From a161b2432fa013ea2803476bd940b510a5a7afc8 Mon Sep 17 00:00:00 2001
From: Wes McKinney <wesm+git@apache.org>
Date: Mon, 19 Aug 2019 10:27:26 -0500
Subject: [PATCH 4/4] Add missing pandas marks

---
 python/pyarrow/tests/test_parquet.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index c8488471ce7..2a9850858ec 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -3015,6 +3015,7 @@ def test_dictionary_array_automatically_read():
     assert result.schema.metadata is None
 
 
+@pytest.mark.pandas
 def test_pandas_categorical_na_type_row_groups():
     # ARROW-5085
     df = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100})
@@ -3032,7 +3033,8 @@ def test_pandas_categorical_na_type_row_groups():
     assert result[1].equals(table[1])
 
 
-def test_categorical_roundtrip():
+@pytest.mark.pandas
+def test_pandas_categorical_roundtrip():
     # ARROW-5480, this was enabled by ARROW-3246
 
     # Have one of the categories unobserved and include a null (-1)