From d476cb2c3aa527d306770400048519367ec54b3d Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 29 Oct 2019 12:02:50 +0100
Subject: [PATCH 1/4] ARROW-6999: [Python] Fix unnamed index when specifying
 schema in Table.from_pandas

---
 python/pyarrow/pandas_compat.py     | 22 ++++++++++++++++++++--
 python/pyarrow/tests/test_pandas.py | 21 +++++++++++++++++++++
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 944f92260e6..3ff41b25208 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -410,8 +410,8 @@ def _get_columns_to_convert_given_schema(df, schema, preserve_index):
             col = df[name]
             is_index = False
         except KeyError:
-            if preserve_index is not False and name in df.index.names:
-                col = df.index.get_level_values(name)
+            if preserve_index is not False and _is_index_level(df, name):
+                col = _get_index_level(df, name)
                 if (preserve_index is None and
                         isinstance(col, _pandas_api.pd.RangeIndex)):
                     raise ValueError(
@@ -449,6 +449,24 @@ def _get_columns_to_convert_given_schema(df, schema, preserve_index):
             index_levels, columns_to_convert, convert_fields)
 
 
+def _is_index_level(df, name):
+    if name in df.index.names:
+        return True
+    elif _is_generated_index_name(name):
+        return True
+    else:
+        return False
+
+
+def _get_index_level(df, name):
+    if name in df.index.names:
+        level = df.index.get_level_values(name)
+    else:
+        i = int(name.lstrip("__index_level_").rstrip("__"))
+        level = df.index.get_level_values(i)
+    return level
+
+
 def _get_range_index_descriptor(level):
     # public start/stop/step attributes added in pandas 0.25.0
     return {
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index fa1f4bc770b..6924e593262 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -2873,6 +2873,27 @@ def test_table_from_pandas_schema_index_columns():
                             expected_schema=schema, expected=expected)
 
 
+def test_table_from_pandas_schema_index_columns__unnamed_index():
+    # ARROW-6999 - unnamed indices in specified schema
+    df = pd.DataFrame({'a': [1, 2, 3], 'b': [0.1, 0.2, 0.3]})
+
+    expected_schema = pa.schema([
+        ('a', pa.int64()),
+        ('b', pa.float64()),
+        ('__index_level_0__', pa.int64()),
+    ])
+
+    schema = pa.Schema.from_pandas(df, preserve_index=True)
+    table = pa.Table.from_pandas(df, preserve_index=True, schema=schema)
+    assert table.schema.remove_metadata().equals(expected_schema)
+
+    # non-RangeIndex (preserved by default)
+    df = pd.DataFrame({'a': [1, 2, 3], 'b': [0.1, 0.2, 0.3]}, index=[0, 1, 2])
+    schema = pa.Schema.from_pandas(df)
+    table = pa.Table.from_pandas(df, schema=schema)
+    assert table.schema.remove_metadata().equals(expected_schema)
+
+
 # ----------------------------------------------------------------------
 # RecordBatch, Table
 

From 342508605fead62d78eae58a58fc744759674db5 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 29 Oct 2019 13:35:12 +0100
Subject: [PATCH 2/4] add docstring

---
 python/pyarrow/pandas_compat.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 3ff41b25208..8de15bbd8cd 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -450,6 +450,10 @@ def _get_columns_to_convert_given_schema(df, schema, preserve_index):
 
 
 def _is_index_level(df, name):
+    """
+    Check if 'name' (column from a Schema) corresponds to an Index level
+    (potentially without a name) of the DataFrame.
+    """
     if name in df.index.names:
         return True
     elif _is_generated_index_name(name):
@@ -459,9 +463,15 @@ def _is_index_level(df, name):
 
 
 def _get_index_level(df, name):
+    """
+    Get the index level of a DataFrame given 'name' (column name in an arrow
+    Schema)
+    """
     if name in df.index.names:
         level = df.index.get_level_values(name)
     else:
+        # we know we have an autogenerated name => extract number and get
+        # the index level positionally
         i = int(name.lstrip("__index_level_").rstrip("__"))
         level = df.index.get_level_values(i)
     return level

From 2a225ee5d78e7208f96daebfbbc77e081fa2e912 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 30 Oct 2019 11:58:26 +0100
Subject: [PATCH 3/4] simplify + raise more specific error in case index is
 found and preserve_index=False

---
 python/pyarrow/pandas_compat.py     | 55 ++++++++++++-----------------
 python/pyarrow/tests/test_pandas.py |  2 +-
 2 files changed, 24 insertions(+), 33 deletions(-)

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 8de15bbd8cd..968a76418c1 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -410,22 +410,28 @@ def _get_columns_to_convert_given_schema(df, schema, preserve_index):
             col = df[name]
             is_index = False
         except KeyError:
-            if preserve_index is not False and _is_index_level(df, name):
+            try:
                 col = _get_index_level(df, name)
-                if (preserve_index is None and
-                        isinstance(col, _pandas_api.pd.RangeIndex)):
-                    raise ValueError(
-                        "name '{}' is present in the schema, but it is a "
-                        "RangeIndex which will not be converted as a column "
-                        "in the Table, but saved as metadata-only not in "
-                        "columns. Specify 'preserve_index=True' to force it "
-                        "being added as a column, or remove it from the "
-                        "specified schema".format(name))
-                is_index = True
-            else:
+            except (KeyError, IndexError):
+                # name not found as index level
                 raise KeyError(
                     "name '{}' present in the specified schema is not found "
                     "in the columns or index".format(name))
+            if preserve_index is False:
+                raise ValueError(
+                    "name '{}' present in the specified schema corresponds "
+                    "to the index, but 'preserve_index=False' was "
+                    "specified".format(name))
+            elif (preserve_index is None and
+                    isinstance(col, _pandas_api.pd.RangeIndex)):
+                raise ValueError(
+                    "name '{}' is present in the schema, but it is a "
+                    "RangeIndex which will not be converted as a column "
+                    "in the Table, but saved as metadata-only not in "
+                    "columns. Specify 'preserve_index=True' to force it "
+                    "being added as a column, or remove it from the "
+                    "specified schema".format(name))
+            is_index = True
 
         name = _column_name_to_strings(name)
 
@@ -449,32 +455,17 @@ def _get_columns_to_convert_given_schema(df, schema, preserve_index):
             index_levels, columns_to_convert, convert_fields)
 
 
-def _is_index_level(df, name):
-    """
-    Check if 'name' (column from a Schema) corresponds to an Index level
-    (potentially without a name) of the DataFrame.
-    """
-    if name in df.index.names:
-        return True
-    elif _is_generated_index_name(name):
-        return True
-    else:
-        return False
-
-
 def _get_index_level(df, name):
     """
     Get the index level of a DataFrame given 'name' (column name in an arrow
-    Schema)
+    Schema).
     """
-    if name in df.index.names:
-        level = df.index.get_level_values(name)
-    else:
+    key = name
+    if name not in df.index.names and _is_generated_index_name(name):
         # we know we have an autogenerated name => extract number and get
         # the index level positionally
-        i = int(name.lstrip("__index_level_").rstrip("__"))
-        level = df.index.get_level_values(i)
-    return level
+        key = int(name.lstrip("__index_level_").rstrip("__"))
+    return df.index.get_level_values(key)
 
 
 def _get_range_index_descriptor(level):
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 6924e593262..957dfe0d3aa 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -2804,7 +2804,7 @@ def test_table_from_pandas_schema_index_columns():
                             expected_schema=schema)
 
     # schema includes correct index name but preserve_index=False
-    with pytest.raises(KeyError):
+    with pytest.raises(ValueError, match="'preserve_index=False' was"):
         pa.Table.from_pandas(df, schema=schema, preserve_index=False)
 
     # in case of preserve_index=None -> RangeIndex serialized as metadata

From 199de5550cf37e9fc849eea6bd47b4090d1231d1 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 5 Nov 2019 11:27:10 +0100
Subject: [PATCH 4/4] simplify extracting integer

---
 python/pyarrow/pandas_compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 968a76418c1..5b17e03d2fc 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -464,7 +464,7 @@ def _get_index_level(df, name):
     if name not in df.index.names and _is_generated_index_name(name):
         # we know we have an autogenerated name => extract number and get
         # the index level positionally
-        key = int(name.lstrip("__index_level_").rstrip("__"))
+        key = int(name[len("__index_level_"):-2])
     return df.index.get_level_values(key)