From d476cb2c3aa527d306770400048519367ec54b3d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 29 Oct 2019 12:02:50 +0100 Subject: [PATCH 1/4] ARROW-6999: [Python] Fix unnamed index when specifying schema in Table.from_pandas --- python/pyarrow/pandas_compat.py | 22 ++++++++++++++++++++-- python/pyarrow/tests/test_pandas.py | 21 +++++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 944f92260e6..3ff41b25208 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -410,8 +410,8 @@ def _get_columns_to_convert_given_schema(df, schema, preserve_index): col = df[name] is_index = False except KeyError: - if preserve_index is not False and name in df.index.names: - col = df.index.get_level_values(name) + if preserve_index is not False and _is_index_level(df, name): + col = _get_index_level(df, name) if (preserve_index is None and isinstance(col, _pandas_api.pd.RangeIndex)): raise ValueError( @@ -449,6 +449,24 @@ def _get_columns_to_convert_given_schema(df, schema, preserve_index): index_levels, columns_to_convert, convert_fields) +def _is_index_level(df, name): + if name in df.index.names: + return True + elif _is_generated_index_name(name): + return True + else: + return False + + +def _get_index_level(df, name): + if name in df.index.names: + level = df.index.get_level_values(name) + else: + i = int(name.lstrip("__index_level_").rstrip("__")) + level = df.index.get_level_values(i) + return level + + def _get_range_index_descriptor(level): # public start/stop/step attributes added in pandas 0.25.0 return { diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index fa1f4bc770b..6924e593262 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -2873,6 +2873,27 @@ def test_table_from_pandas_schema_index_columns(): expected_schema=schema, expected=expected) 
+def test_table_from_pandas_schema_index_columns__unnamed_index(): + # ARROW-6999 - unnamed indices in specified schema + df = pd.DataFrame({'a': [1, 2, 3], 'b': [0.1, 0.2, 0.3]}) + + expected_schema = pa.schema([ + ('a', pa.int64()), + ('b', pa.float64()), + ('__index_level_0__', pa.int64()), + ]) + + schema = pa.Schema.from_pandas(df, preserve_index=True) + table = pa.Table.from_pandas(df, preserve_index=True, schema=schema) + assert table.schema.remove_metadata().equals(expected_schema) + + # non-RangeIndex (preserved by default) + df = pd.DataFrame({'a': [1, 2, 3], 'b': [0.1, 0.2, 0.3]}, index=[0, 1, 2]) + schema = pa.Schema.from_pandas(df) + table = pa.Table.from_pandas(df, schema=schema) + assert table.schema.remove_metadata().equals(expected_schema) + + # ---------------------------------------------------------------------- # RecordBatch, Table From 342508605fead62d78eae58a58fc744759674db5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 29 Oct 2019 13:35:12 +0100 Subject: [PATCH 2/4] add docstring --- python/pyarrow/pandas_compat.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 3ff41b25208..8de15bbd8cd 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -450,6 +450,10 @@ def _get_columns_to_convert_given_schema(df, schema, preserve_index): def _is_index_level(df, name): + """ + Check if 'name' (column from a Schema) corresponds to an Index level + (potentially without a name) of the DataFrame. 
+ """ if name in df.index.names: return True elif _is_generated_index_name(name): return True else: return False @@ -459,9 +463,15 @@ def _is_index_level(df, name): def _get_index_level(df, name): + """ + Get the index level of a DataFrame given 'name' (column name in an arrow + Schema) + """ if name in df.index.names: level = df.index.get_level_values(name) else: + # we know we have an autogenerated name => extract number and get + # the index level positionally i = int(name.lstrip("__index_level_").rstrip("__")) level = df.index.get_level_values(i) return level From 2a225ee5d78e7208f96daebfbbc77e081fa2e912 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 30 Oct 2019 11:58:26 +0100 Subject: [PATCH 3/4] simplify + raise more specific error in case index is found and preserve_index=False --- python/pyarrow/pandas_compat.py | 55 ++++++++++++----------------- python/pyarrow/tests/test_pandas.py | 2 +- 2 files changed, 24 insertions(+), 33 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 8de15bbd8cd..968a76418c1 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -410,22 +410,28 @@ def _get_columns_to_convert_given_schema(df, schema, preserve_index): col = df[name] is_index = False except KeyError: - if preserve_index is not False and _is_index_level(df, name): + try: col = _get_index_level(df, name) - if (preserve_index is None and - isinstance(col, _pandas_api.pd.RangeIndex)): - raise ValueError( - "name '{}' is present in the schema, but it is a " - "RangeIndex which will not be converted as a column " - "in the Table, but saved as metadata-only not in " - "columns. 
Specify 'preserve_index=True' to force it " - "being added as a column, or remove it from the " - "specified schema".format(name)) - is_index = True - else: + except (KeyError, IndexError): + # name not found as index level raise KeyError( "name '{}' present in the specified schema is not found " "in the columns or index".format(name)) + if preserve_index is False: + raise ValueError( + "name '{}' present in the specified schema corresponds " + "to the index, but 'preserve_index=False' was " + "specified".format(name)) + elif (preserve_index is None and + isinstance(col, _pandas_api.pd.RangeIndex)): + raise ValueError( + "name '{}' is present in the schema, but it is a " + "RangeIndex which will not be converted as a column " + "in the Table, but saved as metadata-only not in " + "columns. Specify 'preserve_index=True' to force it " + "being added as a column, or remove it from the " + "specified schema".format(name)) + is_index = True name = _column_name_to_strings(name) @@ -449,32 +455,17 @@ def _get_columns_to_convert_given_schema(df, schema, preserve_index): index_levels, columns_to_convert, convert_fields) -def _is_index_level(df, name): - """ - Check if 'name' (column from a Schema) corresponds to an Index level - (potentially without a name) of the DataFrame. - """ - if name in df.index.names: - return True - elif _is_generated_index_name(name): - return True - else: - return False - - def _get_index_level(df, name): """ Get the index level of a DataFrame given 'name' (column name in an arrow - Schema) + Schema). 
""" - if name in df.index.names: - level = df.index.get_level_values(name) - else: + key = name + if name not in df.index.names and _is_generated_index_name(name): # we know we have an autogenerated name => extract number and get # the index level positionally - i = int(name.lstrip("__index_level_").rstrip("__")) - level = df.index.get_level_values(i) - return level + key = int(name.lstrip("__index_level_").rstrip("__")) + return df.index.get_level_values(key) def _get_range_index_descriptor(level): diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 6924e593262..957dfe0d3aa 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -2804,7 +2804,7 @@ def test_table_from_pandas_schema_index_columns(): expected_schema=schema) # schema includes correct index name but preserve_index=False - with pytest.raises(KeyError): + with pytest.raises(ValueError, match="'preserve_index=False' was"): pa.Table.from_pandas(df, schema=schema, preserve_index=False) # in case of preserve_index=None -> RangeIndex serialized as metadata From 199de5550cf37e9fc849eea6bd47b4090d1231d1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 5 Nov 2019 11:27:10 +0100 Subject: [PATCH 4/4] simplify extracting integer --- python/pyarrow/pandas_compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 968a76418c1..5b17e03d2fc 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -464,7 +464,7 @@ def _get_index_level(df, name): if name not in df.index.names and _is_generated_index_name(name): # we know we have an autogenerated name => extract number and get # the index level positionally - key = int(name.lstrip("__index_level_").rstrip("__")) + key = int(name[len("__index_level_"):-2]) return df.index.get_level_values(key)