From c91f385ac3b3a3de5e3551d24cec4f7e71489654 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 30 May 2023 10:23:37 +0200 Subject: [PATCH 1/9] Add __dataframe__ to the API docs for pa.Table and pa.RecordBatch --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index 19b0c353bdc..8a05641525e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -79,6 +79,7 @@ # Show members for classes in .. autosummary autodoc_default_options = { 'members': None, + 'special-members': '__dataframe__', 'undoc-members': None, 'show-inheritance': None, 'inherited-members': None From 09c0ddb9924700b63b0a1f957ad5e2b6fec48c49 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 30 May 2023 12:36:32 +0200 Subject: [PATCH 2/9] Add from_dataframe to the API docs and add an example --- docs/source/python/api/tables.rst | 12 +++++ python/pyarrow/interchange/from_dataframe.py | 55 ++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/docs/source/python/api/tables.rst b/docs/source/python/api/tables.rst index eadf40cb759..af8a0de4f5d 100644 --- a/docs/source/python/api/tables.rst +++ b/docs/source/python/api/tables.rst @@ -46,6 +46,18 @@ Classes TableGroupBy RecordBatchReader +Dataframe Interchange Protocol +------------------------------ + +.. currentmodule:: pyarrow.interchange + +.. autosummary:: + :toctree: ../generated/ + + from_dataframe + +.. currentmodule:: pyarrow + .. 
_api.tensor: Tensors diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index 801d0dd452a..4f376c2fe95 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -74,6 +74,61 @@ def from_dataframe(df: DataFrameObject, allow_copy=True) -> pa.Table: Returns ------- pa.Table + + Examples + -------- + >>> import pyarrow + >>> from pyarrow.interchange import from_dataframe + + Convert a pandas dataframe to a pyarrow table: + + >>> import pandas as pd + >>> df = pd.DataFrame({ + ... "n_atendees": [100, 10, 1], + ... "country": ["Italy", "Spain", "Slovenia"], + ... }) + >>> df + n_atendees country + 0 100 Italy + 1 10 Spain + 2 1 Slovenia + >>> from_dataframe(df) + pyarrow.Table + n_atendees: int64 + country: large_string + ---- + n_atendees: [[100,10,1]] + country: [["Italy","Spain","Slovenia"]] + + Convert a polars dataframe to a pyarrow table: + + >>> import polars as pl + >>> from datetime import datetime + >>> arr = [datetime(2023, 5, 20, 10, 0), + ... datetime(2023, 5, 20, 11, 0), + ... datetime(2023, 5, 20, 13, 30)] + >>> df = pl.DataFrame({ + ... 'Talk': ['About Polars','Intro into PyArrow','Coding in Rust'], + ... 'Time': arr, + ... 
}) + >>> df + shape: (3, 2) + ┌────────────────────┬─────────────────────┐ + │ Talk ┆ Time │ + │ --- ┆ --- │ + │ str ┆ datetime[μs] │ + ╞════════════════════╪═════════════════════╡ + │ About Polars ┆ 2023-05-20 10:00:00 │ + │ Intro into PyArrow ┆ 2023-05-20 11:00:00 │ + │ Coding in Rust ┆ 2023-05-20 13:30:00 │ + └────────────────────┴─────────────────────┘ + >>> from_dataframe(df) + pyarrow.Table + Talk: large_string + Time: timestamp[us] + ---- + Talk: [["About Polars","Intro into PyArrow","Coding in Rust"]] + Time: [[2023-05-20 10:00:00.000000,2023-05-20 11:00:00.000000,2023-05-20 13:30:00.000000]] """ if isinstance(df, pa.Table): return df From 3d35cfe077e589b8faef963b6a5fe8ddac79076a Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 31 May 2023 08:49:37 +0200 Subject: [PATCH 3/9] Add a page to the Python User Guide --- docs/source/python/index.rst | 1 + docs/source/python/interchange_protocol.rst | 115 ++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 docs/source/python/interchange_protocol.rst diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst index 77cfaef4a40..b80cbc7de59 100644 --- a/docs/source/python/index.rst +++ b/docs/source/python/index.rst @@ -47,6 +47,7 @@ files into Arrow structures. filesystems_deprecated numpy pandas + interchange_protocol timestamps orc csv diff --git a/docs/source/python/interchange_protocol.rst b/docs/source/python/interchange_protocol.rst new file mode 100644 index 00000000000..9d3118aef1a --- /dev/null +++ b/docs/source/python/interchange_protocol.rst @@ -0,0 +1,115 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. 
You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Dataframe Interchange Protocol +============================== + +The interchange protocol is implemented for ``pa.Table`` and +``pa.RecordBatch`` and is used to interchange data between +PyArrow and other dataframe libraries that also have the +protocol implemented. The data structures that are supported +in the protocol are primitive data types plus the dictionary +data type. The protocol also has missing data support and +it supports chunking, meaning accessing the +data in “batches” of rows. + + +Python dataframe interchange protocol is designed by the +`Consortium for Python Data API Standards <https://data-apis.org/>`_ +in order to enable data interchange between dataframe +libraries in the Python ecosystem. See more about the +standard in the +`protocol documentation <https://data-apis.org/dataframe-protocol/latest/index.html>`_. + +``__dataframe__()`` method +-------------------------- + +``__dataframe__()`` method creates a new exchange object that +the consumer library can take and construct an object of its own. + +.. code-block:: + + >>> import pyarrow as pa + >>> table = pa.table({"n_atendees": [100, 10, 1]}) + >>> table.__dataframe__() + <pyarrow.interchange.dataframe._PyArrowDataFrame object at ...> + +from_dataframe() method +----------------------- + +With ``from_dataframe()`` method, we can construct a ``pa.table`` +from any dataframe object that implements the +``__dataframe__()`` method via the dataframe interchange +protocol. + +We can for example take a pandas dataframe and construct a +pyarrow table with the use of the interchange protocol: + +.. 
code-block:: + + >>> import pyarrow + >>> from pyarrow.interchange import from_dataframe + + >>> import pandas as pd + >>> df = pd.DataFrame({ + ... "n_atendees": [100, 10, 1], + ... "country": ["Italy", "Spain", "Slovenia"], + ... }) + >>> df + n_atendees country + 0 100 Italy + 1 10 Spain + 2 1 Slovenia + >>> from_dataframe(df) + pyarrow.Table + n_atendees: int64 + country: large_string + ---- + n_atendees: [[100,10,1]] + country: [["Italy","Spain","Slovenia"]] + +We can do the same with polars dataframe: + +.. code-block:: + + >>> import polars as pl + >>> from datetime import datetime + >>> arr = [datetime(2023, 5, 20, 10, 0), + ... datetime(2023, 5, 20, 11, 0), + ... datetime(2023, 5, 20, 13, 30)] + >>> df = pl.DataFrame({ + ... 'Talk': ['About Polars','Intro into PyArrow','Coding in Rust'], + ... 'Time': arr, + ... }) + >>> df + shape: (3, 2) + ┌────────────────────┬─────────────────────┐ + │ Talk ┆ Time │ + │ --- ┆ --- │ + │ str ┆ datetime[μs] │ + ╞════════════════════╪═════════════════════╡ + │ About Polars ┆ 2023-05-20 10:00:00 │ + │ Intro into PyArrow ┆ 2023-05-20 11:00:00 │ + │ Coding in Rust ┆ 2023-05-20 13:30:00 │ + └────────────────────┴─────────────────────┘ + >>> from_dataframe(df) + pyarrow.Table + Talk: large_string + Time: timestamp[us] + ---- + Talk: [["About Polars","Intro into PyArrow","Coding in Rust"]] + Time: [[2023-05-20 10:00:00.000000,2023-05-20 11:00:00.000000,2023-05-20 13:30:00.000000]] From c3abb8e3708d871cec6fa0d44d2dbde65313355e Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 31 May 2023 10:57:26 +0200 Subject: [PATCH 4/9] Apply suggestions from code review - Joris Co-authored-by: Joris Van den Bossche --- docs/source/python/interchange_protocol.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/python/interchange_protocol.rst b/docs/source/python/interchange_protocol.rst index 9d3118aef1a..603f8fcb7f8 100644 --- a/docs/source/python/interchange_protocol.rst +++ 
b/docs/source/python/interchange_protocol.rst @@ -28,7 +28,7 @@ it supports chunking, meaning accessing the data in “batches” of rows. -Python dataframe interchange protocol is designed by the +The Python dataframe interchange protocol is designed by the `Consortium for Python Data API Standards <https://data-apis.org/>`_ in order to enable data interchange between dataframe libraries in the Python ecosystem. See more about the @@ -38,7 +38,7 @@ standard in the ``__dataframe__()`` method -------------------------- -``__dataframe__()`` method creates a new exchange object that +The ``__dataframe__()`` method creates a new exchange object that the consumer library can take and construct an object of its own. .. code-block:: @@ -51,7 +51,7 @@ the consumer library can take and construct an object of its own. from_dataframe() method ----------------------- -With ``from_dataframe()`` method, we can construct a ``pa.table`` +With ``from_dataframe()`` method, we can construct a :class:`pyarrow.Table` from any dataframe object that implements the ``__dataframe__()`` method via the dataframe interchange protocol. From f0d59171f64db5a3b5e8f67b534cb014b032ee2e Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 31 May 2023 15:08:59 +0200 Subject: [PATCH 5/9] Remove polars ex from the docstring --- python/pyarrow/interchange/from_dataframe.py | 30 -------------------- 1 file changed, 30 deletions(-) diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index 4f376c2fe95..1d41aa8d7ee 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -99,36 +99,6 @@ def from_dataframe(df: DataFrameObject, allow_copy=True) -> pa.Table: ---- n_atendees: [[100,10,1]] country: [["Italy","Spain","Slovenia"]] - - Convert a polars dataframe to a pyarrow table: - - >>> import polars as pl - >>> from datetime import datetime - >>> arr = [datetime(2023, 5, 20, 10, 0), - ... datetime(2023, 5, 20, 11, 0), - ... 
datetime(2023, 5, 20, 13, 30)] - >>> df = pl.DataFrame({ - ... 'Talk': ['About Polars','Intro into PyArrow','Coding in Rust'], - ... 'Time': arr, - ... }) - >>> df - shape: (3, 2) - ┌────────────────────┬─────────────────────┐ - │ Talk ┆ Time │ - │ --- ┆ --- │ - │ str ┆ datetime[μs] │ - ╞════════════════════╪═════════════════════╡ - │ About Polars ┆ 2023-05-20 10:00:00 │ - │ Intro into PyArrow ┆ 2023-05-20 11:00:00 │ - │ Coding in Rust ┆ 2023-05-20 13:30:00 │ - └────────────────────┴─────────────────────┘ - >>> from_dataframe(df) - pyarrow.Table - Talk: large_string - Time: timestamp[us] - ---- - Talk: [["About Polars","Intro into PyArrow","Coding in Rust"]] - Time: [[2023-05-20 10:00:00.000000,2023-05-20 11:00:00.000000,2023-05-20 13:30:00.000000]] """ if isinstance(df, pa.Table): return df From c5afa7b05308b3ffefc61988a3f7d6354bb83a24 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 31 May 2023 15:09:47 +0200 Subject: [PATCH 6/9] Remove the shifting of currentmodules in tables.rst --- docs/source/python/api/tables.rst | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/source/python/api/tables.rst b/docs/source/python/api/tables.rst index af8a0de4f5d..ae9f5de127d 100644 --- a/docs/source/python/api/tables.rst +++ b/docs/source/python/api/tables.rst @@ -49,14 +49,10 @@ Classes Dataframe Interchange Protocol ------------------------------ -.. currentmodule:: pyarrow.interchange - .. autosummary:: :toctree: ../generated/ - from_dataframe - -.. currentmodule:: pyarrow + interchange.from_dataframe .. 
_api.tensor: From 913ca86f384149f1eef12330a42462ed7cd7b7b2 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 31 May 2023 15:14:33 +0200 Subject: [PATCH 7/9] Change the titles --- docs/source/python/interchange_protocol.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/python/interchange_protocol.rst b/docs/source/python/interchange_protocol.rst index 603f8fcb7f8..ae2093a4192 100644 --- a/docs/source/python/interchange_protocol.rst +++ b/docs/source/python/interchange_protocol.rst @@ -35,8 +35,8 @@ libraries in the Python ecosystem. See more about the standard in the `protocol documentation <https://data-apis.org/dataframe-protocol/latest/index.html>`_. -``__dataframe__()`` method --------------------------- +From pyarrow to other libraries: ``__dataframe__()`` method +----------------------------------------------------------- The ``__dataframe__()`` method creates a new exchange object that the consumer library can take and construct an object of its own. @@ -48,8 +48,8 @@ the consumer library can take and construct an object of its own. >>> table.__dataframe__() <pyarrow.interchange.dataframe._PyArrowDataFrame object at ...> -from_dataframe() method ------------------------ +From other libraries to pyarrow: ``from_dataframe()`` +----------------------------------------------------- With ``from_dataframe()`` method, we can construct a :class:`pyarrow.Table` from any dataframe object that implements the From ebc6ac18a58b7d4fafb0ce6a8ee560c66e285b42 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 31 May 2023 15:18:06 +0200 Subject: [PATCH 8/9] Add note about the dunder method and calling it manually --- docs/source/python/interchange_protocol.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/source/python/interchange_protocol.rst b/docs/source/python/interchange_protocol.rst index ae2093a4192..ac7f0f53027 100644 --- a/docs/source/python/interchange_protocol.rst +++ b/docs/source/python/interchange_protocol.rst @@ -48,6 +48,10 @@ the consumer library can take and construct an object of its own. 
>>> table.__dataframe__() <pyarrow.interchange.dataframe._PyArrowDataFrame object at ...> +This is meant to be used by the consumer library when calling +``from_dataframe()`` method and is not meant to be used manually +by the user. + From other libraries to pyarrow: ``from_dataframe()`` ----------------------------------------------------- @@ -82,7 +86,7 @@ pyarrow table with the use of the interchange protocol: n_atendees: [[100,10,1]] country: [["Italy","Spain","Slovenia"]] -We can do the same with polars dataframe: +We can do the same with a polars dataframe: .. code-block:: From 81f80ddf65ec01c83c76cbeb017c7c5dbc3b9c7b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 6 Jun 2023 12:30:53 +0200 Subject: [PATCH 9/9] Apply suggestions from code review --- docs/source/python/interchange_protocol.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/python/interchange_protocol.rst b/docs/source/python/interchange_protocol.rst index ac7f0f53027..7784d78619e 100644 --- a/docs/source/python/interchange_protocol.rst +++ b/docs/source/python/interchange_protocol.rst @@ -49,13 +49,13 @@ the consumer library can take and construct an object of its own. <pyarrow.interchange.dataframe._PyArrowDataFrame object at ...> This is meant to be used by the consumer library when calling -``from_dataframe()`` method and is not meant to be used manually +the ``from_dataframe()`` function and is not meant to be used manually by the user. From other libraries to pyarrow: ``from_dataframe()`` ----------------------------------------------------- -With ``from_dataframe()`` method, we can construct a :class:`pyarrow.Table` +With the ``from_dataframe()`` function, we can construct a :class:`pyarrow.Table` from any dataframe object that implements the ``__dataframe__()`` method via the dataframe interchange protocol.