From 854e114f45ec52869cfec2b3bc14f53377b91a9a Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 25 Oct 2022 11:51:28 +0200 Subject: [PATCH 01/29] Initial sceleton for interchange package --- python/pyarrow/interchange/__init__.py | 16 + python/pyarrow/interchange/buffer.py | 72 +++ python/pyarrow/interchange/column.py | 193 +++++++ python/pyarrow/interchange/dataframe.py | 89 ++++ .../pyarrow/interchange/dataframe_protocol.py | 493 ++++++++++++++++++ python/pyarrow/interchange/from_dataframe.py | 253 +++++++++ 6 files changed, 1116 insertions(+) create mode 100644 python/pyarrow/interchange/__init__.py create mode 100644 python/pyarrow/interchange/buffer.py create mode 100644 python/pyarrow/interchange/column.py create mode 100644 python/pyarrow/interchange/dataframe.py create mode 100644 python/pyarrow/interchange/dataframe_protocol.py create mode 100644 python/pyarrow/interchange/from_dataframe.py diff --git a/python/pyarrow/interchange/__init__.py b/python/pyarrow/interchange/__init__.py new file mode 100644 index 00000000000..d216be4ddc9 --- /dev/null +++ b/python/pyarrow/interchange/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. \ No newline at end of file diff --git a/python/pyarrow/interchange/buffer.py b/python/pyarrow/interchange/buffer.py new file mode 100644 index 00000000000..06803aaf299 --- /dev/null +++ b/python/pyarrow/interchange/buffer.py @@ -0,0 +1,72 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from dataframe_protocol import ( + Buffer, + DlpackDeviceType, +) +import numpy as np + +class PyArrowBuffer(Buffer): + """ + Data in the buffer is guaranteed to be contiguous in memory. + """ + + def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: + """ + Handle only regular columns (= numpy arrays) for now. + """ + pass + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + pass + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. 
+ """ + pass + + def __dlpack__(self): + """ + Represent this structure as DLPack interface. + """ + pass + + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides. + """ + pass + + def __repr__(self) -> str: + return ( + "PyArrowBuffer(" + + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "device": self.__dlpack_device__()[0].name, + } + ) + + ")" + ) \ No newline at end of file diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py new file mode 100644 index 00000000000..eb63f014187 --- /dev/null +++ b/python/pyarrow/interchange/column.py @@ -0,0 +1,193 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any + +import pyarrow as pa + +from buffer import PyArrowBuffer +from dataframe_protocol import ( + Column, + ColumnBuffers, + DtypeKind, +) + +class PyArrowColumn(Column): + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + + def __init__(self, column: pa.Array, allow_copy: bool = True) -> None: + """ + Note: doesn't deal with extension arrays yet, just assume a regular + Series/ndarray for now. + """ + pass + + def size(self) -> int: + """ + Size of the column, in elements. + """ + pass + + @property + def offset(self) -> int: + """ + Offset of first element. Always zero. + """ + pass + + @property + def dtype(self) -> tuple[DtypeKind, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for bit + masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the future + we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. 
+ - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, decimal, + and nested (list, struct, map, union) dtypes. + """ + pass + + def _dtype_from_arrowdtype(self, dtype) -> tuple[DtypeKind, int, str, str]: + """ + See `self.dtype` for details. + """ + # Note: 'c' (complex) not handled yet (not in array spec v1). + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled + # datetime and timedelta both map to datetime (is timedelta handled?) + + pass + + @property + def describe_categorical(self): + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate non-categorical Column encoding for categorical values. + Raises TypeError if the dtype is not categorical + Content of returned dict: + - "is_ordered" : bool, whether the ordering of dictionary indices is + semantically meaningful. + - "is_dictionary" : bool, whether a dictionary-style mapping of + categorical values to other objects exists + - "categories" : Column representing the (implicit) mapping of indices to + category values (e.g. an array of cat1, cat2, ...). + None if not a dictionary-style categorical. + """ + pass + + @property + def describe_null(self): + pass + + @property + def null_count(self) -> int: + """ + Number of null elements. Should always be known. + """ + pass + + @property + def metadata(self) -> dict[str, pd.Index]: + """ + Store specific metadata of the column. + """ + pass + + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + pass + + def get_chunks(self, n_chunks: int | None = None): + """ + Return an iterator yielding the chunks. + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + pass + + def get_buffers(self) -> ColumnBuffers: + """ + Return a dictionary containing the underlying buffers. + The returned dictionary has the following contents: + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + pass + + def _get_data_buffer( + self, + ) -> tuple[PyArrowBuffer, Any]: # Any is for self.dtype tuple + """ + Return the buffer containing the data and the buffer's associated dtype. + """ + pass + + def _get_validity_buffer(self) -> tuple[PyArrowBuffer, Any]: + """ + Return the buffer containing the mask values indicating missing data and + the buffer's associated dtype. + Raises NoBufferPresent if null representation is not a bit or byte mask. + """ + pass + + def _get_offsets_buffer(self) -> tuple[PyArrowBuffer, Any]: + """ + Return the buffer containing the offset values for variable-size binary + data (e.g., variable-length strings) and the buffer's associated dtype. 
+ Raises NoBufferPresent if the data buffer does not have an associated + offsets buffer. + """ + pass diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py new file mode 100644 index 00000000000..c14ae49758a --- /dev/null +++ b/python/pyarrow/interchange/dataframe.py @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pyarrow as pa + +from column import PandasColumn +from dataframe_protocol import DataFrame as DataFrameXchg + +class PyArrowTableXchg(DataFrameXchg): + """ + A data frame class, with only the methods required by the interchange + protocol defined. + Instances of this (private) class are returned from + ``pd.DataFrame.__dataframe__`` as objects with the methods and + attributes defined on this class. + """ + + def __init__( + self, df: pa.Table, nan_as_null: bool = False, allow_copy: bool = True + ) -> None: + """ + Constructor - an instance of this (private) class is returned from + `pd.DataFrame.__dataframe__`. + """ + self._df = df + # ``nan_as_null`` is a keyword intended for the consumer to tell the + # producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + # This currently has no effect; once support for nullable extension + # dtypes is added, this value should be propagated to columns. + self._nan_as_null = nan_as_null + self._allow_copy = allow_copy + + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> PyArrowTableXchg: + return PyArrowTableXchg(self._df, nan_as_null, allow_copy) + + @property + def metadata(self) -> dict[str, Index]: + # `index` isn't a regular column, and the protocol doesn't support row + # labels - so we export it as Pandas-specific metadata here. + pass + + def num_columns(self) -> int: + pass + + def num_rows(self) -> int: + pass + + def num_chunks(self) -> int: + pass + + def column_names(self) -> Index: + pass + + def get_column(self, i: int) -> PyArrowColumn: + pass + + def get_column_by_name(self, name: str) -> PyArrowColumn: + pass + + def get_columns(self) -> list[PyArrowColumn]: + pass + + def select_columns(self, indices) -> PyArrowTableFrameXchg: + pass + + def select_columns_by_name(self, names) -> PyArrowTableFrameXchg: + pass + + def get_chunks(self, n_chunks=None): + """ + Return an iterator yielding the chunks. + """ + pass \ No newline at end of file diff --git a/python/pyarrow/interchange/dataframe_protocol.py b/python/pyarrow/interchange/dataframe_protocol.py new file mode 100644 index 00000000000..f12ae4e7ae8 --- /dev/null +++ b/python/pyarrow/interchange/dataframe_protocol.py @@ -0,0 +1,493 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Copy of the spec from https://github.com/data-apis/dataframe-api +""" + +from abc import ( + ABC, + abstractmethod, +) +import enum +from typing import ( + Any, + Dict, + Iterable, + Optional, + Sequence, + Tuple, + TypedDict, +) + + +class DlpackDeviceType(enum.IntEnum): + """Integer enum for device type codes matching DLPack.""" + + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype + + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: Tuple["Buffer", Dtype] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + validity: Optional[Tuple["Buffer", Dtype]] + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: Optional[Tuple["Buffer", Dtype]] + + +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. + categories: Optional[Column] + + +class Buffer(ABC): + """ + Data in the buffer is guaranteed to be contiguous in memory. 
+ Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. + This distinction is useful to support both data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. + """ + + @property + @abstractmethod + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + pass + + @property + @abstractmethod + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + pass + + @abstractmethod + def __dlpack__(self): + """ + Produce DLPack capsule (see array API standard). + Raises: + - TypeError : if the buffer contains unsupported dtypes. + - NotImplementedError : if DLPack support is not implemented + Useful to have to connect to array libraries. Support optional because + it's not completely trivial to implement for a Python-only library. + """ + raise NotImplementedError("__dlpack__") + + @abstractmethod + def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]: + """ + Device type and device ID for where the data in the buffer resides. + Uses device type codes matching DLPack. + Note: must be implemented even if ``__dlpack__`` is not. + """ + pass + + +class Column(ABC): + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. 
+ """ + + @abstractmethod + def size(self) -> int: + """ + Size of the column, in elements. + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. + """ + pass + + @property + @abstractmethod + def offset(self) -> int: + """ + Offset of first element. + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + pass + + @property + @abstractmethod + def dtype(self) -> Dtype: + """ + Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for bit + masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the future + we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, decimal, + and nested (list, struct, map, union) dtypes. + """ + pass + + @property + @abstractmethod + def describe_categorical(self) -> CategoricalDescription: + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate non-categorical Column encoding categorical values. + Raises TypeError if the dtype is not categorical + Returns the dictionary with description on how to interpret the data buffer: + - "is_ordered" : bool, whether the ordering of dictionary indices is + semantically meaningful. + - "is_dictionary" : bool, whether a mapping of + categorical values to other objects exists + - "categories" : Column representing the (implicit) mapping of indices to + category values (e.g. an array of cat1, cat2, ...). + None if not a dictionary-style categorical. + TBD: are there any other in-memory representations that are needed? + """ + pass + + @property + @abstractmethod + def describe_null(self) -> Tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. None + otherwise. + """ + pass + + @property + @abstractmethod + def null_count(self) -> Optional[int]: + """ + Number of null elements, if known. + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. + """ + pass + + @property + @abstractmethod + def metadata(self) -> Dict[str, Any]: + """ + The metadata for the column. 
See `DataFrame.metadata` for more details. + """ + pass + + @abstractmethod + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + pass + + @abstractmethod + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: + """ + Return an iterator yielding the chunks. + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + pass + + @abstractmethod + def get_buffers(self) -> ColumnBuffers: + """ + Return a dictionary containing the underlying buffers. + The returned dictionary has the following contents: + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + pass + + +# def get_children(self) -> Iterable[Column]: +# """ +# Children columns underneath the column, each object in this iterator +# must adhere to the column specification. +# """ +# pass + + +class DataFrame(ABC): + """ + A data frame class, with only the methods required by the interchange + protocol defined. + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. + Columns may be accessed by name or by position. + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. + """ + + version = 0 # version of the protocol + + @abstractmethod + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> "DataFrame": + """ + Construct a new exchange object, potentially changing the parameters. + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN``. + It is intended for cases where the consumer does not support the bit + mask or byte mask that is the producer's native representation. + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this protocol + specifies contiguous buffers. + """ + pass + + @property + @abstractmethod + def metadata(self) -> Dict[str, Any]: + """ + The metadata for the data frame, as a dictionary with string keys. The + contents of `metadata` may be anything, they are meant for a library + to store information that it needs to, e.g., roundtrip losslessly or + for two implementations to share data that is not (yet) part of the + interchange protocol specification. For avoiding collisions with other + entries, please add name the keys with the name of the library + followed by a period and the desired name, e.g, ``pandas.indexcol``. + """ + pass + + @abstractmethod + def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. 
+ """ + pass + + @abstractmethod + def num_rows(self) -> Optional[int]: + # TODO: not happy with Optional, but need to flag it may be expensive + # why include it if it may be None - what do we expect consumers + # to do here? + """ + Return the number of rows in the DataFrame, if available. + """ + pass + + @abstractmethod + def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of. + """ + pass + + @abstractmethod + def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + """ + pass + + @abstractmethod + def get_column(self, i: int) -> Column: + """ + Return the column at the indicated position. + """ + pass + + @abstractmethod + def get_column_by_name(self, name: str) -> Column: + """ + Return the column whose name is the indicated name. + """ + pass + + @abstractmethod + def get_columns(self) -> Iterable[Column]: + """ + Return an iterator yielding the columns. + """ + pass + + @abstractmethod + def select_columns(self, indices: Sequence[int]) -> "DataFrame": + """ + Create a new DataFrame by selecting a subset of columns by index. + """ + pass + + @abstractmethod + def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": + """ + Create a new DataFrame by selecting a subset of columns by name. + """ + pass + + @abstractmethod + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["DataFrame"]: + """ + Return an iterator yielding the chunks. + By default (None), yields the chunks that the data is stored as by the + producer. If given, ``n_chunks`` must be a multiple of + ``self.num_chunks()``, meaning the producer must subdivide each chunk + before yielding it. + Note that the producer must ensure that all columns are chunked the + same way. + """ + pass diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py new file mode 100644 index 00000000000..a0d5179a053 --- /dev/null +++ b/python/pyarrow/interchange/from_dataframe.py @@ -0,0 +1,253 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from column import PyArrowColumn +from dataframe_protocol import ( + Buffer, + Column, + ColumnNullType, + DataFrame as DataFrameXchg, + DtypeKind, +) +import pyarrow as pa + + +def from_dataframe(df, allow_copy=True) -> pd.DataFrame: + """ + Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol. + Parameters + ---------- + df : DataFrameXchg + Object supporting the interchange protocol, i.e. `__dataframe__` method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). 
+ Returns + ------- + pd.DataFrame + """ + if isinstance(df, pa.Table): + return df + + if not hasattr(df, "__dataframe__"): + raise ValueError("`df` does not support __dataframe__") + + return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) + + +def _from_dataframe(df: DataFrameXchg, allow_copy=True): + """ + Build a ``pd.DataFrame`` from the DataFrame interchange object. + Parameters + ---------- + df : DataFrameXchg + Object supporting the interchange protocol, i.e. `__dataframe__` method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + Returns + ------- + pd.DataFrame + """ + pass + + +def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: + """ + Convert interchange protocol chunk to ``pd.DataFrame``. + Parameters + ---------- + df : DataFrameXchg + Returns + ------- + pd.DataFrame + """ + # We need a dict of columns here, with each column being a NumPy array (at + # least for now, deal with non-NumPy dtypes later). + columns: dict[str, Any] = {} + buffers = [] # hold on to buffers, keeps memory alive + for name in df.column_names(): + if not isinstance(name, str): + raise ValueError(f"Column {name} is not a string") + if name in columns: + raise ValueError(f"Column {name} is not unique") + col = df.get_column_by_name(name) + dtype = col.dtype[0] + if dtype in ( + DtypeKind.INT, + DtypeKind.UINT, + DtypeKind.FLOAT, + DtypeKind.BOOL, + ): + columns[name], buf = primitive_column_to_ndarray(col) + elif dtype == DtypeKind.CATEGORICAL: + columns[name], buf = categorical_column_to_series(col) + elif dtype == DtypeKind.STRING: + columns[name], buf = string_column_to_ndarray(col) + elif dtype == DtypeKind.DATETIME: + columns[name], buf = datetime_column_to_ndarray(col) + else: + raise NotImplementedError(f"Data type {dtype} not handled yet") + + buffers.append(buf) + + pass + + +def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: + """ + Convert a column holding one of the primitive dtypes to a NumPy array. + A primitive type is one of: int, uint, float, bool. + Parameters + ---------- + col : Column + Returns + ------- + tuple + Tuple of np.ndarray holding the data and the memory owner object + that keeps the memory alive. + """ + pass + + +def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: + """ + Convert a column holding categorical data to a pandas Series. + Parameters + ---------- + col : Column + Returns + ------- + tuple + Tuple of pd.Series holding the data and the memory owner object + that keeps the memory alive. + """ + pass + + +def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: + """ + Convert a column holding string data to a NumPy array. + Parameters + ---------- + col : Column + Returns + ------- + tuple + Tuple of np.ndarray holding the data and the memory owner object + that keeps the memory alive. + """ + pass + + +def parse_datetime_format_str(format_str, data): + """Parse datetime `format_str` to interpret the `data`.""" + pass + + +def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: + """ + Convert a column holding DateTime data to a NumPy array. + Parameters + ---------- + col : Column + Returns + ------- + tuple + Tuple of np.ndarray holding the data and the memory owner object + that keeps the memory alive. 
+ """ + pass + + +def buffer_to_ndarray( + buffer: Buffer, + dtype: tuple[DtypeKind, int, str, str], + offset: int = 0, + length: int | None = None, +) -> np.ndarray: + """ + Build a NumPy array from the passed buffer. + Parameters + ---------- + buffer : Buffer + Buffer to build a NumPy array from. + dtype : tuple + Data type of the buffer conforming protocol dtypes format. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + length : int, optional + If the buffer is a bit-mask, specifies a number of bits to read + from the buffer. Has no effect otherwise. + Returns + ------- + np.ndarray + Notes + ----- + The returned array doesn't own the memory. The caller of this function is + responsible for keeping the memory owner object alive as long as + the returned NumPy array is being used. + """ + pass + + +def bitmask_to_bool_ndarray( + bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0 +) -> np.ndarray: + """ + Convert bit-mask to a boolean NumPy array. + Parameters + ---------- + bitmask : np.ndarray[uint8] + NumPy array of uint8 dtype representing the bitmask. + mask_length : int + Number of elements in the mask to interpret. + first_byte_offset : int, default: 0 + Number of elements to offset from the start of the first byte. + Returns + ------- + np.ndarray[bool] + """ + pass + + +def set_nulls( + data: np.ndarray | pd.Series, + col: Column, + validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None, + allow_modify_inplace: bool = True, +): + """ + Set null values for the data according to the column null kind. + Parameters + ---------- + data : np.ndarray or pd.Series + Data to set nulls in. + col : Column + Column object that describes the `data`. + validity : tuple(Buffer, dtype) or None + The return value of ``col.buffers()``. We do not access the ``col.buffers()`` + here to not take the ownership of the memory of buffer objects. + allow_modify_inplace : bool, default: True + Whether to modify the `data` inplace when zero-copy is possible (True) or always + modify a copy of the `data` (False). + Returns + ------- + np.ndarray or pd.Series + Data with the nulls being set. + """ + pass From 010d9a8eab42d1709880b55f5a2c38bd77f85bbb Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 26 Oct 2022 11:44:53 +0200 Subject: [PATCH 02/29] Add a dataframe (PyArrowTableXchg) class methods --- python/pyarrow/interchange/dataframe.py | 64 +++++++++++++++++-------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index c14ae49758a..fef614e27ec 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -15,11 +15,24 @@ # specific language governing permissions and limitations # under the License. +import chunk +from typing import ( + Any, + Dict, + Iterable, + Optional, + Sequence, + Tuple, + TypedDict, +) + import pyarrow as pa +import warnings from column import PandasColumn from dataframe_protocol import DataFrame as DataFrameXchg + class PyArrowTableXchg(DataFrameXchg): """ A data frame class, with only the methods required by the interchange @@ -50,40 +63,53 @@ def __dataframe__( return PyArrowTableXchg(self._df, nan_as_null, allow_copy) @property - def metadata(self) -> dict[str, Index]: - # `index` isn't a regular column, and the protocol doesn't support row - # labels - so we export it as Pandas-specific metadata here. 
- pass + def metadata(self) -> dict[str, Any]: + # The metadata for the data frame, as a dictionary with string keys. + # Add schema metadata here (pandas metadata, ot custom metadata) + schema_metadata = {k.decode('utf8'): v.decode('utf8') for k, v in self.schema.metadata.items()} + return schema_metadata def num_columns(self) -> int: - pass + return self.num_columns def num_rows(self) -> int: - pass + return self.num_rows def num_chunks(self) -> int: - pass + return self.column(0).num_chunks - def column_names(self) -> Index: - pass + def column_names(self) -> Iterable[str]: + return self.column_names def get_column(self, i: int) -> PyArrowColumn: - pass + return self.column(i) def get_column_by_name(self, name: str) -> PyArrowColumn: - pass + return self.column(name) - def get_columns(self) -> list[PyArrowColumn]: - pass + def get_columns(self) -> Iterable[PyArrowColumn]: + return self.columns - def select_columns(self, indices) -> PyArrowTableFrameXchg: - pass + def select_columns(self, indices: Sequence[int]) -> PyArrowTableFrameXchg: + return self.select(indices) - def select_columns_by_name(self, names) -> PyArrowTableFrameXchg: - pass + def select_columns_by_name(self, names: Sequence[str]) -> PyArrowTableFrameXchg: + return self.select(names) - def get_chunks(self, n_chunks=None): + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[PyArrowTableFrameXchg]: """ Return an iterator yielding the chunks. """ - pass \ No newline at end of file + if n_chunks: + if n_chunks%self.num_chunks == 0: + chunk_size = self.num_rows//n_chunks + if self.num_rows%n_chunks != 0: + warnings.warn("Converting dataframe into smaller chunks") + batches = self.to_batches(max_chunksize = chunk_size) + else: + warnings.warn("``n_chunks`` must be a multiple of ``self.num_chunks()``") + else: + batches = self.to_batches() + + iterator_tables = [pa.Table.from_batches([batch]) for batch in batches] + return iterator_tables From c0af309ae109c4050a7c6beffdd725ffd3035091 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 26 Oct 2022 15:35:38 +0200 Subject: [PATCH 03/29] Add a subpackage for testing interchange protocol, add a test for Table.__dataframe__ and do some minor corrections --- python/pyarrow/interchange/__init__.py | 2 +- python/pyarrow/interchange/buffer.py | 23 +++---- python/pyarrow/interchange/column.py | 63 ++++++++++--------- python/pyarrow/interchange/dataframe.py | 60 ++++++++++-------- .../pyarrow/interchange/dataframe_protocol.py | 12 ++-- python/pyarrow/interchange/from_dataframe.py | 31 +++++---- python/pyarrow/table.pxi | 30 +++++++++ python/pyarrow/tests/interchange/__init__.py | 16 +++++ .../interchange/test_interchange_spec.py | 34 ++++++++++ 9 files changed, 185 insertions(+), 86 deletions(-) create mode 100644 python/pyarrow/tests/interchange/__init__.py create mode 100644 python/pyarrow/tests/interchange/test_interchange_spec.py diff --git a/python/pyarrow/interchange/__init__.py b/python/pyarrow/interchange/__init__.py index d216be4ddc9..13a83393a91 100644 --- a/python/pyarrow/interchange/__init__.py +++ b/python/pyarrow/interchange/__init__.py @@ -13,4 +13,4 @@ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations -# under the License. \ No newline at end of file +# under the License. 
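For orientation, the following is a consumer-side sketch (not part of the diff) of how the ``Table.__dataframe__`` entry point added by this patch series might be exercised once the series is applied; it mirrors the spec test introduced later in this patch, and the table contents are purely illustrative.

import pyarrow as pa

# Build a small table and hand it to the interchange protocol
# (assumes this patch series is applied, so Table has __dataframe__).
table = pa.table({"n_legs": [2, 2, 4, 4, 5, 100],
                  "animals": ["Flamingo", "Parrot", "Cow",
                              "Horse", "Brittle stars", "Centipede"]})
xchg = table.__dataframe__()            # interchange object (TableXchg)

print(xchg.num_columns())               # 2
print(xchg.num_rows())                  # 6
print(list(xchg.column_names()))        # ['n_legs', 'animals']

# Column selection returns another interchange object.
animals = xchg.select_columns_by_name(["animals"])
print(list(animals.column_names()))     # ['animals']
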
diff --git a/python/pyarrow/interchange/buffer.py b/python/pyarrow/interchange/buffer.py index 06803aaf299..07b21190039 100644 --- a/python/pyarrow/interchange/buffer.py +++ b/python/pyarrow/interchange/buffer.py @@ -15,12 +15,13 @@ # specific language governing permissions and limitations # under the License. -from dataframe_protocol import ( +from pyarrow.interchange.dataframe_protocol import ( Buffer, DlpackDeviceType, ) import numpy as np + class PyArrowBuffer(Buffer): """ Data in the buffer is guaranteed to be contiguous in memory. @@ -52,21 +53,21 @@ def __dlpack__(self): """ pass - def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: - """ - Device type and device ID for where the data in the buffer resides. - """ - pass + # def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + # """ + # Device type and device ID for where the data in the buffer resides. + # """ + # pass def __repr__(self) -> str: return ( - "PyArrowBuffer(" - + str( + "PyArrowBuffer(" + + str( { "bufsize": self.bufsize, "ptr": self.ptr, "device": self.__dlpack_device__()[0].name, } - ) - + ")" - ) \ No newline at end of file + ) + + ")" + ) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index eb63f014187..d19a7f7f725 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -15,17 +15,18 @@ # specific language governing permissions and limitations # under the License. -from typing import Any +from typing import (Dict, Any) import pyarrow as pa -from buffer import PyArrowBuffer -from dataframe_protocol import ( +from pyarrow.interchange.buffer import PyArrowBuffer +from pyarrow.interchange.dataframe_protocol import ( Column, ColumnBuffers, DtypeKind, ) + class PyArrowColumn(Column): """ A column object, with only the methods and properties required by the @@ -61,19 +62,20 @@ def offset(self) -> int: @property def dtype(self) -> tuple[DtypeKind, int, str, str]: """ - Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. Bit-width : the number of bits as an integer - Format string : data type description format string in Apache Arrow C - Data Interface format. + Format string : data type description format string in Apache Arrow + C Data Interface format. Endianness : current only native endianness (``=``) is supported Notes: - - Kind specifiers are aligned with DLPack where possible (hence the - jump to 20, leave enough room for future extension) - - Masks must be specified as boolean with either bit width 1 (for bit - masks) or 8 (for byte masks). + - Kind specifiers are aligned with DLPack where possible (hence + the jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 + (for bit masks) or 8 (for byte masks). - Dtype width in bits was preferred over bytes - - Endianness isn't too useful, but included now in case in the future - we need to support non-native endianness + - Endianness isn't too useful, but included now in case in the + future we need to support non-native endianness - Went with Apache Arrow format strings over NumPy format strings because they're more complete from a dataframe perspective - Format strings are mostly useful for datetime specification, and @@ -82,8 +84,8 @@ def dtype(self) -> tuple[DtypeKind, int, str, str]: categorical in the data buffer. In case of a separate encoding of the categorical (e.g. 
an integer to string mapping), this can be derived from ``self.describe_categorical``. - - Data types not included: complex, Arrow-style null, binary, decimal, - and nested (list, struct, map, union) dtypes. + - Data types not included: complex, Arrow-style null, binary, + decimal, and nested (list, struct, map, union) dtypes. """ pass @@ -92,8 +94,9 @@ def _dtype_from_arrowdtype(self, dtype) -> tuple[DtypeKind, int, str, str]: See `self.dtype` for details. """ # Note: 'c' (complex) not handled yet (not in array spec v1). - # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled - # datetime and timedelta both map to datetime (is timedelta handled?) + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not + # handled datetime and timedelta both map to datetime + # (is timedelta handled?) pass @@ -102,16 +105,18 @@ def describe_categorical(self): """ If the dtype is categorical, there are two options: - There are only values in the data buffer. - - There is a separate non-categorical Column encoding for categorical values. + - There is a separate non-categorical Column encoding for categorical + values. Raises TypeError if the dtype is not categorical Content of returned dict: - - "is_ordered" : bool, whether the ordering of dictionary indices is - semantically meaningful. + - "is_ordered" : bool, whether the ordering of dictionary indices + is semantically meaningful. - "is_dictionary" : bool, whether a dictionary-style mapping of categorical values to other objects exists - - "categories" : Column representing the (implicit) mapping of indices to - category values (e.g. an array of cat1, cat2, ...). - None if not a dictionary-style categorical. + - "categories" : Column representing the (implicit) mapping of + indices to category values (e.g. an array of + cat1, cat2, ...). None if not a dictionary-style + categorical. """ pass @@ -127,7 +132,7 @@ def null_count(self) -> int: pass @property - def metadata(self) -> dict[str, pd.Index]: + def metadata(self) -> Dict[str, Any]: """ Store specific metadata of the column. """ @@ -139,12 +144,12 @@ def num_chunks(self) -> int: """ pass - def get_chunks(self, n_chunks: int | None = None): - """ - Return an iterator yielding the chunks. - See `DataFrame.get_chunks` for details on ``n_chunks``. - """ - pass + # def get_chunks(self, n_chunks: int | None = None): + # """ + # Return an iterator yielding the chunks. + # See `DataFrame.get_chunks` for details on ``n_chunks``. + # """ + # pass def get_buffers(self) -> ColumnBuffers: """ diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index fef614e27ec..a4dafceae1f 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -import chunk +from __future__ import annotations from typing import ( Any, Dict, @@ -29,11 +29,11 @@ import pyarrow as pa import warnings -from column import PandasColumn -from dataframe_protocol import DataFrame as DataFrameXchg +from pyarrow.interchange.column import PyArrowColumn +from pyarrow.interchange.dataframe_protocol import DataFrame as DataFrameXchg -class PyArrowTableXchg(DataFrameXchg): +class TableXchg(DataFrameXchg): """ A data frame class, with only the methods required by the interchange protocol defined. @@ -47,7 +47,7 @@ def __init__( ) -> None: """ Constructor - an instance of this (private) class is returned from - `pd.DataFrame.__dataframe__`. 
+ `pa.Table.__dataframe__`. """ self._df = df # ``nan_as_null`` is a keyword intended for the consumer to tell the @@ -59,57 +59,63 @@ def __init__( def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True - ) -> PyArrowTableXchg: - return PyArrowTableXchg(self._df, nan_as_null, allow_copy) + ) -> TableXchg: + return TableXchg(self._df, nan_as_null, allow_copy) @property def metadata(self) -> dict[str, Any]: # The metadata for the data frame, as a dictionary with string keys. # Add schema metadata here (pandas metadata, ot custom metadata) - schema_metadata = {k.decode('utf8'): v.decode('utf8') for k, v in self.schema.metadata.items()} + schema_metadata = {k.decode('utf8'): v.decode('utf8') + for k, v in self._df.schema.metadata.items()} return schema_metadata def num_columns(self) -> int: - return self.num_columns + return self._df.num_columns def num_rows(self) -> int: - return self.num_rows + return self._df.num_rows def num_chunks(self) -> int: - return self.column(0).num_chunks + return self._df.column(0).num_chunks def column_names(self) -> Iterable[str]: - return self.column_names + return self._df.column_names def get_column(self, i: int) -> PyArrowColumn: - return self.column(i) + return self._df.column(i) def get_column_by_name(self, name: str) -> PyArrowColumn: - return self.column(name) + return self._df.column(name) def get_columns(self) -> Iterable[PyArrowColumn]: - return self.columns + return self._df.columns - def select_columns(self, indices: Sequence[int]) -> PyArrowTableFrameXchg: - return self.select(indices) + def select_columns(self, indices: Sequence[int]) -> TableXchg: + return TableXchg( + self._df.select(indices), self._nan_as_null, self._allow_copy + ) - def select_columns_by_name(self, names: Sequence[str]) -> PyArrowTableFrameXchg: - return self.select(names) + def select_columns_by_name(self, names: Sequence[str]) -> TableXchg: + return TableXchg( + self._df.select(names), self._nan_as_null, self._allow_copy + ) - def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[PyArrowTableFrameXchg]: + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[TableXchg]: """ Return an iterator yielding the chunks. 
""" if n_chunks: - if n_chunks%self.num_chunks == 0: - chunk_size = self.num_rows//n_chunks - if self.num_rows%n_chunks != 0: + if n_chunks % self._df.num_chunks == 0: + chunk_size = self._df.num_rows // n_chunks + if self.num_rows %n_chunks != 0: warnings.warn("Converting dataframe into smaller chunks") - batches = self.to_batches(max_chunksize = chunk_size) + batches = self._df.to_batches(max_chunksize=chunk_size) else: - warnings.warn("``n_chunks`` must be a multiple of ``self.num_chunks()``") + warnings.warn( + "``n_chunks`` must be a multiple of ``self.num_chunks()``") else: - batches = self.to_batches() - + batches = self._df.to_batches() + iterator_tables = [pa.Table.from_batches([batch]) for batch in batches] return iterator_tables diff --git a/python/pyarrow/interchange/dataframe_protocol.py b/python/pyarrow/interchange/dataframe_protocol.py index f12ae4e7ae8..de30ede8d63 100644 --- a/python/pyarrow/interchange/dataframe_protocol.py +++ b/python/pyarrow/interchange/dataframe_protocol.py @@ -125,11 +125,12 @@ class ColumnBuffers(TypedDict): class CategoricalDescription(TypedDict): # whether the ordering of dictionary indices is semantically meaningful is_ordered: bool - # whether a dictionary-style mapping of categorical values to other objects exists + # whether a dictionary-style mapping of categorical values to other objects + # exists is_dictionary: bool # Python-level only (e.g. ``{int: str}``). # None if not a dictionary-style categorical. - categories: Optional[Column] + # categories: Optional[Column] class Buffer(ABC): @@ -246,7 +247,8 @@ def offset(self) -> int: @abstractmethod def dtype(self) -> Dtype: """ - Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. Bit-width : the number of bits as an integer Format string : data type description format string in Apache Arrow C Data Interface format. @@ -285,8 +287,8 @@ def describe_categorical(self) -> CategoricalDescription: semantically meaningful. - "is_dictionary" : bool, whether a mapping of categorical values to other objects exists - - "categories" : Column representing the (implicit) mapping of indices to - category values (e.g. an array of cat1, cat2, ...). + - "categories" : Column representing the (implicit) mapping of indices + to category values (e.g. an array of cat1, cat2, ...). None if not a dictionary-style categorical. TBD: are there any other in-memory representations that are needed? """ diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index a0d5179a053..884c2c2c3f8 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. -from column import PyArrowColumn -from dataframe_protocol import ( +from pyarrow.interchange.column import PyArrowColumn +from pyarrow.interchange.dataframe_protocol import ( Buffer, Column, ColumnNullType, @@ -28,11 +28,14 @@ def from_dataframe(df, allow_copy=True) -> pd.DataFrame: """ - Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol. + Build a ``pd.DataFrame`` from any DataFrame supporting the interchange + protocol. + Parameters ---------- df : DataFrameXchg - Object supporting the interchange protocol, i.e. `__dataframe__` method. + Object supporting the interchange protocol, i.e. `__dataframe__` + method. 
allow_copy : bool, default: True Whether to allow copying the memory to perform the conversion (if false then zero-copy approach is requested). @@ -55,7 +58,8 @@ def _from_dataframe(df: DataFrameXchg, allow_copy=True): Parameters ---------- df : DataFrameXchg - Object supporting the interchange protocol, i.e. `__dataframe__` method. + Object supporting the interchange protocol, i.e. `__dataframe__` + method. allow_copy : bool, default: True Whether to allow copying the memory to perform the conversion (if false then zero-copy approach is requested). @@ -76,8 +80,8 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: ------- pd.DataFrame """ - # We need a dict of columns here, with each column being a NumPy array (at - # least for now, deal with non-NumPy dtypes later). + # We need a dict of columns here, with each column being a NumPy array + # (at least for now, deal with non-NumPy dtypes later). columns: dict[str, Any] = {} buffers = [] # hold on to buffers, keeps memory alive for name in df.column_names(): @@ -198,8 +202,8 @@ def buffer_to_ndarray( np.ndarray Notes ----- - The returned array doesn't own the memory. The caller of this function is - responsible for keeping the memory owner object alive as long as + The returned array doesn't own the memory. The caller of this function + is responsible for keeping the memory owner object alive as long as the returned NumPy array is being used. """ pass @@ -240,11 +244,12 @@ def set_nulls( col : Column Column object that describes the `data`. validity : tuple(Buffer, dtype) or None - The return value of ``col.buffers()``. We do not access the ``col.buffers()`` - here to not take the ownership of the memory of buffer objects. + The return value of ``col.buffers()``. We do not access the + ``col.buffers()`` here to not take the ownership of the memory + of buffer objects. allow_modify_inplace : bool, default: True - Whether to modify the `data` inplace when zero-copy is possible (True) or always - modify a copy of the `data` (False). + Whether to modify the `data` inplace when zero-copy is possible + (True) or always modify a copy of the `data` (False). Returns ------- np.ndarray or pd.Series diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 5c58ae61f19..25f498cb0dd 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -2809,6 +2809,36 @@ cdef class Table(_PandasConvertible): return self.column(key) + # ---------------------------------------------------------------------- + def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): + """ + Return the dataframe interchange object implementing the interchange protocol. + Parameters + ---------- + nan_as_null : bool, default False + Whether to tell the DataFrame to overwrite null values in the data + with ``NaN`` (or ``NaT``). + allow_copy : bool, default True + Whether to allow memory copying when exporting. If set to False + it would cause non-zero-copy exports to fail. + Returns + ------- + DataFrame interchange object + The object which consuming library can use to ingress the dataframe. + Notes + ----- + Details on the interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. 
+ """ + + from pyarrow.interchange.dataframe import TableXchg + + return TableXchg(self, nan_as_null, allow_copy) + + # ---------------------------------------------------------------------- + def slice(self, offset=0, length=None): """ Compute zero-copy slice of this Table. diff --git a/python/pyarrow/tests/interchange/__init__.py b/python/pyarrow/tests/interchange/__init__.py new file mode 100644 index 00000000000..13a83393a91 --- /dev/null +++ b/python/pyarrow/tests/interchange/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py new file mode 100644 index 00000000000..49bc8e65a44 --- /dev/null +++ b/python/pyarrow/tests/interchange/test_interchange_spec.py @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import pyarrow as pa + + +def test_dataframe(): + n = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + a = pa.chunked_array([["Flamingo", "Parrot", "Cow"], + ["Horse", "Brittle stars", "Centipede"]]) + table = pa.Table.from_arrays([n, a], names=['n_legs', 'animals']) + df = table.__dataframe__() + + assert df.num_columns() == 2 + assert df.num_rows() == 6 + assert df.num_chunks() == 2 + assert list(df.column_names()) == ['n_legs', 'animals'] + assert list(df.select_columns([1]).column_names()) == list( + df.select_columns_by_name(["animals"]).column_names() + ) From 842ba3e1a26dff479629000c5e350fbd41acd8ec Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Mon, 7 Nov 2022 11:29:16 +0100 Subject: [PATCH 04/29] Add column (PyArrowColumn) class methods --- python/pyarrow/interchange/column.py | 152 ++++++++++++++---- python/pyarrow/interchange/dataframe.py | 13 +- .../interchange/test_interchange_spec.py | 27 +++- 3 files changed, 155 insertions(+), 37 deletions(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index d19a7f7f725..40e535e90e6 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -from typing import (Dict, Any) +from typing import (Dict, Tuple, Any) import pyarrow as pa @@ -23,10 +23,49 @@ from pyarrow.interchange.dataframe_protocol import ( Column, ColumnBuffers, + ColumnNullType, DtypeKind, ) +_PYARROW_KINDS = { + pa.int8(): (DtypeKind.INT, "c"), + pa.int16(): (DtypeKind.INT, "s"), + pa.int32(): (DtypeKind.INT, "i"), + pa.int64(): (DtypeKind.INT, "l"), + pa.uint8(): (DtypeKind.UINT, "C"), + pa.uint16(): (DtypeKind.UINT, "S"), + pa.uint32(): (DtypeKind.UINT, "I"), + pa.uint64(): (DtypeKind.UINT, "L"), + pa.float16(): (DtypeKind.FLOAT, "e"), + pa.float32(): (DtypeKind.FLOAT, "f"), + pa.float64(): (DtypeKind.FLOAT, "g"), + pa.bool_(): (DtypeKind.BOOL, "b"), + pa.string(): (DtypeKind.STRING, "u"), # utf-8 + pa.large_string(): (DtypeKind.STRING, "U"), + # Resoulution: + # - seconds -> 's' + # - milliseconds -> 'm' + # - microseconds -> 'u' + # - nanoseconds -> 'n' + pa.timestamp(): (DtypeKind.DATETIME, "ts{resolution}:{tz}"), + pa.dictionary(): (DtypeKind.CATEGORICAL, "L") +} + + +class Endianness: + """Enum indicating the byte-order of a data-type.""" + + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + + +class NoBufferPresent(Exception): + """Exception to signal that there is no requested buffer.""" + + class PyArrowColumn(Column): """ A column object, with only the methods and properties required by the @@ -44,20 +83,22 @@ def __init__(self, column: pa.Array, allow_copy: bool = True) -> None: Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. """ - pass + # Store the column as a private attribute + self._col = column + self._allow_copy = allow_copy def size(self) -> int: """ Size of the column, in elements. """ - pass + return self._col.to_numpy().size @property def offset(self) -> int: """ - Offset of first element. Always zero. + Offset of first element. """ - pass + return self._col.offset @property def dtype(self) -> tuple[DtypeKind, int, str, str]: @@ -87,18 +128,13 @@ def dtype(self) -> tuple[DtypeKind, int, str, str]: - Data types not included: complex, Arrow-style null, binary, decimal, and nested (list, struct, map, union) dtypes. 
""" - pass + dtype = self._col.type + kind, f_string = _PYARROW_KINDS.get(dtype, (None, None)) + if kind is None: + raise ValueError(f"Data type {dtype} not supported by interchange protocol") + bit_width = self._col.nbytes * 8 - def _dtype_from_arrowdtype(self, dtype) -> tuple[DtypeKind, int, str, str]: - """ - See `self.dtype` for details. - """ - # Note: 'c' (complex) not handled yet (not in array spec v1). - # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not - # handled datetime and timedelta both map to datetime - # (is timedelta handled?) - - pass + return kind, bit_width, f_string, Endianness.NATIVE @property def describe_categorical(self): @@ -118,18 +154,27 @@ def describe_categorical(self): cat1, cat2, ...). None if not a dictionary-style categorical. """ - pass + if pa.types.is_dictionary(self._col.type): + raise TypeError( + "describe_categorical only works on a column with categorical dtype!" + ) + + return { + "is_ordered": True, + "is_dictionary": True, + "categories": PyArrowColumn(self._col.dictionary), + } @property def describe_null(self): - pass + return ColumnNullType.USE_BYTEMASK, 0 @property def null_count(self) -> int: """ Number of null elements. Should always be known. """ - pass + return self._col.null_count @property def metadata(self) -> Dict[str, Any]: @@ -142,14 +187,24 @@ def num_chunks(self) -> int: """ Return the number of chunks the column consists of. """ - pass + return 1 - # def get_chunks(self, n_chunks: int | None = None): - # """ - # Return an iterator yielding the chunks. - # See `DataFrame.get_chunks` for details on ``n_chunks``. - # """ - # pass + def get_chunks(self, n_chunks: int | None = None): + """ + Return an iterator yielding the chunks. + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + if n_chunks and n_chunks > 1: + size = self.size() + step = size // n_chunks + if size % n_chunks != 0: + step += 1 + for start in range(0, step * n_chunks, step): + yield PyArrowColumn( + self._col.slice(start,step), self._allow_copy + ) + else: + yield self def get_buffers(self) -> ColumnBuffers: """ @@ -170,7 +225,23 @@ def get_buffers(self) -> ColumnBuffers: if the data buffer does not have an associated offsets buffer. """ - pass + buffers: ColumnBuffers = { + "data": self._get_data_buffer(), + "validity": None, + "offsets": None, + } + + try: + buffers["validity"] = self._get_validity_buffer() + except NoBufferPresent: + pass + + try: + buffers["offsets"] = self._get_offsets_buffer() + except NoBufferPresent: + pass + + return buffers def _get_data_buffer( self, @@ -178,7 +249,11 @@ def _get_data_buffer( """ Return the buffer containing the data and the buffer's associated dtype. """ - pass + len = len(self._col.buffers()) + if len == 2: + return PyArrowBuffer(self._col.buffers()[1]), self.dtype + elif len == 3: + return PyArrowBuffer(self._col.buffers()[2]), self.dtype def _get_validity_buffer(self) -> tuple[PyArrowBuffer, Any]: """ @@ -186,7 +261,15 @@ def _get_validity_buffer(self) -> tuple[PyArrowBuffer, Any]: the buffer's associated dtype. Raises NoBufferPresent if null representation is not a bit or byte mask. 
""" - pass + # Define the dtype of the returned buffer + dtype = (DtypeKind.BOOL, 8, "b", Endianness.NATIVE) + buff = self._col.buffers()[0] + if buff: + return PyArrowBuffer(buff), dtype + + raise NoBufferPresent( + "There are no missing values so " + "does not have a separate mask") def _get_offsets_buffer(self) -> tuple[PyArrowBuffer, Any]: """ @@ -195,4 +278,13 @@ def _get_offsets_buffer(self) -> tuple[PyArrowBuffer, Any]: Raises NoBufferPresent if the data buffer does not have an associated offsets buffer. """ - pass + len = len(self._col.buffers()) + if len == 2: + raise NoBufferPresent( + "This column has a fixed-length dtype so " + "it does not have an offsets buffer" + ) + elif len == 3: + # Define the dtype of the returned buffer + dtype = (DtypeKind.INT, 64, "L", Endianness.NATIVE) + return PyArrowBuffer(self._col.buffers()[2]), dtype diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index a4dafceae1f..43e4f03c6b7 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -83,22 +83,25 @@ def column_names(self) -> Iterable[str]: return self._df.column_names def get_column(self, i: int) -> PyArrowColumn: - return self._df.column(i) + return PyArrowColumn(self._df.column(i), + allow_copy=self._allow_copy) def get_column_by_name(self, name: str) -> PyArrowColumn: - return self._df.column(name) + return PyArrowColumn(self._df.column(name), + allow_copy=self._allow_copy) def get_columns(self) -> Iterable[PyArrowColumn]: - return self._df.columns + return PyArrowColumn(self._df.columns, + allow_copy=self._allow_copy) def select_columns(self, indices: Sequence[int]) -> TableXchg: return TableXchg( - self._df.select(indices), self._nan_as_null, self._allow_copy + self._df.select(list(indices)), self._nan_as_null, self._allow_copy ) def select_columns_by_name(self, names: Sequence[str]) -> TableXchg: return TableXchg( - self._df.select(names), self._nan_as_null, self._allow_copy + self._df.select(list(names)), self._nan_as_null, self._allow_copy ) def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[TableXchg]: diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py index 49bc8e65a44..1dd971067a5 100644 --- a/python/pyarrow/tests/interchange/test_interchange_spec.py +++ b/python/pyarrow/tests/interchange/test_interchange_spec.py @@ -16,6 +16,29 @@ # under the License. 
import pyarrow as pa +import pytest + +@pytest.mark.parametrize( + "test_data", + [ + {"a": ["foo", "bar"], "b": ["baz", "qux"]}, + {"a": [1.5, 2.5, 3.5], "b": [9.2, 10.5, 11.8]}, + {"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, + ], + ids=["str_data", "float_data", "int_data"], +) +def test_only_one_dtype(test_data): + columns = list(test_data.keys()) + table = pa.Table.from_pylist([test_data]) + df = table.__dataframe__() + + column_size = len(test_data[columns[0]]) + for column in columns: + null_count = df.get_column_by_name(column).null_count + assert null_count == 0 + assert isinstance(null_count, int) + assert df.get_column_by_name(column).size() == column_size + assert df.get_column_by_name(column).offset == 0 def test_dataframe(): @@ -29,6 +52,6 @@ def test_dataframe(): assert df.num_rows() == 6 assert df.num_chunks() == 2 assert list(df.column_names()) == ['n_legs', 'animals'] - assert list(df.select_columns([1]).column_names()) == list( - df.select_columns_by_name(["animals"]).column_names() + assert list(df.select_columns((1,)).column_names()) == list( + df.select_columns_by_name(("animals",)).column_names() ) From 61eb00fb4c3a686dc7176b1e6434aa143965e9e7 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 8 Nov 2022 12:02:01 +0100 Subject: [PATCH 05/29] Add buffer (PyArrowBuffer) class methods, some changes and main tests --- python/pyarrow/interchange/buffer.py | 29 ++-- python/pyarrow/interchange/column.py | 145 +++++++++++++----- python/pyarrow/interchange/dataframe.py | 26 +++- .../interchange/test_interchange_spec.py | 126 ++++++++++++++- 4 files changed, 262 insertions(+), 64 deletions(-) diff --git a/python/pyarrow/interchange/buffer.py b/python/pyarrow/interchange/buffer.py index 07b21190039..ff0caf85dbb 100644 --- a/python/pyarrow/interchange/buffer.py +++ b/python/pyarrow/interchange/buffer.py @@ -15,11 +15,15 @@ # specific language governing permissions and limitations # under the License. +from __future__ import annotations + +import numpy as np +import pyarrow as pa + from pyarrow.interchange.dataframe_protocol import ( Buffer, DlpackDeviceType, ) -import numpy as np class PyArrowBuffer(Buffer): @@ -27,37 +31,40 @@ class PyArrowBuffer(Buffer): Data in the buffer is guaranteed to be contiguous in memory. """ - def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: + def __init__(self, x: pa.Buffer, allow_copy: bool = True) -> None: """ Handle only regular columns (= numpy arrays) for now. """ - pass + self._x = x @property def bufsize(self) -> int: """ Buffer size in bytes. """ - pass + return self._x.size @property def ptr(self) -> int: """ Pointer to start of the buffer as an integer. """ - pass + return self._x.address def __dlpack__(self): """ Represent this structure as DLPack interface. """ - pass + raise NotImplementedError("__dlpack__") - # def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: - # """ - # Device type and device ID for where the data in the buffer resides. - # """ - # pass + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides. 
+ """ + if self._x.is_cpu: + return (DlpackDeviceType.CPU, None) + else: + raise NotImplementedError("__dlpack_device__") def __repr__(self) -> str: return ( diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 40e535e90e6..37bba874a82 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -15,18 +15,15 @@ # specific language governing permissions and limitations # under the License. -from typing import (Dict, Tuple, Any) +from __future__ import annotations -import pyarrow as pa +import warnings +from typing import Any, Dict, Iterable, Tuple +import pyarrow as pa from pyarrow.interchange.buffer import PyArrowBuffer -from pyarrow.interchange.dataframe_protocol import ( - Column, - ColumnBuffers, - ColumnNullType, - DtypeKind, -) - +from pyarrow.interchange.dataframe_protocol import (Column, ColumnBuffers, + ColumnNullType, DtypeKind) _PYARROW_KINDS = { pa.int8(): (DtypeKind.INT, "c"), @@ -43,13 +40,6 @@ pa.bool_(): (DtypeKind.BOOL, "b"), pa.string(): (DtypeKind.STRING, "u"), # utf-8 pa.large_string(): (DtypeKind.STRING, "U"), - # Resoulution: - # - seconds -> 's' - # - milliseconds -> 'm' - # - microseconds -> 'u' - # - nanoseconds -> 'n' - pa.timestamp(): (DtypeKind.DATETIME, "ts{resolution}:{tz}"), - pa.dictionary(): (DtypeKind.CATEGORICAL, "L") } @@ -78,7 +68,7 @@ class PyArrowColumn(Column): doesn't need its own version or ``__column__`` protocol. """ - def __init__(self, column: pa.Array, allow_copy: bool = True) -> None: + def __init__(self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True) -> None: """ Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. @@ -91,14 +81,18 @@ def size(self) -> int: """ Size of the column, in elements. """ - return self._col.to_numpy().size + if isinstance(self._col, pa.Array): + len = self._col.to_numpy().size + else: + len = self._col.length() + return len @property def offset(self) -> int: """ Offset of first element. """ - return self._col.offset + return 0 @property def dtype(self) -> tuple[DtypeKind, int, str, str]: @@ -129,10 +123,34 @@ def dtype(self) -> tuple[DtypeKind, int, str, str]: decimal, and nested (list, struct, map, union) dtypes. """ dtype = self._col.type + try: + bit_width = dtype.bit_width + except: # in case of a non-fixed width type (string) + bit_width = 8 + + if pa.types.is_timestamp(dtype): + kind = DtypeKind.DATETIME + f_string = "ts{dtype.unit}:{dtype.tz}" + return kind, bit_width, f_string, Endianness.NATIVE + elif pa.types.is_dictionary(dtype): + kind = DtypeKind.CATEGORICAL + f_string = "L" + return kind, bit_width, f_string, Endianness.NATIVE + else: + return self._dtype_from_arrowdtype(dtype, bit_width) + + + def _dtype_from_arrowdtype(self, dtype, bit_width) -> tuple[DtypeKind, int, str, str]: + """ + See `self.dtype` for details. + """ + # Note: 'c' (complex) not handled yet (not in array spec v1). + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled + # datetime and timedelta both map to datetime (is timedelta handled?) + kind, f_string = _PYARROW_KINDS.get(dtype, (None, None)) if kind is None: raise ValueError(f"Data type {dtype} not supported by interchange protocol") - bit_width = self._col.nbytes * 8 return kind, bit_width, f_string, Endianness.NATIVE @@ -154,15 +172,20 @@ def describe_categorical(self): cat1, cat2, ...). None if not a dictionary-style categorical. 
""" - if pa.types.is_dictionary(self._col.type): + if isinstance(self._col, pa.ChunkedArray): + arr = self._col.combine_chunks() + else: + arr = self._col + + if not pa.types.is_dictionary(arr.type): raise TypeError( "describe_categorical only works on a column with categorical dtype!" ) return { - "is_ordered": True, + "is_ordered": self._col.type.ordered, "is_dictionary": True, - "categories": PyArrowColumn(self._col.dictionary), + "categories": PyArrowColumn(arr.dictionary), } @property @@ -187,7 +210,11 @@ def num_chunks(self) -> int: """ Return the number of chunks the column consists of. """ - return 1 + if isinstance(self._col, pa.Array): + n_chunks = 1 + else: + n_chunks = self._col.num_chunks + return n_chunks def get_chunks(self, n_chunks: int | None = None): """ @@ -195,16 +222,37 @@ def get_chunks(self, n_chunks: int | None = None): See `DataFrame.get_chunks` for details on ``n_chunks``. """ if n_chunks and n_chunks > 1: - size = self.size() - step = size // n_chunks - if size % n_chunks != 0: - step += 1 - for start in range(0, step * n_chunks, step): - yield PyArrowColumn( - self._col.slice(start,step), self._allow_copy - ) + if n_chunks % self.num_chunks() == 0: + chunk_size = self.size() // n_chunks + if self.size() % n_chunks != 0: + chunk_size += 1 + + if isinstance(self._col, pa.ChunkedArray): + array = self._col.combine_chunks() + else: + array = self._col + + i = 0 + for start in range(0, chunk_size * n_chunks, chunk_size): + yield PyArrowColumn( + array.slice(start,chunk_size), self._allow_copy + ) + i +=1 + # In case when the size of the chunk is such that the resulting + # list is one less chunk then n_chunks -> append an empty chunk + if i == n_chunks - 1: + yield PyArrowColumn(pa.array([]), self._allow_copy) + else: + warnings.warn( + "``n_chunks`` must be a multiple of ``self.num_chunks()``") + elif isinstance(self._col, pa.ChunkedArray): + return [ + PyArrowColumn(chunk, self._allow_copy) + for chunk in self._col.chunks + ] else: yield self + def get_buffers(self) -> ColumnBuffers: """ @@ -249,11 +297,16 @@ def _get_data_buffer( """ Return the buffer containing the data and the buffer's associated dtype. """ - len = len(self._col.buffers()) - if len == 2: - return PyArrowBuffer(self._col.buffers()[1]), self.dtype - elif len == 3: - return PyArrowBuffer(self._col.buffers()[2]), self.dtype + if isinstance(self._col, pa.ChunkedArray): + array = self._col.combine_chunks() + else: + array = self._col + n = len(array.buffers()) + if n == 2: + return PyArrowBuffer(array.buffers()[1]), self.dtype + elif n == 3: + return PyArrowBuffer(array.buffers()[2]), self.dtype + def _get_validity_buffer(self) -> tuple[PyArrowBuffer, Any]: """ @@ -263,7 +316,11 @@ def _get_validity_buffer(self) -> tuple[PyArrowBuffer, Any]: """ # Define the dtype of the returned buffer dtype = (DtypeKind.BOOL, 8, "b", Endianness.NATIVE) - buff = self._col.buffers()[0] + if isinstance(self._col, pa.ChunkedArray): + array = self._col.combine_chunks() + else: + array = self._col + buff = array.buffers()[0] if buff: return PyArrowBuffer(buff), dtype @@ -278,13 +335,17 @@ def _get_offsets_buffer(self) -> tuple[PyArrowBuffer, Any]: Raises NoBufferPresent if the data buffer does not have an associated offsets buffer. 
""" - len = len(self._col.buffers()) - if len == 2: + if isinstance(self._col, pa.ChunkedArray): + array = self._col.combine_chunks() + else: + array = self._col + n = len(array.buffers()) + if n == 2: raise NoBufferPresent( "This column has a fixed-length dtype so " "it does not have an offsets buffer" ) - elif len == 3: + elif n == 3: # Define the dtype of the returned buffer dtype = (DtypeKind.INT, 64, "L", Endianness.NATIVE) - return PyArrowBuffer(self._col.buffers()[2]), dtype + return PyArrowBuffer(array.buffers()[2]), dtype diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index 43e4f03c6b7..f5c7335a283 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -91,8 +91,10 @@ def get_column_by_name(self, name: str) -> PyArrowColumn: allow_copy=self._allow_copy) def get_columns(self) -> Iterable[PyArrowColumn]: - return PyArrowColumn(self._df.columns, - allow_copy=self._allow_copy) + return [ + PyArrowColumn(col, allow_copy=self._allow_copy) + for col in self._df.columns + ] def select_columns(self, indices: Sequence[int]) -> TableXchg: return TableXchg( @@ -108,17 +110,25 @@ def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[TableXchg]: """ Return an iterator yielding the chunks. """ - if n_chunks: - if n_chunks % self._df.num_chunks == 0: - chunk_size = self._df.num_rows // n_chunks - if self.num_rows %n_chunks != 0: - warnings.warn("Converting dataframe into smaller chunks") + if n_chunks and n_chunks > 1: + if n_chunks % self.num_chunks() == 0: + chunk_size = self.num_rows() // n_chunks + if self.num_rows() % n_chunks != 0: + chunk_size += 1 batches = self._df.to_batches(max_chunksize=chunk_size) + # In case when the size of the chunk is such that the resulting + # list is one less chunk then n_chunks -> append an empty chunk + if len(batches) == n_chunks - 1: + batches.append(pa.record_batch([])) else: warnings.warn( "``n_chunks`` must be a multiple of ``self.num_chunks()``") else: batches = self._df.to_batches() - iterator_tables = [pa.Table.from_batches([batch]) for batch in batches] + iterator_tables = [TableXchg( + pa.Table.from_batches([batch]), self._nan_as_null, self._allow_copy + ) + for batch in batches + ] return iterator_tables diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py index 1dd971067a5..97e9302c700 100644 --- a/python/pyarrow/tests/interchange/test_interchange_spec.py +++ b/python/pyarrow/tests/interchange/test_interchange_spec.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. 
+import ctypes + import pyarrow as pa import pytest @@ -25,11 +27,10 @@ {"a": [1.5, 2.5, 3.5], "b": [9.2, 10.5, 11.8]}, {"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, ], - ids=["str_data", "float_data", "int_data"], ) def test_only_one_dtype(test_data): columns = list(test_data.keys()) - table = pa.Table.from_pylist([test_data]) + table = pa.table(test_data) df = table.__dataframe__() column_size = len(test_data[columns[0]]) @@ -41,11 +42,69 @@ def test_only_one_dtype(test_data): assert df.get_column_by_name(column).offset == 0 +def test_mixed_dtypes(): + table = pa.table( + { + "a": [1, 2, 3], # dtype kind INT = 0 + "b": [3, 4, 5], # dtype kind INT = 0 + "c": [1.5, 2.5, 3.5], # dtype kind FLOAT = 2 + "d": [9, 10, 11], # dtype kind INT = 0 + "e": [True, False, True], # dtype kind BOOLEAN = 20 + "f": ["a", "", "c"], # dtype kind STRING = 21 + } + ) + df = table.__dataframe__() + # for meanings of dtype[0] see the spec; we cannot import the spec here as this + # file is expected to be vendored *anywhere*; + # values for dtype[0] are explained above + columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21} + + for column, kind in columns.items(): + col = df.get_column_by_name(column) + assert col.null_count == 0 + assert isinstance(col.null_count, int) + assert col.size() == 3 + assert col.offset == 0 + + assert col.dtype[0] == kind + + assert df.get_column_by_name("c").dtype[1] == 64 + + +def test_na_float(): + table = pa.table({"a": [1.0, None, 2.0]}) + df = table.__dataframe__() + col = df.get_column_by_name("a") + assert col.null_count == 1 + assert isinstance(col.null_count, int) + + +def test_noncategorical(): + table = pa.table({"a": [1, 2, 3]}) + df = table.__dataframe__() + col = df.get_column_by_name("a") + with pytest.raises(TypeError, match=".*categorical.*"): + col.describe_categorical + + +def test_categorical(): + import pyarrow as pa + arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"] + table = pa.table( + {"weekday": pa.array(arr).dictionary_encode()} + ) + + col = table.__dataframe__().get_column_by_name("weekday") + categorical = col.describe_categorical + assert isinstance(categorical["is_ordered"], bool) + assert isinstance(categorical["is_dictionary"], bool) + + def test_dataframe(): n = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) a = pa.chunked_array([["Flamingo", "Parrot", "Cow"], ["Horse", "Brittle stars", "Centipede"]]) - table = pa.Table.from_arrays([n, a], names=['n_legs', 'animals']) + table = pa.table([n, a], names=['n_legs', 'animals']) df = table.__dataframe__() assert df.num_columns() == 2 @@ -55,3 +114,64 @@ def test_dataframe(): assert list(df.select_columns((1,)).column_names()) == list( df.select_columns_by_name(("animals",)).column_names() ) + + +@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) +def test_df_get_chunks(size, n_chunks): + table = pa.table({"x": list(range(size))}) + df = table.__dataframe__() + chunks = list(df.get_chunks(n_chunks)) + assert len(chunks) == n_chunks + assert sum(chunk.num_rows() for chunk in chunks) == size + + +@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) +def test_column_get_chunks(size, n_chunks): + table = pa.table({"x": list(range(size))}) + df = table.__dataframe__() + chunks = list(df.get_column(0).get_chunks(n_chunks)) + assert len(chunks) == n_chunks + assert sum(chunk.size() for chunk in chunks) == size + + +def test_get_columns(): + table = pa.table({"a": [0, 1], "b": [2.5, 3.5]}) + df = table.__dataframe__() + for col in df.get_columns(): 
+ assert col.size() == 2 + assert col.num_chunks() == 1 + # for meanings of dtype[0] see the spec; we cannot import the spec here as this + # file is expected to be vendored *anywhere* + assert df.get_column(0).dtype[0] == 0 # INT + assert df.get_column(1).dtype[0] == 2 # FLOAT + + +def test_buffer(): + arr = [0, 1, -1] + table = pa.table({"a": arr}) + df = table.__dataframe__() + col = df.get_column(0) + buf = col.get_buffers() + + dataBuf, dataDtype = buf["data"] + + assert dataBuf.bufsize > 0 + assert dataBuf.ptr != 0 + device, _ = dataBuf.__dlpack_device__() + + # for meanings of dtype[0] see the spec; we cannot import the spec here as this + # file is expected to be vendored *anywhere* + assert dataDtype[0] == 0 # INT + + if device == 1: # CPU-only as we're going to directly read memory here + bitwidth = dataDtype[1] + ctype = { + 8: ctypes.c_int8, + 16: ctypes.c_int16, + 32: ctypes.c_int32, + 64: ctypes.c_int64, + }[bitwidth] + + for idx, truth in enumerate(arr): + val = ctype.from_address(dataBuf.ptr + idx * (bitwidth // 8)).value + assert val == truth, f"Buffer at index {idx} mismatch" From 027012d2ea56ab3ad6ab0c87ba43664c3c27eb9d Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 8 Nov 2022 12:43:39 +0100 Subject: [PATCH 06/29] Make changes to buffer, column and dataframe classes --- python/pyarrow/interchange/buffer.py | 2 +- python/pyarrow/interchange/column.py | 82 +++++++++++++------------ python/pyarrow/interchange/dataframe.py | 20 +++--- 3 files changed, 51 insertions(+), 53 deletions(-) diff --git a/python/pyarrow/interchange/buffer.py b/python/pyarrow/interchange/buffer.py index ff0caf85dbb..6c183c034ce 100644 --- a/python/pyarrow/interchange/buffer.py +++ b/python/pyarrow/interchange/buffer.py @@ -33,7 +33,7 @@ class PyArrowBuffer(Buffer): def __init__(self, x: pa.Buffer, allow_copy: bool = True) -> None: """ - Handle only regular columns (= numpy arrays) for now. + Handle PyArrow Buffers. """ self._x = x diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 37bba874a82..4c1aa2f6608 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -18,12 +18,19 @@ from __future__ import annotations import warnings -from typing import Any, Dict, Iterable, Tuple +from typing import ( + Any, + Dict, + Iterable, + Optional, + Tuple, +) import pyarrow as pa from pyarrow.interchange.buffer import PyArrowBuffer from pyarrow.interchange.dataframe_protocol import (Column, ColumnBuffers, - ColumnNullType, DtypeKind) + ColumnNullType, DtypeKind, + CategoricalDescription) _PYARROW_KINDS = { pa.int8(): (DtypeKind.INT, "c"), @@ -70,8 +77,7 @@ class PyArrowColumn(Column): def __init__(self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True) -> None: """ - Note: doesn't deal with extension arrays yet, just assume a regular - Series/ndarray for now. + Handles PyArrow Arrays and ChunkedArrays. """ # Store the column as a private attribute self._col = column @@ -95,7 +101,7 @@ def offset(self) -> int: return 0 @property - def dtype(self) -> tuple[DtypeKind, int, str, str]: + def dtype(self) -> Tuple[DtypeKind, int, str, str]: """ Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. 
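Concretely, the tuple returned by this property is built from the ``_PYARROW_KINDS`` mapping introduced earlier: an int64 column comes out as kind ``INT``, width 64, Arrow format character ``"l"`` and native (``=``) byte order. A quick sketch (string and dictionary columns are still being reworked in these commits, so only the fixed-width case is shown):

    import pyarrow as pa

    col = pa.table({"a": [1, 2, 3]}).__dataframe__().get_column_by_name("a")
    kind, bit_width, fmt, byteorder = col.dtype
    print(kind, bit_width, fmt, byteorder)   # roughly: DtypeKind.INT 64 l =
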
@@ -125,8 +131,8 @@ def dtype(self) -> tuple[DtypeKind, int, str, str]: dtype = self._col.type try: bit_width = dtype.bit_width - except: # in case of a non-fixed width type (string) - bit_width = 8 + except: # in case of a variable-length strings + bit_width = None if pa.types.is_timestamp(dtype): kind = DtypeKind.DATETIME @@ -140,7 +146,7 @@ def dtype(self) -> tuple[DtypeKind, int, str, str]: return self._dtype_from_arrowdtype(dtype, bit_width) - def _dtype_from_arrowdtype(self, dtype, bit_width) -> tuple[DtypeKind, int, str, str]: + def _dtype_from_arrowdtype(self, dtype, bit_width) -> Tuple[DtypeKind, int, str, str]: """ See `self.dtype` for details. """ @@ -155,7 +161,7 @@ def _dtype_from_arrowdtype(self, dtype, bit_width) -> tuple[DtypeKind, int, str, return kind, bit_width, f_string, Endianness.NATIVE @property - def describe_categorical(self): + def describe_categorical(self) -> CategoricalDescription: """ If the dtype is categorical, there are two options: - There are only values in the data buffer. @@ -189,7 +195,7 @@ def describe_categorical(self): } @property - def describe_null(self): + def describe_null(self) -> Tuple[ColumnNullType, Any]: return ColumnNullType.USE_BYTEMASK, 0 @property @@ -216,35 +222,31 @@ def num_chunks(self) -> int: n_chunks = self._col.num_chunks return n_chunks - def get_chunks(self, n_chunks: int | None = None): + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: """ Return an iterator yielding the chunks. See `DataFrame.get_chunks` for details on ``n_chunks``. """ if n_chunks and n_chunks > 1: - if n_chunks % self.num_chunks() == 0: - chunk_size = self.size() // n_chunks - if self.size() % n_chunks != 0: - chunk_size += 1 - - if isinstance(self._col, pa.ChunkedArray): - array = self._col.combine_chunks() - else: - array = self._col - - i = 0 - for start in range(0, chunk_size * n_chunks, chunk_size): - yield PyArrowColumn( - array.slice(start,chunk_size), self._allow_copy - ) - i +=1 - # In case when the size of the chunk is such that the resulting - # list is one less chunk then n_chunks -> append an empty chunk - if i == n_chunks - 1: - yield PyArrowColumn(pa.array([]), self._allow_copy) + chunk_size = self.size() // n_chunks + if self.size() % n_chunks != 0: + chunk_size += 1 + + if isinstance(self._col, pa.ChunkedArray): + array = self._col.combine_chunks() else: - warnings.warn( - "``n_chunks`` must be a multiple of ``self.num_chunks()``") + array = self._col + + i = 0 + for start in range(0, chunk_size * n_chunks, chunk_size): + yield PyArrowColumn( + array.slice(start,chunk_size), self._allow_copy + ) + i +=1 + # In case when the size of the chunk is such that the resulting + # list is one less chunk then n_chunks -> append an empty chunk + if i == n_chunks - 1: + yield PyArrowColumn(pa.array([]), self._allow_copy) elif isinstance(self._col, pa.ChunkedArray): return [ PyArrowColumn(chunk, self._allow_copy) @@ -293,7 +295,7 @@ def get_buffers(self) -> ColumnBuffers: def _get_data_buffer( self, - ) -> tuple[PyArrowBuffer, Any]: # Any is for self.dtype tuple + ) -> Tuple[PyArrowBuffer, Any]: # Any is for self.dtype tuple """ Return the buffer containing the data and the buffer's associated dtype. 
""" @@ -308,7 +310,7 @@ def _get_data_buffer( return PyArrowBuffer(array.buffers()[2]), self.dtype - def _get_validity_buffer(self) -> tuple[PyArrowBuffer, Any]: + def _get_validity_buffer(self) -> Tuple[PyArrowBuffer, Any]: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. @@ -323,12 +325,12 @@ def _get_validity_buffer(self) -> tuple[PyArrowBuffer, Any]: buff = array.buffers()[0] if buff: return PyArrowBuffer(buff), dtype + else: + raise NoBufferPresent( + "There are no missing values so " + "does not have a separate mask") - raise NoBufferPresent( - "There are no missing values so " - "does not have a separate mask") - - def _get_offsets_buffer(self) -> tuple[PyArrowBuffer, Any]: + def _get_offsets_buffer(self) -> Tuple[PyArrowBuffer, Any]: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) and the buffer's associated dtype. diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index f5c7335a283..1ebf87a7d2d 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -111,18 +111,14 @@ def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[TableXchg]: Return an iterator yielding the chunks. """ if n_chunks and n_chunks > 1: - if n_chunks % self.num_chunks() == 0: - chunk_size = self.num_rows() // n_chunks - if self.num_rows() % n_chunks != 0: - chunk_size += 1 - batches = self._df.to_batches(max_chunksize=chunk_size) - # In case when the size of the chunk is such that the resulting - # list is one less chunk then n_chunks -> append an empty chunk - if len(batches) == n_chunks - 1: - batches.append(pa.record_batch([])) - else: - warnings.warn( - "``n_chunks`` must be a multiple of ``self.num_chunks()``") + chunk_size = self.num_rows() // n_chunks + if self.num_rows() % n_chunks != 0: + chunk_size += 1 + batches = self._df.to_batches(max_chunksize=chunk_size) + # In case when the size of the chunk is such that the resulting + # list is one less chunk then n_chunks -> append an empty chunk + if len(batches) == n_chunks - 1: + batches.append(pa.record_batch([])) else: batches = self._df.to_batches() From 6f746fb42f3d73e679db1d3468197e1436915f36 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 8 Nov 2022 12:50:02 +0100 Subject: [PATCH 07/29] Make changes to from_dataframe.py skeleton --- python/pyarrow/interchange/from_dataframe.py | 24 ++++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index 884c2c2c3f8..5865280c543 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -26,9 +26,9 @@ import pyarrow as pa -def from_dataframe(df, allow_copy=True) -> pd.DataFrame: +def from_dataframe(df, allow_copy=True) -> pa.Table: """ - Build a ``pd.DataFrame`` from any DataFrame supporting the interchange + Build a ``pa.Table`` from any DataFrame supporting the interchange protocol. Parameters @@ -41,7 +41,7 @@ def from_dataframe(df, allow_copy=True) -> pd.DataFrame: (if false then zero-copy approach is requested). Returns ------- - pd.DataFrame + pa.Table """ if isinstance(df, pa.Table): return df @@ -54,7 +54,7 @@ def from_dataframe(df, allow_copy=True) -> pd.DataFrame: def _from_dataframe(df: DataFrameXchg, allow_copy=True): """ - Build a ``pd.DataFrame`` from the DataFrame interchange object. 
+ Build a ``pa.Table`` from the DataFrame interchange object. Parameters ---------- df : DataFrameXchg @@ -65,12 +65,12 @@ def _from_dataframe(df: DataFrameXchg, allow_copy=True): (if false then zero-copy approach is requested). Returns ------- - pd.DataFrame + pa.Table """ pass -def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: +def protocol_df_chunk_to_pyarrow(df: DataFrameXchg) -> pa.Table: """ Convert interchange protocol chunk to ``pd.DataFrame``. Parameters @@ -78,7 +78,7 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: df : DataFrameXchg Returns ------- - pd.DataFrame + pa.Table """ # We need a dict of columns here, with each column being a NumPy array # (at least for now, deal with non-NumPy dtypes later). @@ -128,7 +128,7 @@ def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: pass -def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: +def categorical_column_to_dictionary(col: Column) -> tuple[pa.ChunkedArray, Any]: """ Convert a column holding categorical data to a pandas Series. Parameters @@ -137,7 +137,7 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: Returns ------- tuple - Tuple of pd.Series holding the data and the memory owner object + Tuple of pa.ChunkedArray holding the data and the memory owner object that keeps the memory alive. """ pass @@ -230,7 +230,7 @@ def bitmask_to_bool_ndarray( def set_nulls( - data: np.ndarray | pd.Series, + data: np.ndarray | pa.Array | pa.ChunkedArray, col: Column, validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None, allow_modify_inplace: bool = True, @@ -239,7 +239,7 @@ def set_nulls( Set null values for the data according to the column null kind. Parameters ---------- - data : np.ndarray or pd.Series + data : np.ndarray, pa.Array or pa.ChunkedArray, Data to set nulls in. col : Column Column object that describes the `data`. @@ -252,7 +252,7 @@ def set_nulls( (True) or always modify a copy of the `data` (False). Returns ------- - np.ndarray or pd.Series + np.ndarray, pa.Array or pa.ChunkedArray, Data with the nulls being set. """ pass From 1669224ac8cbb7cdea3dbf06868bcbfcfc7d2661 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 8 Nov 2022 14:50:03 +0100 Subject: [PATCH 08/29] Add extra tests and make minor corrections --- python/pyarrow/interchange/buffer.py | 1 - python/pyarrow/interchange/column.py | 2 +- python/pyarrow/interchange/dataframe.py | 1 - .../pyarrow/tests/interchange/test_extra.py | 63 +++++++++++++++++++ 4 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 python/pyarrow/tests/interchange/test_extra.py diff --git a/python/pyarrow/interchange/buffer.py b/python/pyarrow/interchange/buffer.py index 6c183c034ce..d09fc793810 100644 --- a/python/pyarrow/interchange/buffer.py +++ b/python/pyarrow/interchange/buffer.py @@ -17,7 +17,6 @@ from __future__ import annotations -import numpy as np import pyarrow as pa from pyarrow.interchange.dataframe_protocol import ( diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 4c1aa2f6608..085a7dd2294 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -88,7 +88,7 @@ def size(self) -> int: Size of the column, in elements. 
""" if isinstance(self._col, pa.Array): - len = self._col.to_numpy().size + len = self._col.to_numpy(zero_copy_only=False).size else: len = self._col.length() return len diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index 1ebf87a7d2d..319c54e31a8 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -27,7 +27,6 @@ ) import pyarrow as pa -import warnings from pyarrow.interchange.column import PyArrowColumn from pyarrow.interchange.dataframe_protocol import DataFrame as DataFrameXchg diff --git a/python/pyarrow/tests/interchange/test_extra.py b/python/pyarrow/tests/interchange/test_extra.py new file mode 100644 index 00000000000..5046cecea60 --- /dev/null +++ b/python/pyarrow/tests/interchange/test_extra.py @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import ctypes + +import pandas as pd +import pyarrow as pa +import pytest + +from pyarrow.interchange.column import PyArrowColumn +from pyarrow.interchange.dataframe_protocol import ( + ColumnNullType, + DtypeKind, +) + +def test_datetime(): + df = pd.DataFrame({"A": [pd.Timestamp("2022-01-01"), pd.NaT]}) + table = pa.table(df) + col = table.__dataframe__().get_column_by_name("A") + + assert col.size() == 2 + assert col.null_count == 1 + assert col.dtype[0] == DtypeKind.DATETIME + assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0) + + +@pytest.mark.parametrize( + ["test_data", "kind"], + [ + (["foo", "bar"], 21), + ([1.5, 2.5, 3.5], 2), + ([1, 2, 3, 4], 0), + ], +) +def test_array_to_pyarrowcolumn(test_data, kind): + arr = pa.array(test_data) + arr_column = PyArrowColumn(arr) + + assert arr_column._col == arr + assert arr_column.size() == len(test_data) + assert arr_column.dtype[0] == kind + assert arr_column.num_chunks() == 1 + assert arr_column.null_count == 0 + assert arr_column.get_buffers()["validity"] == None + assert len(list(arr_column.get_chunks())) == 1 + + for chunk in arr_column.get_chunks(): + assert chunk == arr_column + From 473414e49c31188c09cdcc416fbb0a059a3913f1 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 8 Nov 2022 18:18:59 +0100 Subject: [PATCH 09/29] Run linter --- python/pyarrow/interchange/column.py | 41 +++++++++++-------- python/pyarrow/interchange/dataframe.py | 18 ++++---- .../pyarrow/interchange/dataframe_protocol.py | 41 +++++++++++-------- python/pyarrow/interchange/from_dataframe.py | 14 +++++-- .../pyarrow/tests/interchange/test_extra.py | 8 ++-- .../interchange/test_interchange_spec.py | 13 +++--- 6 files changed, 75 insertions(+), 60 deletions(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 085a7dd2294..e4110e51fb4 100644 --- a/python/pyarrow/interchange/column.py +++ 
b/python/pyarrow/interchange/column.py @@ -17,7 +17,6 @@ from __future__ import annotations -import warnings from typing import ( Any, Dict, @@ -75,7 +74,9 @@ class PyArrowColumn(Column): doesn't need its own version or ``__column__`` protocol. """ - def __init__(self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True) -> None: + def __init__( + self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True + ) -> None: """ Handles PyArrow Arrays and ChunkedArrays. """ @@ -131,7 +132,7 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: dtype = self._col.type try: bit_width = dtype.bit_width - except: # in case of a variable-length strings + except ValueError: # in case of a variable-length strings bit_width = None if pa.types.is_timestamp(dtype): @@ -145,18 +146,21 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: else: return self._dtype_from_arrowdtype(dtype, bit_width) - - def _dtype_from_arrowdtype(self, dtype, bit_width) -> Tuple[DtypeKind, int, str, str]: + def _dtype_from_arrowdtype( + self, dtype, bit_width + ) -> Tuple[DtypeKind, int, str, str]: """ See `self.dtype` for details. """ # Note: 'c' (complex) not handled yet (not in array spec v1). - # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled - # datetime and timedelta both map to datetime (is timedelta handled?) + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) + # not handled datetime and timedelta both map to datetime + # (is timedelta handled?) kind, f_string = _PYARROW_KINDS.get(dtype, (None, None)) if kind is None: - raise ValueError(f"Data type {dtype} not supported by interchange protocol") + raise ValueError( + f"Data type {dtype} not supported by interchange protocol") return kind, bit_width, f_string, Endianness.NATIVE @@ -181,11 +185,12 @@ def describe_categorical(self) -> CategoricalDescription: if isinstance(self._col, pa.ChunkedArray): arr = self._col.combine_chunks() else: - arr = self._col + arr = self._col if not pa.types.is_dictionary(arr.type): raise TypeError( - "describe_categorical only works on a column with categorical dtype!" + "describe_categorical only works on a column with " + "categorical dtype!" ) return { @@ -240,9 +245,9 @@ def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: i = 0 for start in range(0, chunk_size * n_chunks, chunk_size): yield PyArrowColumn( - array.slice(start,chunk_size), self._allow_copy + array.slice(start, chunk_size), self._allow_copy ) - i +=1 + i += 1 # In case when the size of the chunk is such that the resulting # list is one less chunk then n_chunks -> append an empty chunk if i == n_chunks - 1: @@ -254,7 +259,6 @@ def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: ] else: yield self - def get_buffers(self) -> ColumnBuffers: """ @@ -297,7 +301,8 @@ def _get_data_buffer( self, ) -> Tuple[PyArrowBuffer, Any]: # Any is for self.dtype tuple """ - Return the buffer containing the data and the buffer's associated dtype. + Return the buffer containing the data and the buffer's + associated dtype. """ if isinstance(self._col, pa.ChunkedArray): array = self._col.combine_chunks() @@ -309,12 +314,12 @@ def _get_data_buffer( elif n == 3: return PyArrowBuffer(array.buffers()[2]), self.dtype - def _get_validity_buffer(self) -> Tuple[PyArrowBuffer, Any]: """ - Return the buffer containing the mask values indicating missing data and - the buffer's associated dtype. - Raises NoBufferPresent if null representation is not a bit or byte mask. 
+ Return the buffer containing the mask values indicating missing data + and the buffer's associated dtype. + Raises NoBufferPresent if null representation is not a bit or byte + mask. """ # Define the dtype of the returned buffer dtype = (DtypeKind.BOOL, 8, "b", Endianness.NATIVE) diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index 319c54e31a8..3c9b0cb1f74 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -18,12 +18,9 @@ from __future__ import annotations from typing import ( Any, - Dict, Iterable, Optional, Sequence, - Tuple, - TypedDict, ) import pyarrow as pa @@ -50,9 +47,10 @@ def __init__( """ self._df = df # ``nan_as_null`` is a keyword intended for the consumer to tell the - # producer to overwrite null values in the data with ``NaN`` (or ``NaT``). - # This currently has no effect; once support for nullable extension - # dtypes is added, this value should be propagated to columns. + # producer to overwrite null values in the data with ``NaN`` (or + # ``NaT``). This currently has no effect; once support for nullable + # extension dtypes is added, this value should be propagated to + # columns. self._nan_as_null = nan_as_null self._allow_copy = allow_copy @@ -105,7 +103,9 @@ def select_columns_by_name(self, names: Sequence[str]) -> TableXchg: self._df.select(list(names)), self._nan_as_null, self._allow_copy ) - def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[TableXchg]: + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable[TableXchg]: """ Return an iterator yielding the chunks. """ @@ -122,8 +122,8 @@ def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[TableXchg]: batches = self._df.to_batches() iterator_tables = [TableXchg( - pa.Table.from_batches([batch]), self._nan_as_null, self._allow_copy - ) + pa.Table.from_batches([batch]), self._nan_as_null, self._allow_copy + ) for batch in batches ] return iterator_tables diff --git a/python/pyarrow/interchange/dataframe_protocol.py b/python/pyarrow/interchange/dataframe_protocol.py index de30ede8d63..13a5d337c2d 100644 --- a/python/pyarrow/interchange/dataframe_protocol.py +++ b/python/pyarrow/interchange/dataframe_protocol.py @@ -256,11 +256,11 @@ def dtype(self) -> Dtype: Notes: - Kind specifiers are aligned with DLPack where possible (hence the jump to 20, leave enough room for future extension) - - Masks must be specified as boolean with either bit width 1 (for bit - masks) or 8 (for byte masks). + - Masks must be specified as boolean with either bit width 1 (for + bit masks) or 8 (for byte masks). - Dtype width in bits was preferred over bytes - - Endianness isn't too useful, but included now in case in the future - we need to support non-native endianness + - Endianness isn't too useful, but included now in case in the + future we need to support non-native endianness - Went with Apache Arrow format strings over NumPy format strings because they're more complete from a dataframe perspective - Format strings are mostly useful for datetime specification, and @@ -269,8 +269,8 @@ def dtype(self) -> Dtype: categorical in the data buffer. In case of a separate encoding of the categorical (e.g. an integer to string mapping), this can be derived from ``self.describe_categorical``. - - Data types not included: complex, Arrow-style null, binary, decimal, - and nested (list, struct, map, union) dtypes. 
+ - Data types not included: complex, Arrow-style null, binary, + decimal, and nested (list, struct, map, union) dtypes. """ pass @@ -280,16 +280,19 @@ def describe_categorical(self) -> CategoricalDescription: """ If the dtype is categorical, there are two options: - There are only values in the data buffer. - - There is a separate non-categorical Column encoding categorical values. + - There is a separate non-categorical Column encoding categorical + values. Raises TypeError if the dtype is not categorical - Returns the dictionary with description on how to interpret the data buffer: - - "is_ordered" : bool, whether the ordering of dictionary indices is - semantically meaningful. + Returns the dictionary with description on how to interpret the + data buffer: + - "is_ordered" : bool, whether the ordering of dictionary indices + is semantically meaningful. - "is_dictionary" : bool, whether a mapping of categorical values to other objects exists - - "categories" : Column representing the (implicit) mapping of indices - to category values (e.g. an array of cat1, cat2, ...). - None if not a dictionary-style categorical. + - "categories" : Column representing the (implicit) mapping of + indices to category values (e.g. an array of + cat1, cat2, ...). None if not a dictionary-style + categorical. TBD: are there any other in-memory representations that are needed? """ pass @@ -301,8 +304,8 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]: Return the missing value (or "null") representation the column dtype uses, as a tuple ``(kind, value)``. Value : if kind is "sentinel value", the actual value. If kind is a bit - mask or a byte mask, the value (0 or 1) indicating a missing value. None - otherwise. + mask or a byte mask, the value (0 or 1) indicating a missing value. + None otherwise. """ pass @@ -396,8 +399,8 @@ def __dataframe__( mask or byte mask that is the producer's native representation. ``allow_copy`` is a keyword that defines whether or not the library is allowed to make a copy of the data. For example, copying data would be - necessary if a library supports strided buffers, given that this protocol - specifies contiguous buffers. + necessary if a library supports strided buffers, given that this + protocol specifies contiguous buffers. """ pass @@ -482,7 +485,9 @@ def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": pass @abstractmethod - def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["DataFrame"]: + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable["DataFrame"]: """ Return an iterator yielding the chunks. By default (None), yields the chunks that the data is stored as by the diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index 5865280c543..9d05c7ba00d 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -15,14 +15,18 @@ # specific language governing permissions and limitations # under the License. 
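The ``(kind, value)`` pair that ``describe_null`` documents above is easiest to read with concrete cases in mind. The values below illustrate the protocol's options in general; they are not a statement of what the Arrow-backed column in this series ultimately reports:

    from pyarrow.interchange.dataframe_protocol import ColumnNullType

    # Arrow-style validity bitmaps: a cleared bit marks a missing value.
    bitmask_null = (ColumnNullType.USE_BITMASK, 0)
    # Sentinel encodings: the second element is the sentinel itself.
    sentinel_null = (ColumnNullType.USE_SENTINEL, -1)
    # Columns that cannot hold missing values at all.
    non_nullable = (ColumnNullType.NON_NULLABLE, None)
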
-from pyarrow.interchange.column import PyArrowColumn +from typing import ( + Any, +) + from pyarrow.interchange.dataframe_protocol import ( Buffer, Column, - ColumnNullType, DataFrame as DataFrameXchg, DtypeKind, ) + +import numpy as np import pyarrow as pa @@ -99,7 +103,7 @@ def protocol_df_chunk_to_pyarrow(df: DataFrameXchg) -> pa.Table: ): columns[name], buf = primitive_column_to_ndarray(col) elif dtype == DtypeKind.CATEGORICAL: - columns[name], buf = categorical_column_to_series(col) + columns[name], buf = categorical_column_to_dictionary(col) elif dtype == DtypeKind.STRING: columns[name], buf = string_column_to_ndarray(col) elif dtype == DtypeKind.DATETIME: @@ -128,7 +132,9 @@ def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: pass -def categorical_column_to_dictionary(col: Column) -> tuple[pa.ChunkedArray, Any]: +def categorical_column_to_dictionary( + col: Column +) -> tuple[pa.ChunkedArray, Any]: """ Convert a column holding categorical data to a pandas Series. Parameters diff --git a/python/pyarrow/tests/interchange/test_extra.py b/python/pyarrow/tests/interchange/test_extra.py index 5046cecea60..8fd683c8bf5 100644 --- a/python/pyarrow/tests/interchange/test_extra.py +++ b/python/pyarrow/tests/interchange/test_extra.py @@ -15,8 +15,6 @@ # specific language governing permissions and limitations # under the License. -import ctypes - import pandas as pd import pyarrow as pa import pytest @@ -27,6 +25,7 @@ DtypeKind, ) + def test_datetime(): df = pd.DataFrame({"A": [pd.Timestamp("2022-01-01"), pd.NaT]}) table = pa.table(df) @@ -55,9 +54,8 @@ def test_array_to_pyarrowcolumn(test_data, kind): assert arr_column.dtype[0] == kind assert arr_column.num_chunks() == 1 assert arr_column.null_count == 0 - assert arr_column.get_buffers()["validity"] == None + assert arr_column.get_buffers()["validity"] is None assert len(list(arr_column.get_chunks())) == 1 - + for chunk in arr_column.get_chunks(): assert chunk == arr_column - diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py index 97e9302c700..c292c9eefab 100644 --- a/python/pyarrow/tests/interchange/test_interchange_spec.py +++ b/python/pyarrow/tests/interchange/test_interchange_spec.py @@ -20,6 +20,7 @@ import pyarrow as pa import pytest + @pytest.mark.parametrize( "test_data", [ @@ -54,8 +55,8 @@ def test_mixed_dtypes(): } ) df = table.__dataframe__() - # for meanings of dtype[0] see the spec; we cannot import the spec here as this - # file is expected to be vendored *anywhere*; + # for meanings of dtype[0] see the spec; we cannot import the + # spec here as this file is expected to be vendored *anywhere*; # values for dtype[0] are explained above columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21} @@ -140,8 +141,8 @@ def test_get_columns(): for col in df.get_columns(): assert col.size() == 2 assert col.num_chunks() == 1 - # for meanings of dtype[0] see the spec; we cannot import the spec here as this - # file is expected to be vendored *anywhere* + # for meanings of dtype[0] see the spec; we cannot import the + # spec here as this file is expected to be vendored *anywhere* assert df.get_column(0).dtype[0] == 0 # INT assert df.get_column(1).dtype[0] == 2 # FLOAT @@ -159,8 +160,8 @@ def test_buffer(): assert dataBuf.ptr != 0 device, _ = dataBuf.__dlpack_device__() - # for meanings of dtype[0] see the spec; we cannot import the spec here as this - # file is expected to be vendored *anywhere* + # for meanings of dtype[0] see the spec; 
we cannot import the spec + # here as this file is expected to be vendored *anywhere* assert dataDtype[0] == 0 # INT if device == 1: # CPU-only as we're going to directly read memory here From cba43740e2cdc7d72e34ee8d362e94938489317d Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 9 Nov 2022 08:30:56 +0100 Subject: [PATCH 10/29] Make changes to the code to make pa.Table -> pd.DataFrame work for int, float with missing values --- python/pyarrow/interchange/column.py | 6 +++++- python/pyarrow/interchange/dataframe.py | 7 +++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index e4110e51fb4..9953a30eb2e 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -133,7 +133,11 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: try: bit_width = dtype.bit_width except ValueError: # in case of a variable-length strings - bit_width = None + bit_width = 8 + # In case of bool data type, bit_width is 1 and has to be multiplied + # by 8 (why is that not the case for other dtypes?) + if pa.types.is_boolean(dtype): + bit_width *= 8 if pa.types.is_timestamp(dtype): kind = DtypeKind.DATETIME diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index 3c9b0cb1f74..c5b4be9b539 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -63,9 +63,12 @@ def __dataframe__( def metadata(self) -> dict[str, Any]: # The metadata for the data frame, as a dictionary with string keys. # Add schema metadata here (pandas metadata, ot custom metadata) - schema_metadata = {k.decode('utf8'): v.decode('utf8') + if self._df.schema.metadata: + schema_metadata = {k.decode('utf8'): v.decode('utf8') for k, v in self._df.schema.metadata.items()} - return schema_metadata + return schema_metadata + else: + return {} def num_columns(self) -> int: return self._df.num_columns From c02145152ccbdb5c6f8ace6801b9479d31e2c547 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 9 Nov 2022 09:42:48 +0100 Subject: [PATCH 11/29] Correct linter error and add a check for TypedDict import --- python/pyarrow/interchange/dataframe.py | 2 +- python/pyarrow/interchange/dataframe_protocol.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index c5b4be9b539..5eb93abacc3 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -65,7 +65,7 @@ def metadata(self) -> dict[str, Any]: # Add schema metadata here (pandas metadata, ot custom metadata) if self._df.schema.metadata: schema_metadata = {k.decode('utf8'): v.decode('utf8') - for k, v in self._df.schema.metadata.items()} + for k, v in self._df.schema.metadata.items()} return schema_metadata else: return {} diff --git a/python/pyarrow/interchange/dataframe_protocol.py b/python/pyarrow/interchange/dataframe_protocol.py index 13a5d337c2d..aa3637239d1 100644 --- a/python/pyarrow/interchange/dataframe_protocol.py +++ b/python/pyarrow/interchange/dataframe_protocol.py @@ -34,6 +34,13 @@ TypedDict, ) +import sys + +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + class DlpackDeviceType(enum.IntEnum): """Integer enum for device type codes matching DLPack.""" From 7e1e6bd692a61ef87379f564b3a6e10788116929 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 9 Nov 2022 10:21:15 
+0100 Subject: [PATCH 12/29] Use len(...) for the size of the pa.Array/pa.ChunkedArray --- python/pyarrow/interchange/column.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 9953a30eb2e..f6de4de732d 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -88,11 +88,7 @@ def size(self) -> int: """ Size of the column, in elements. """ - if isinstance(self._col, pa.Array): - len = self._col.to_numpy(zero_copy_only=False).size - else: - len = self._col.length() - return len + return len(self._col) @property def offset(self) -> int: From df9b24bc9bfdfa625711c0260ee688ba8d0678af Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 9 Nov 2022 11:58:09 +0100 Subject: [PATCH 13/29] Add missing annotations import and remove TypedDict leftover --- python/pyarrow/interchange/dataframe_protocol.py | 1 - python/pyarrow/interchange/from_dataframe.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/interchange/dataframe_protocol.py b/python/pyarrow/interchange/dataframe_protocol.py index aa3637239d1..d83dec30495 100644 --- a/python/pyarrow/interchange/dataframe_protocol.py +++ b/python/pyarrow/interchange/dataframe_protocol.py @@ -31,7 +31,6 @@ Optional, Sequence, Tuple, - TypedDict, ) import sys diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index 9d05c7ba00d..f4fcfe45855 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +from __future__ import annotations + from typing import ( Any, ) From 494ffbc6dfcd251cb897dee318ad5fb434304a1d Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 10 Nov 2022 12:31:40 +0100 Subject: [PATCH 14/29] Remove bool bit_width check --- python/pyarrow/interchange/column.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index f6de4de732d..1a35f79c905 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -130,10 +130,6 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: bit_width = dtype.bit_width except ValueError: # in case of a variable-length strings bit_width = 8 - # In case of bool data type, bit_width is 1 and has to be multiplied - # by 8 (why is that not the case for other dtypes?) 
- if pa.types.is_boolean(dtype): - bit_width *= 8 if pa.types.is_timestamp(dtype): kind = DtypeKind.DATETIME From 784d178f59351ffdfe44dfe7b6a342232a2bec71 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Mon, 14 Nov 2022 20:40:04 +0100 Subject: [PATCH 15/29] Change buffer representation of boolean arrays --- python/pyarrow/interchange/column.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 1a35f79c905..ef130765f4e 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -26,6 +26,7 @@ ) import pyarrow as pa +import pyarrow.compute as pc from pyarrow.interchange.buffer import PyArrowBuffer from pyarrow.interchange.dataframe_protocol import (Column, ColumnBuffers, ColumnNullType, DtypeKind, @@ -197,7 +198,7 @@ def describe_categorical(self) -> CategoricalDescription: @property def describe_null(self) -> Tuple[ColumnNullType, Any]: - return ColumnNullType.USE_BYTEMASK, 0 + return ColumnNullType.USE_BITMASK, 0 @property def null_count(self) -> int: @@ -304,11 +305,19 @@ def _get_data_buffer( array = self._col.combine_chunks() else: array = self._col + dtype = self.dtype + + # In case of boolean arrays, cast to uint8 array + # as bit packed buffers are not supported + if pa.types.is_boolean(array.type): + array = pc.cast(array, pa.uint8()) + dtype = PyArrowColumn(array).dtype + n = len(array.buffers()) if n == 2: - return PyArrowBuffer(array.buffers()[1]), self.dtype + return PyArrowBuffer(array.buffers()[1]), dtype elif n == 3: - return PyArrowBuffer(array.buffers()[2]), self.dtype + return PyArrowBuffer(array.buffers()[2]), dtype def _get_validity_buffer(self) -> Tuple[PyArrowBuffer, Any]: """ From 33784dad41b1395cb4b4989ef0ce31c5679ce291 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 15 Nov 2022 11:55:31 +0100 Subject: [PATCH 16/29] Remove dataframe protocol abstract classes and move the docstrings and necessary defenitions to separate implementation files --- python/pyarrow/interchange/buffer.py | 36 +- python/pyarrow/interchange/column.py | 192 ++++++- python/pyarrow/interchange/dataframe.py | 104 +++- .../pyarrow/interchange/dataframe_protocol.py | 506 ------------------ python/pyarrow/table.pxi | 4 +- 5 files changed, 276 insertions(+), 566 deletions(-) delete mode 100644 python/pyarrow/interchange/dataframe_protocol.py diff --git a/python/pyarrow/interchange/buffer.py b/python/pyarrow/interchange/buffer.py index d09fc793810..9f30f2b99e3 100644 --- a/python/pyarrow/interchange/buffer.py +++ b/python/pyarrow/interchange/buffer.py @@ -16,18 +16,35 @@ # under the License. from __future__ import annotations +import enum import pyarrow as pa -from pyarrow.interchange.dataframe_protocol import ( - Buffer, - DlpackDeviceType, -) +class DlpackDeviceType(enum.IntEnum): + """Integer enum for device type codes matching DLPack.""" -class PyArrowBuffer(Buffer): + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + +class _PyArrowBuffer: """ Data in the buffer is guaranteed to be contiguous in memory. + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. 
+ This distinction is useful to support both data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. """ def __init__(self, x: pa.Buffer, allow_copy: bool = True) -> None: @@ -52,13 +69,20 @@ def ptr(self) -> int: def __dlpack__(self): """ - Represent this structure as DLPack interface. + Produce DLPack capsule (see array API standard). + Raises: + - TypeError : if the buffer contains unsupported dtypes. + - NotImplementedError : if DLPack support is not implemented + Useful to have to connect to array libraries. Support optional because + it's not completely trivial to implement for a Python-only library. """ raise NotImplementedError("__dlpack__") def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: """ Device type and device ID for where the data in the buffer resides. + Uses device type codes matching DLPack. + Note: must be implemented even if ``__dlpack__`` is not. """ if self._x.is_cpu: return (DlpackDeviceType.CPU, None) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index ef130765f4e..10c3f7c8c5b 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -17,6 +17,7 @@ from __future__ import annotations +import enum from typing import ( Any, Dict, @@ -25,12 +26,49 @@ Tuple, ) +import sys +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + import pyarrow as pa import pyarrow.compute as pc -from pyarrow.interchange.buffer import PyArrowBuffer -from pyarrow.interchange.dataframe_protocol import (Column, ColumnBuffers, - ColumnNullType, DtypeKind, - CategoricalDescription) +from pyarrow.interchange.buffer import _PyArrowBuffer + + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype + _PYARROW_KINDS = { pa.int8(): (DtypeKind.INT, "c"), @@ -50,6 +88,58 @@ } +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: Tuple[_PyArrowBuffer, Dtype] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. 
+ # None if the null representation is not a bit or byte mask + validity: Optional[Tuple[_PyArrowBuffer, Dtype]] + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: Optional[Tuple[_PyArrowBuffer, Dtype]] + + +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects + # exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. + # categories: Optional[Column] + + class Endianness: """Enum indicating the byte-order of a data-type.""" @@ -63,7 +153,7 @@ class NoBufferPresent(Exception): """Exception to signal that there is no requested buffer.""" -class PyArrowColumn(Column): +class _PyArrowColumn: """ A column object, with only the methods and properties required by the interchange protocol defined. @@ -71,6 +161,31 @@ class PyArrowColumn(Column): buffers - a data buffer, a mask buffer (depending on null representation), and an offsets buffer (if variable-size binary; e.g., variable-length strings). + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? Note: this Column object can only be produced by ``__dataframe__``, so doesn't need its own version or ``__column__`` protocol. """ @@ -88,6 +203,10 @@ def __init__( def size(self) -> int: """ Size of the column, in elements. + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. """ return len(self._col) @@ -95,6 +214,9 @@ def size(self) -> int: def offset(self) -> int: """ Offset of first element. 
+ May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. """ return 0 @@ -104,14 +226,14 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. Bit-width : the number of bits as an integer - Format string : data type description format string in Apache Arrow - C Data Interface format. + Format string : data type description format string in Apache Arrow C + Data Interface format. Endianness : current only native endianness (``=``) is supported Notes: - - Kind specifiers are aligned with DLPack where possible (hence - the jump to 20, leave enough room for future extension) - - Masks must be specified as boolean with either bit width 1 - (for bit masks) or 8 (for byte masks). + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for + bit masks) or 8 (for byte masks). - Dtype width in bits was preferred over bytes - Endianness isn't too useful, but included now in case in the future we need to support non-native endianness @@ -166,18 +288,20 @@ def describe_categorical(self) -> CategoricalDescription: """ If the dtype is categorical, there are two options: - There are only values in the data buffer. - - There is a separate non-categorical Column encoding for categorical + - There is a separate non-categorical Column encoding categorical values. Raises TypeError if the dtype is not categorical - Content of returned dict: + Returns the dictionary with description on how to interpret the + data buffer: - "is_ordered" : bool, whether the ordering of dictionary indices is semantically meaningful. - - "is_dictionary" : bool, whether a dictionary-style mapping of + - "is_dictionary" : bool, whether a mapping of categorical values to other objects exists - "categories" : Column representing the (implicit) mapping of indices to category values (e.g. an array of cat1, cat2, ...). None if not a dictionary-style categorical. + TBD: are there any other in-memory representations that are needed? """ if isinstance(self._col, pa.ChunkedArray): arr = self._col.combine_chunks() @@ -193,24 +317,32 @@ def describe_categorical(self) -> CategoricalDescription: return { "is_ordered": self._col.type.ordered, "is_dictionary": True, - "categories": PyArrowColumn(arr.dictionary), + "categories": _PyArrowColumn(arr.dictionary), } @property def describe_null(self) -> Tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. + None otherwise. + """ return ColumnNullType.USE_BITMASK, 0 @property def null_count(self) -> int: """ - Number of null elements. Should always be known. + Number of null elements, if known. + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. """ return self._col.null_count @property def metadata(self) -> Dict[str, Any]: """ - Store specific metadata of the column. + The metadata for the column. See `DataFrame.metadata` for more details. 
""" pass @@ -224,7 +356,9 @@ def num_chunks(self) -> int: n_chunks = self._col.num_chunks return n_chunks - def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable[_PyArrowColumn]: """ Return an iterator yielding the chunks. See `DataFrame.get_chunks` for details on ``n_chunks``. @@ -241,17 +375,17 @@ def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: i = 0 for start in range(0, chunk_size * n_chunks, chunk_size): - yield PyArrowColumn( + yield _PyArrowColumn( array.slice(start, chunk_size), self._allow_copy ) i += 1 # In case when the size of the chunk is such that the resulting # list is one less chunk then n_chunks -> append an empty chunk if i == n_chunks - 1: - yield PyArrowColumn(pa.array([]), self._allow_copy) + yield _PyArrowColumn(pa.array([]), self._allow_copy) elif isinstance(self._col, pa.ChunkedArray): return [ - PyArrowColumn(chunk, self._allow_copy) + _PyArrowColumn(chunk, self._allow_copy) for chunk in self._col.chunks ] else: @@ -296,7 +430,7 @@ def get_buffers(self) -> ColumnBuffers: def _get_data_buffer( self, - ) -> Tuple[PyArrowBuffer, Any]: # Any is for self.dtype tuple + ) -> Tuple[_PyArrowBuffer, Any]: # Any is for self.dtype tuple """ Return the buffer containing the data and the buffer's associated dtype. @@ -311,15 +445,15 @@ def _get_data_buffer( # as bit packed buffers are not supported if pa.types.is_boolean(array.type): array = pc.cast(array, pa.uint8()) - dtype = PyArrowColumn(array).dtype + dtype = _PyArrowColumn(array).dtype n = len(array.buffers()) if n == 2: - return PyArrowBuffer(array.buffers()[1]), dtype + return _PyArrowBuffer(array.buffers()[1]), dtype elif n == 3: - return PyArrowBuffer(array.buffers()[2]), dtype + return _PyArrowBuffer(array.buffers()[2]), dtype - def _get_validity_buffer(self) -> Tuple[PyArrowBuffer, Any]: + def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. @@ -334,13 +468,13 @@ def _get_validity_buffer(self) -> Tuple[PyArrowBuffer, Any]: array = self._col buff = array.buffers()[0] if buff: - return PyArrowBuffer(buff), dtype + return _PyArrowBuffer(buff), dtype else: raise NoBufferPresent( "There are no missing values so " "does not have a separate mask") - def _get_offsets_buffer(self) -> Tuple[PyArrowBuffer, Any]: + def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) and the buffer's associated dtype. 
@@ -360,4 +494,4 @@ def _get_offsets_buffer(self) -> Tuple[PyArrowBuffer, Any]: elif n == 3: # Define the dtype of the returned buffer dtype = (DtypeKind.INT, 64, "L", Endianness.NATIVE) - return PyArrowBuffer(array.buffers()[2]), dtype + return _PyArrowBuffer(array.buffers()[2]), dtype diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index 5eb93abacc3..965432dd938 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -25,17 +25,20 @@ import pyarrow as pa -from pyarrow.interchange.column import PyArrowColumn -from pyarrow.interchange.dataframe_protocol import DataFrame as DataFrameXchg +from pyarrow.interchange.column import _PyArrowColumn -class TableXchg(DataFrameXchg): +class _PyArrowDataFrame: """ A data frame class, with only the methods required by the interchange protocol defined. - Instances of this (private) class are returned from - ``pd.DataFrame.__dataframe__`` as objects with the methods and - attributes defined on this class. + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. + Columns may be accessed by name or by position. + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. """ def __init__( @@ -56,13 +59,33 @@ def __init__( def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True - ) -> TableXchg: - return TableXchg(self._df, nan_as_null, allow_copy) + ) -> _PyArrowDataFrame: + """ + Construct a new exchange object, potentially changing the parameters. + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN``. + It is intended for cases where the consumer does not support the bit + mask or byte mask that is the producer's native representation. + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this + protocol specifies contiguous buffers. + """ + return _PyArrowDataFrame(self._df, nan_as_null, allow_copy) @property def metadata(self) -> dict[str, Any]: + """ + The metadata for the data frame, as a dictionary with string keys. The + contents of `metadata` may be anything, they are meant for a library + to store information that it needs to, e.g., roundtrip losslessly or + for two implementations to share data that is not (yet) part of the + interchange protocol specification. For avoiding collisions with other + entries, please add name the keys with the name of the library + followed by a period and the desired name, e.g, ``pandas.indexcol``. + """ # The metadata for the data frame, as a dictionary with string keys. - # Add schema metadata here (pandas metadata, ot custom metadata) + # Add schema metadata here (pandas metadata or custom metadata) if self._df.schema.metadata: schema_metadata = {k.decode('utf8'): v.decode('utf8') for k, v in self._df.schema.metadata.items()} @@ -71,46 +94,81 @@ def metadata(self) -> dict[str, Any]: return {} def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. + """ return self._df.num_columns def num_rows(self) -> int: + """ + Return the number of rows in the DataFrame, if available. 
+ """ return self._df.num_rows def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of. + """ return self._df.column(0).num_chunks def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + """ return self._df.column_names - def get_column(self, i: int) -> PyArrowColumn: - return PyArrowColumn(self._df.column(i), - allow_copy=self._allow_copy) + def get_column(self, i: int) -> _PyArrowColumn: + """ + Return the column at the indicated position. + """ + return _PyArrowColumn(self._df.column(i), + allow_copy=self._allow_copy) - def get_column_by_name(self, name: str) -> PyArrowColumn: - return PyArrowColumn(self._df.column(name), - allow_copy=self._allow_copy) + def get_column_by_name(self, name: str) -> _PyArrowColumn: + """ + Return the column whose name is the indicated name. + """ + return _PyArrowColumn(self._df.column(name), + allow_copy=self._allow_copy) - def get_columns(self) -> Iterable[PyArrowColumn]: + def get_columns(self) -> Iterable[_PyArrowColumn]: + """ + Return an iterator yielding the columns. + """ return [ - PyArrowColumn(col, allow_copy=self._allow_copy) + _PyArrowColumn(col, allow_copy=self._allow_copy) for col in self._df.columns ] - def select_columns(self, indices: Sequence[int]) -> TableXchg: - return TableXchg( + def select_columns(self, indices: Sequence[int]) -> _PyArrowDataFrame: + """ + Create a new DataFrame by selecting a subset of columns by index. + """ + return _PyArrowDataFrame( self._df.select(list(indices)), self._nan_as_null, self._allow_copy ) - def select_columns_by_name(self, names: Sequence[str]) -> TableXchg: - return TableXchg( + def select_columns_by_name( + self, names: Sequence[str] + ) -> _PyArrowDataFrame: + """ + Create a new DataFrame by selecting a subset of columns by name. + """ + return _PyArrowDataFrame( self._df.select(list(names)), self._nan_as_null, self._allow_copy ) def get_chunks( self, n_chunks: Optional[int] = None - ) -> Iterable[TableXchg]: + ) -> Iterable[_PyArrowDataFrame]: """ Return an iterator yielding the chunks. + By default (None), yields the chunks that the data is stored as by the + producer. If given, ``n_chunks`` must be a multiple of + ``self.num_chunks()``, meaning the producer must subdivide each chunk + before yielding it. + Note that the producer must ensure that all columns are chunked the + same way. """ if n_chunks and n_chunks > 1: chunk_size = self.num_rows() // n_chunks @@ -124,7 +182,7 @@ def get_chunks( else: batches = self._df.to_batches() - iterator_tables = [TableXchg( + iterator_tables = [_PyArrowDataFrame( pa.Table.from_batches([batch]), self._nan_as_null, self._allow_copy ) for batch in batches diff --git a/python/pyarrow/interchange/dataframe_protocol.py b/python/pyarrow/interchange/dataframe_protocol.py deleted file mode 100644 index d83dec30495..00000000000 --- a/python/pyarrow/interchange/dataframe_protocol.py +++ /dev/null @@ -1,506 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Copy of the spec from https://github.com/data-apis/dataframe-api -""" - -from abc import ( - ABC, - abstractmethod, -) -import enum -from typing import ( - Any, - Dict, - Iterable, - Optional, - Sequence, - Tuple, -) - -import sys - -if sys.version_info >= (3, 8): - from typing import TypedDict -else: - from typing_extensions import TypedDict - - -class DlpackDeviceType(enum.IntEnum): - """Integer enum for device type codes matching DLPack.""" - - CPU = 1 - CUDA = 2 - CPU_PINNED = 3 - OPENCL = 4 - VULKAN = 7 - METAL = 8 - VPI = 9 - ROCM = 10 - - -class DtypeKind(enum.IntEnum): - """ - Integer enum for data types. - Attributes - ---------- - INT : int - Matches to signed integer data type. - UINT : int - Matches to unsigned integer data type. - FLOAT : int - Matches to floating point data type. - BOOL : int - Matches to boolean data type. - STRING : int - Matches to string data type (UTF-8 encoded). - DATETIME : int - Matches to datetime data type. - CATEGORICAL : int - Matches to categorical data type. - """ - - INT = 0 - UINT = 1 - FLOAT = 2 - BOOL = 20 - STRING = 21 # UTF-8 - DATETIME = 22 - CATEGORICAL = 23 - - -Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype - - -class ColumnNullType(enum.IntEnum): - """ - Integer enum for null type representation. - Attributes - ---------- - NON_NULLABLE : int - Non-nullable column. - USE_NAN : int - Use explicit float NaN value. - USE_SENTINEL : int - Sentinel value besides NaN. - USE_BITMASK : int - The bit is set/unset representing a null on a certain position. - USE_BYTEMASK : int - The byte is set/unset representing a null on a certain position. - """ - - NON_NULLABLE = 0 - USE_NAN = 1 - USE_SENTINEL = 2 - USE_BITMASK = 3 - USE_BYTEMASK = 4 - - -class ColumnBuffers(TypedDict): - # first element is a buffer containing the column data; - # second element is the data buffer's associated dtype - data: Tuple["Buffer", Dtype] - - # first element is a buffer containing mask values indicating missing data; - # second element is the mask value buffer's associated dtype. - # None if the null representation is not a bit or byte mask - validity: Optional[Tuple["Buffer", Dtype]] - - # first element is a buffer containing the offset values for - # variable-size binary data (e.g., variable-length strings); - # second element is the offsets buffer's associated dtype. - # None if the data buffer does not have an associated offsets buffer - offsets: Optional[Tuple["Buffer", Dtype]] - - -class CategoricalDescription(TypedDict): - # whether the ordering of dictionary indices is semantically meaningful - is_ordered: bool - # whether a dictionary-style mapping of categorical values to other objects - # exists - is_dictionary: bool - # Python-level only (e.g. ``{int: str}``). - # None if not a dictionary-style categorical. - # categories: Optional[Column] - - -class Buffer(ABC): - """ - Data in the buffer is guaranteed to be contiguous in memory. - Note that there is no dtype attribute present, a buffer can be thought of - as simply a block of memory. 
However, if the column that the buffer is - attached to has a dtype that's supported by DLPack and ``__dlpack__`` is - implemented, then that dtype information will be contained in the return - value from ``__dlpack__``. - This distinction is useful to support both data exchange via DLPack on a - buffer and (b) dtypes like variable-length strings which do not have a - fixed number of bytes per element. - """ - - @property - @abstractmethod - def bufsize(self) -> int: - """ - Buffer size in bytes. - """ - pass - - @property - @abstractmethod - def ptr(self) -> int: - """ - Pointer to start of the buffer as an integer. - """ - pass - - @abstractmethod - def __dlpack__(self): - """ - Produce DLPack capsule (see array API standard). - Raises: - - TypeError : if the buffer contains unsupported dtypes. - - NotImplementedError : if DLPack support is not implemented - Useful to have to connect to array libraries. Support optional because - it's not completely trivial to implement for a Python-only library. - """ - raise NotImplementedError("__dlpack__") - - @abstractmethod - def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]: - """ - Device type and device ID for where the data in the buffer resides. - Uses device type codes matching DLPack. - Note: must be implemented even if ``__dlpack__`` is not. - """ - pass - - -class Column(ABC): - """ - A column object, with only the methods and properties required by the - interchange protocol defined. - A column can contain one or more chunks. Each chunk can contain up to three - buffers - a data buffer, a mask buffer (depending on null representation), - and an offsets buffer (if variable-size binary; e.g., variable-length - strings). - TBD: Arrow has a separate "null" dtype, and has no separate mask concept. - Instead, it seems to use "children" for both columns with a bit mask, - and for nested dtypes. Unclear whether this is elegant or confusing. - This design requires checking the null representation explicitly. - The Arrow design requires checking: - 1. the ARROW_FLAG_NULLABLE (for sentinel values) - 2. if a column has two children, combined with one of those children - having a null dtype. - Making the mask concept explicit seems useful. One null dtype would - not be enough to cover both bit and byte masks, so that would mean - even more checking if we did it the Arrow way. - TBD: there's also the "chunk" concept here, which is implicit in Arrow as - multiple buffers per array (= column here). Semantically it may make - sense to have both: chunks were meant for example for lazy evaluation - of data which doesn't fit in memory, while multiple buffers per column - could also come from doing a selection operation on a single - contiguous buffer. - Given these concepts, one would expect chunks to be all of the same - size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), - while multiple buffers could have data-dependent lengths. Not an issue - in pandas if one column is backed by a single NumPy array, but in - Arrow it seems possible. - Are multiple chunks *and* multiple buffers per column necessary for - the purposes of this interchange protocol, or must producers either - reuse the chunk concept for this or copy the data? - Note: this Column object can only be produced by ``__dataframe__``, so - doesn't need its own version or ``__column__`` protocol. - """ - - @abstractmethod - def size(self) -> int: - """ - Size of the column, in elements. 
- Corresponds to DataFrame.num_rows() if column is a single chunk; - equal to size of this current chunk otherwise. - Is a method rather than a property because it may cause a (potentially - expensive) computation for some dataframe implementations. - """ - pass - - @property - @abstractmethod - def offset(self) -> int: - """ - Offset of first element. - May be > 0 if using chunks; for example for a column with N chunks of - equal size M (only the last chunk may be shorter), - ``offset = n * M``, ``n = 0 .. N-1``. - """ - pass - - @property - @abstractmethod - def dtype(self) -> Dtype: - """ - Dtype description as a tuple ``(kind, bit-width, format string, - endianness)``. - Bit-width : the number of bits as an integer - Format string : data type description format string in Apache Arrow C - Data Interface format. - Endianness : current only native endianness (``=``) is supported - Notes: - - Kind specifiers are aligned with DLPack where possible (hence the - jump to 20, leave enough room for future extension) - - Masks must be specified as boolean with either bit width 1 (for - bit masks) or 8 (for byte masks). - - Dtype width in bits was preferred over bytes - - Endianness isn't too useful, but included now in case in the - future we need to support non-native endianness - - Went with Apache Arrow format strings over NumPy format strings - because they're more complete from a dataframe perspective - - Format strings are mostly useful for datetime specification, and - for categoricals. - - For categoricals, the format string describes the type of the - categorical in the data buffer. In case of a separate encoding of - the categorical (e.g. an integer to string mapping), this can - be derived from ``self.describe_categorical``. - - Data types not included: complex, Arrow-style null, binary, - decimal, and nested (list, struct, map, union) dtypes. - """ - pass - - @property - @abstractmethod - def describe_categorical(self) -> CategoricalDescription: - """ - If the dtype is categorical, there are two options: - - There are only values in the data buffer. - - There is a separate non-categorical Column encoding categorical - values. - Raises TypeError if the dtype is not categorical - Returns the dictionary with description on how to interpret the - data buffer: - - "is_ordered" : bool, whether the ordering of dictionary indices - is semantically meaningful. - - "is_dictionary" : bool, whether a mapping of - categorical values to other objects exists - - "categories" : Column representing the (implicit) mapping of - indices to category values (e.g. an array of - cat1, cat2, ...). None if not a dictionary-style - categorical. - TBD: are there any other in-memory representations that are needed? - """ - pass - - @property - @abstractmethod - def describe_null(self) -> Tuple[ColumnNullType, Any]: - """ - Return the missing value (or "null") representation the column dtype - uses, as a tuple ``(kind, value)``. - Value : if kind is "sentinel value", the actual value. If kind is a bit - mask or a byte mask, the value (0 or 1) indicating a missing value. - None otherwise. - """ - pass - - @property - @abstractmethod - def null_count(self) -> Optional[int]: - """ - Number of null elements, if known. - Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. - """ - pass - - @property - @abstractmethod - def metadata(self) -> Dict[str, Any]: - """ - The metadata for the column. See `DataFrame.metadata` for more details. 
- """ - pass - - @abstractmethod - def num_chunks(self) -> int: - """ - Return the number of chunks the column consists of. - """ - pass - - @abstractmethod - def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: - """ - Return an iterator yielding the chunks. - See `DataFrame.get_chunks` for details on ``n_chunks``. - """ - pass - - @abstractmethod - def get_buffers(self) -> ColumnBuffers: - """ - Return a dictionary containing the underlying buffers. - The returned dictionary has the following contents: - - "data": a two-element tuple whose first element is a buffer - containing the data and whose second element is the data - buffer's associated dtype. - - "validity": a two-element tuple whose first element is a buffer - containing mask values indicating missing data and - whose second element is the mask value buffer's - associated dtype. None if the null representation is - not a bit or byte mask. - - "offsets": a two-element tuple whose first element is a buffer - containing the offset values for variable-size binary - data (e.g., variable-length strings) and whose second - element is the offsets buffer's associated dtype. None - if the data buffer does not have an associated offsets - buffer. - """ - pass - - -# def get_children(self) -> Iterable[Column]: -# """ -# Children columns underneath the column, each object in this iterator -# must adhere to the column specification. -# """ -# pass - - -class DataFrame(ABC): - """ - A data frame class, with only the methods required by the interchange - protocol defined. - A "data frame" represents an ordered collection of named columns. - A column's "name" must be a unique string. - Columns may be accessed by name or by position. - This could be a public data frame class, or an object with the methods and - attributes defined on this DataFrame class could be returned from the - ``__dataframe__`` method of a public data frame class in a library adhering - to the dataframe interchange protocol specification. - """ - - version = 0 # version of the protocol - - @abstractmethod - def __dataframe__( - self, nan_as_null: bool = False, allow_copy: bool = True - ) -> "DataFrame": - """ - Construct a new exchange object, potentially changing the parameters. - ``nan_as_null`` is a keyword intended for the consumer to tell the - producer to overwrite null values in the data with ``NaN``. - It is intended for cases where the consumer does not support the bit - mask or byte mask that is the producer's native representation. - ``allow_copy`` is a keyword that defines whether or not the library is - allowed to make a copy of the data. For example, copying data would be - necessary if a library supports strided buffers, given that this - protocol specifies contiguous buffers. - """ - pass - - @property - @abstractmethod - def metadata(self) -> Dict[str, Any]: - """ - The metadata for the data frame, as a dictionary with string keys. The - contents of `metadata` may be anything, they are meant for a library - to store information that it needs to, e.g., roundtrip losslessly or - for two implementations to share data that is not (yet) part of the - interchange protocol specification. For avoiding collisions with other - entries, please add name the keys with the name of the library - followed by a period and the desired name, e.g, ``pandas.indexcol``. - """ - pass - - @abstractmethod - def num_columns(self) -> int: - """ - Return the number of columns in the DataFrame. 
- """ - pass - - @abstractmethod - def num_rows(self) -> Optional[int]: - # TODO: not happy with Optional, but need to flag it may be expensive - # why include it if it may be None - what do we expect consumers - # to do here? - """ - Return the number of rows in the DataFrame, if available. - """ - pass - - @abstractmethod - def num_chunks(self) -> int: - """ - Return the number of chunks the DataFrame consists of. - """ - pass - - @abstractmethod - def column_names(self) -> Iterable[str]: - """ - Return an iterator yielding the column names. - """ - pass - - @abstractmethod - def get_column(self, i: int) -> Column: - """ - Return the column at the indicated position. - """ - pass - - @abstractmethod - def get_column_by_name(self, name: str) -> Column: - """ - Return the column whose name is the indicated name. - """ - pass - - @abstractmethod - def get_columns(self) -> Iterable[Column]: - """ - Return an iterator yielding the columns. - """ - pass - - @abstractmethod - def select_columns(self, indices: Sequence[int]) -> "DataFrame": - """ - Create a new DataFrame by selecting a subset of columns by index. - """ - pass - - @abstractmethod - def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": - """ - Create a new DataFrame by selecting a subset of columns by name. - """ - pass - - @abstractmethod - def get_chunks( - self, n_chunks: Optional[int] = None - ) -> Iterable["DataFrame"]: - """ - Return an iterator yielding the chunks. - By default (None), yields the chunks that the data is stored as by the - producer. If given, ``n_chunks`` must be a multiple of - ``self.num_chunks()``, meaning the producer must subdivide each chunk - before yielding it. - Note that the producer must ensure that all columns are chunked the - same way. - """ - pass diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 25f498cb0dd..94d8b6b6487 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -2833,9 +2833,9 @@ cdef class Table(_PandasConvertible): dtypes is added, this value should be propagated to columns. """ - from pyarrow.interchange.dataframe import TableXchg + from pyarrow.interchange.dataframe import _PyArrowDataFrame - return TableXchg(self, nan_as_null, allow_copy) + return _PyArrowDataFrame(self, nan_as_null, allow_copy) # ---------------------------------------------------------------------- From 2860911ab671368f1f8004e3f4a9ab3cbd4cec12 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 15 Nov 2022 13:06:31 +0100 Subject: [PATCH 17/29] Add missing changes to the class names and references --- python/pyarrow/interchange/from_dataframe.py | 36 +++++++++---------- .../pyarrow/tests/interchange/test_extra.py | 6 ++-- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index f4fcfe45855..6c5d576d467 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -21,10 +21,10 @@ Any, ) -from pyarrow.interchange.dataframe_protocol import ( - Buffer, - Column, - DataFrame as DataFrameXchg, +from pyarrow.interchange.buffer import _PyArrowBuffer +from pyarrow.interchange.column import _PyArrowColumn +from pyarrow.interchange.dataframe import ( + _PyArrowDataFrame, DtypeKind, ) @@ -39,7 +39,7 @@ def from_dataframe(df, allow_copy=True) -> pa.Table: Parameters ---------- - df : DataFrameXchg + df : _PyArrowDataFrame Object supporting the interchange protocol, i.e. `__dataframe__` method. 
allow_copy : bool, default: True @@ -58,12 +58,12 @@ def from_dataframe(df, allow_copy=True) -> pa.Table: return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) -def _from_dataframe(df: DataFrameXchg, allow_copy=True): +def _from_dataframe(df: _PyArrowDataFrame, allow_copy=True): """ Build a ``pa.Table`` from the DataFrame interchange object. Parameters ---------- - df : DataFrameXchg + df : _PyArrowDataFrame Object supporting the interchange protocol, i.e. `__dataframe__` method. allow_copy : bool, default: True @@ -76,12 +76,12 @@ def _from_dataframe(df: DataFrameXchg, allow_copy=True): pass -def protocol_df_chunk_to_pyarrow(df: DataFrameXchg) -> pa.Table: +def protocol_df_chunk_to_pyarrow(df: _PyArrowDataFrame) -> pa.Table: """ Convert interchange protocol chunk to ``pd.DataFrame``. Parameters ---------- - df : DataFrameXchg + df : _PyArrowDataFrame Returns ------- pa.Table @@ -118,7 +118,7 @@ def protocol_df_chunk_to_pyarrow(df: DataFrameXchg) -> pa.Table: pass -def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: +def primitive_column_to_ndarray(col: _PyArrowColumn) -> tuple[np.ndarray, Any]: """ Convert a column holding one of the primitive dtypes to a NumPy array. A primitive type is one of: int, uint, float, bool. @@ -135,7 +135,7 @@ def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: def categorical_column_to_dictionary( - col: Column + col: _PyArrowColumn ) -> tuple[pa.ChunkedArray, Any]: """ Convert a column holding categorical data to a pandas Series. @@ -151,7 +151,7 @@ def categorical_column_to_dictionary( pass -def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: +def string_column_to_ndarray(col: _PyArrowColumn) -> tuple[np.ndarray, Any]: """ Convert a column holding string data to a NumPy array. Parameters @@ -171,7 +171,7 @@ def parse_datetime_format_str(format_str, data): pass -def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: +def datetime_column_to_ndarray(col: _PyArrowColumn) -> tuple[np.ndarray, Any]: """ Convert a column holding DateTime data to a NumPy array. Parameters @@ -187,7 +187,7 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: def buffer_to_ndarray( - buffer: Buffer, + buffer: _PyArrowBuffer, dtype: tuple[DtypeKind, int, str, str], offset: int = 0, length: int | None = None, @@ -239,8 +239,8 @@ def bitmask_to_bool_ndarray( def set_nulls( data: np.ndarray | pa.Array | pa.ChunkedArray, - col: Column, - validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None, + col: _PyArrowColumn, + validity: tuple[_PyArrowBuffer, tuple[DtypeKind, int, str, str]] | None, allow_modify_inplace: bool = True, ): """ @@ -249,9 +249,9 @@ def set_nulls( ---------- data : np.ndarray, pa.Array or pa.ChunkedArray, Data to set nulls in. - col : Column + col : _PyArrowColumn Column object that describes the `data`. - validity : tuple(Buffer, dtype) or None + validity : tuple(_PyArrowBuffer, dtype) or None The return value of ``col.buffers()``. We do not access the ``col.buffers()`` here to not take the ownership of the memory of buffer objects. 
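[Illustrative sketch, not part of the patch: the consumer-side entry point these stubs build towards. It assumes ``from_dataframe`` ends up exported as ``pyarrow.interchange.from_dataframe`` and that the conversion helpers above are eventually filled in; pandas is used here only as an example producer of ``__dataframe__`` objects.]

import pandas as pd
import pyarrow.interchange as pi

pandas_df = pd.DataFrame({"ints": [1, 2, None], "strs": ["a", "b", "c"]})
# Rebuild any object exposing __dataframe__ as a pyarrow Table
table = pi.from_dataframe(pandas_df, allow_copy=True)
print(table.schema)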
diff --git a/python/pyarrow/tests/interchange/test_extra.py b/python/pyarrow/tests/interchange/test_extra.py index 8fd683c8bf5..8028abac118 100644 --- a/python/pyarrow/tests/interchange/test_extra.py +++ b/python/pyarrow/tests/interchange/test_extra.py @@ -19,8 +19,8 @@ import pyarrow as pa import pytest -from pyarrow.interchange.column import PyArrowColumn -from pyarrow.interchange.dataframe_protocol import ( +from pyarrow.interchange.column import ( + _PyArrowColumn, ColumnNullType, DtypeKind, ) @@ -47,7 +47,7 @@ def test_datetime(): ) def test_array_to_pyarrowcolumn(test_data, kind): arr = pa.array(test_data) - arr_column = PyArrowColumn(arr) + arr_column = _PyArrowColumn(arr) assert arr_column._col == arr assert arr_column.size() == len(test_data) From 92a176597064710dc64474ebd2ea8b61c93c50e3 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 15 Nov 2022 13:23:22 +0100 Subject: [PATCH 18/29] Add ColumnNullType = non nullable for columns without missing values --- python/pyarrow/interchange/column.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 10c3f7c8c5b..5de4f658e6d 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -329,7 +329,13 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]: mask or a byte mask, the value (0 or 1) indicating a missing value. None otherwise. """ - return ColumnNullType.USE_BITMASK, 0 + # In case of no missing values, we need to set ColumnNullType to + # non nullable as in the current __dataframe__ protocol bit/byte masks + # can not be None + if self.null_count == 0: + return ColumnNullType.NON_NULLABLE, None + else: + return ColumnNullType.USE_BITMASK, 0 @property def null_count(self) -> int: From 95f7f45324dd5bd481af46c773a74f4f26136f0a Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 15 Nov 2022 13:27:00 +0100 Subject: [PATCH 19/29] Correct test error after describe_null() change --- python/pyarrow/tests/interchange/test_extra.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/interchange/test_extra.py b/python/pyarrow/tests/interchange/test_extra.py index 8028abac118..d9eeb761866 100644 --- a/python/pyarrow/tests/interchange/test_extra.py +++ b/python/pyarrow/tests/interchange/test_extra.py @@ -34,7 +34,7 @@ def test_datetime(): assert col.size() == 2 assert col.null_count == 1 assert col.dtype[0] == DtypeKind.DATETIME - assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0) + assert col.describe_null == (ColumnNullType.USE_BITMASK, 0) @pytest.mark.parametrize( From 964e9da8cae15ddbfe8da30a48d40ee536390310 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 15 Nov 2022 14:20:47 +0100 Subject: [PATCH 20/29] Change DtypeKind to be imported from column.py --- python/pyarrow/interchange/from_dataframe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index 6c5d576d467..81c4dd115be 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -22,11 +22,11 @@ ) from pyarrow.interchange.buffer import _PyArrowBuffer -from pyarrow.interchange.column import _PyArrowColumn -from pyarrow.interchange.dataframe import ( - _PyArrowDataFrame, +from pyarrow.interchange.column import ( + _PyArrowColumn, DtypeKind, ) +from pyarrow.interchange.dataframe import _PyArrowDataFrame import numpy as np 
import pyarrow as pa From 3658088bb2647411666a33bf17a3af4b4ec74d9f Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 17 Nov 2022 15:13:58 +0100 Subject: [PATCH 21/29] Add change for string dtype and bitmask - not sure about it though --- python/pyarrow/interchange/column.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 5de4f658e6d..5bcf9affb9a 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -200,6 +200,7 @@ def __init__( self._col = column self._allow_copy = allow_copy + @property def size(self) -> int: """ Size of the column, in elements. @@ -467,7 +468,7 @@ def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]: mask. """ # Define the dtype of the returned buffer - dtype = (DtypeKind.BOOL, 8, "b", Endianness.NATIVE) + dtype = (DtypeKind.BOOL, 1, "b", Endianness.NATIVE) if isinstance(self._col, pa.ChunkedArray): array = self._col.combine_chunks() else: @@ -499,5 +500,5 @@ def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]: ) elif n == 3: # Define the dtype of the returned buffer - dtype = (DtypeKind.INT, 64, "L", Endianness.NATIVE) - return _PyArrowBuffer(array.buffers()[2]), dtype + dtype = (DtypeKind.INT, 32, "i", Endianness.NATIVE) + return _PyArrowBuffer(array.buffers()[1]), dtype From caefeed1d651d6c194adb2f0f87c44405898ee34 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 17 Nov 2022 15:38:34 +0100 Subject: [PATCH 22/29] Add a change for dictionary arrays --- python/pyarrow/interchange/column.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 5bcf9affb9a..2f9d3f9b513 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -453,6 +453,12 @@ def _get_data_buffer( if pa.types.is_boolean(array.type): array = pc.cast(array, pa.uint8()) dtype = _PyArrowColumn(array).dtype + # In case of dictionary arrays, use indices + # to define a buffer, codes are transferred through + # describe_categorical() + if pa.types.is_dictionary(array.type): + array = array.indices + dtype = _PyArrowColumn(array).dtype n = len(array.buffers()) if n == 2: From 8871d117da9f8db9b49282dc1ff3a4f0ce3bb08e Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 17 Nov 2022 15:56:13 +0100 Subject: [PATCH 23/29] Add corrections for timestamp dtype --- python/pyarrow/interchange/column.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 2f9d3f9b513..4864aabf984 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -257,7 +257,9 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: if pa.types.is_timestamp(dtype): kind = DtypeKind.DATETIME - f_string = "ts{dtype.unit}:{dtype.tz}" + ts = dtype.unit[0] + tz = dtype.tz if dtype.tz else "" + f_string = "ts{ts}:{tz}".format(ts=ts, tz=tz) return kind, bit_width, f_string, Endianness.NATIVE elif pa.types.is_dictionary(dtype): kind = DtypeKind.CATEGORICAL From ad9b2e8c4041ce47cff99d31c6bcac7118f719ee Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 17 Nov 2022 17:22:20 +0100 Subject: [PATCH 24/29] Change size() to size --- python/pyarrow/interchange/column.py | 4 ++-- python/pyarrow/tests/interchange/test_extra.py | 4 ++-- python/pyarrow/tests/interchange/test_interchange_spec.py | 8 ++++---- 3 files changed, 8 insertions(+), 8 
diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py
index 4864aabf984..5ee5b173155 100644
--- a/python/pyarrow/interchange/column.py
+++ b/python/pyarrow/interchange/column.py
@@ -373,8 +373,8 @@ def get_chunks(
         See `DataFrame.get_chunks` for details on ``n_chunks``.
         """
         if n_chunks and n_chunks > 1:
-            chunk_size = self.size() // n_chunks
-            if self.size() % n_chunks != 0:
+            chunk_size = self.size // n_chunks
+            if self.size % n_chunks != 0:
                 chunk_size += 1
 
             if isinstance(self._col, pa.ChunkedArray):
diff --git a/python/pyarrow/tests/interchange/test_extra.py b/python/pyarrow/tests/interchange/test_extra.py
index d9eeb761866..00adce005dc 100644
--- a/python/pyarrow/tests/interchange/test_extra.py
+++ b/python/pyarrow/tests/interchange/test_extra.py
@@ -31,7 +31,7 @@ def test_datetime():
     table = pa.table(df)
     col = table.__dataframe__().get_column_by_name("A")
 
-    assert col.size() == 2
+    assert col.size == 2
     assert col.null_count == 1
     assert col.dtype[0] == DtypeKind.DATETIME
     assert col.describe_null == (ColumnNullType.USE_BITMASK, 0)
@@ -50,7 +50,7 @@ def test_array_to_pyarrowcolumn(test_data, kind):
     arr_column = _PyArrowColumn(arr)
 
     assert arr_column._col == arr
-    assert arr_column.size() == len(test_data)
+    assert arr_column.size == len(test_data)
    assert arr_column.dtype[0] == kind
     assert arr_column.num_chunks() == 1
     assert arr_column.null_count == 0
diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py
index c292c9eefab..425e8f7f95d 100644
--- a/python/pyarrow/tests/interchange/test_interchange_spec.py
+++ b/python/pyarrow/tests/interchange/test_interchange_spec.py
@@ -39,7 +39,7 @@ def test_only_one_dtype(test_data):
         null_count = df.get_column_by_name(column).null_count
         assert null_count == 0
         assert isinstance(null_count, int)
-        assert df.get_column_by_name(column).size() == column_size
+        assert df.get_column_by_name(column).size == column_size
         assert df.get_column_by_name(column).offset == 0
 
 
@@ -64,7 +64,7 @@ def test_mixed_dtypes():
         col = df.get_column_by_name(column)
         assert col.null_count == 0
         assert isinstance(col.null_count, int)
-        assert col.size() == 3
+        assert col.size == 3
         assert col.offset == 0
         assert col.dtype[0] == kind
 
@@ -132,14 +132,14 @@ def test_column_get_chunks(size, n_chunks):
     df = table.__dataframe__()
     chunks = list(df.get_column(0).get_chunks(n_chunks))
     assert len(chunks) == n_chunks
-    assert sum(chunk.size() for chunk in chunks) == size
+    assert sum(chunk.size for chunk in chunks) == size
 
 
 def test_get_columns():
     table = pa.table({"a": [0, 1], "b": [2.5, 3.5]})
     df = table.__dataframe__()
     for col in df.get_columns():
-        assert col.size() == 2
+        assert col.size == 2
         assert col.num_chunks() == 1
     # for meanings of dtype[0] see the spec; we cannot import the
     # spec here as this file is expected to be vendored *anywhere*

From 2b83dd8cf188b1b38914cfcda5b75da080558207 Mon Sep 17 00:00:00 2001
From: Alenka Frim
Date: Tue, 22 Nov 2022 14:10:01 +0100
Subject: [PATCH 25/29] Add schema to empty record batch and keep the number of chunks fixed to n_chunks

---
 python/pyarrow/interchange/column.py    | 5 +----
 python/pyarrow/interchange/dataframe.py | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py
index 5ee5b173155..dbdb63193fa 100644
--- a/python/pyarrow/interchange/column.py
+++ b/python/pyarrow/interchange/column.py
@@ -388,10 +388,7 @@ def get_chunks(
                     array.slice(start, chunk_size), self._allow_copy
                 )
                 i += 1
-            # In case when the size of the chunk is such that the resulting
-            # list is one less chunk then n_chunks -> append an empty chunk
-            if i == n_chunks - 1:
-                yield _PyArrowColumn(pa.array([]), self._allow_copy)
+
         elif isinstance(self._col, pa.ChunkedArray):
             return [
                 _PyArrowColumn(chunk, self._allow_copy)
diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py
index 965432dd938..36b669f4358 100644
--- a/python/pyarrow/interchange/dataframe.py
+++ b/python/pyarrow/interchange/dataframe.py
@@ -178,7 +178,7 @@ def get_chunks(
             # In case when the size of the chunk is such that the resulting
             # list is one less chunk then n_chunks -> append an empty chunk
             if len(batches) == n_chunks - 1:
-                batches.append(pa.record_batch([]))
+                batches.append(pa.record_batch([[]], schema = self._df.schema))
         else:
             batches = self._df.to_batches()
 

From 4f150efcf930b4527034a572899322d29dfca3c4 Mon Sep 17 00:00:00 2001
From: Alenka Frim
Date: Tue, 22 Nov 2022 14:37:53 +0100
Subject: [PATCH 26/29] Add offset for sliced array with a test and use datetime instead of pandas timestamp in the tests

---
 python/pyarrow/interchange/column.py          |  7 ++++-
 .../pyarrow/tests/interchange/test_extra.py   | 26 +++++++++++++++++--
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py
index dbdb63193fa..a5c9d4a2833 100644
--- a/python/pyarrow/interchange/column.py
+++ b/python/pyarrow/interchange/column.py
@@ -219,7 +219,12 @@ def offset(self) -> int:
         equal size M (only the last chunk may be shorter),
         ``offset = n * M``, ``n = 0 .. N-1``.
         """
-        return 0
+        if isinstance(self._col, pa.Array):
+            return self._col.offset
+        else:
+            # ChunkedArray gets copied with `combine_chunks` so the offset will
+            # always be 0
+            return 0
 
     @property
     def dtype(self) -> Tuple[DtypeKind, int, str, str]:
diff --git a/python/pyarrow/tests/interchange/test_extra.py b/python/pyarrow/tests/interchange/test_extra.py
index 00adce005dc..68368ebaf2d 100644
--- a/python/pyarrow/tests/interchange/test_extra.py
+++ b/python/pyarrow/tests/interchange/test_extra.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
-import pandas as pd
+from datetime import datetime as dt
 
 import pyarrow as pa
 import pytest
@@ -25,9 +25,16 @@
     DtypeKind,
 )
 
+try:
+    import pandas as pd
+    import pandas.testing as tm
+    from pandas.core.interchange.from_dataframe import from_dataframe
+except ImportError:
+    pass
+
 
 def test_datetime():
-    df = pd.DataFrame({"A": [pd.Timestamp("2022-01-01"), pd.NaT]})
+    df = pd.DataFrame({"A": [dt(2007, 7, 13), None]})
     table = pa.table(df)
     col = table.__dataframe__().get_column_by_name("A")
 
@@ -59,3 +66,18 @@ def test_array_to_pyarrowcolumn(test_data, kind):
 
     for chunk in arr_column.get_chunks():
         assert chunk == arr_column
+
+
+@pytest.mark.pandas
+def test_offset_of_sliced_array():
+    arr = pa.array([1, 2, 3, 4])
+    arr_sliced = arr.slice(2, 2)
+
+    table = pa.table([arr], names = ["arr"])
+    table_sliced = pa.table([arr_sliced], names = ["arr_sliced"])
+
+    df = from_dataframe(table)
+    df_sliced = from_dataframe(table_sliced)
+
+    tm.assert_series_equal(df["arr"][2:4], df_sliced["arr_sliced"],
+                           check_index=False, check_names=False)

From 1a456fee50f6ffbc10d880ee1dd980fd233f2d00 Mon Sep 17 00:00:00 2001
From: Alenka Frim
Date: Thu, 24 Nov 2022 15:30:18 +0100
Subject: [PATCH 27/29] Fix linter errors

---
 python/pyarrow/interchange/dataframe.py        | 2 +-
 python/pyarrow/tests/interchange/test_extra.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py
index 36b669f4358..278ee102ec9 100644
--- a/python/pyarrow/interchange/dataframe.py
+++ b/python/pyarrow/interchange/dataframe.py
@@ -178,7 +178,7 @@ def get_chunks(
             # In case when the size of the chunk is such that the resulting
             # list is one less chunk then n_chunks -> append an empty chunk
             if len(batches) == n_chunks - 1:
-                batches.append(pa.record_batch([[]], schema = self._df.schema))
+                batches.append(pa.record_batch([[]], schema=self._df.schema))
         else:
             batches = self._df.to_batches()
 
diff --git a/python/pyarrow/tests/interchange/test_extra.py b/python/pyarrow/tests/interchange/test_extra.py
index 68368ebaf2d..e655c2ace2b 100644
--- a/python/pyarrow/tests/interchange/test_extra.py
+++ b/python/pyarrow/tests/interchange/test_extra.py
@@ -73,8 +73,8 @@ def test_offset_of_sliced_array():
     arr = pa.array([1, 2, 3, 4])
     arr_sliced = arr.slice(2, 2)
 
-    table = pa.table([arr], names = ["arr"])
-    table_sliced = pa.table([arr_sliced], names = ["arr_sliced"])
+    table = pa.table([arr], names=["arr"])
+    table_sliced = pa.table([arr_sliced], names=["arr_sliced"])
 
     df = from_dataframe(table)
     df_sliced = from_dataframe(table_sliced)

From 2632c558bf144c2244a9cd72415eb2a7375b6e4c Mon Sep 17 00:00:00 2001
From: Alenka Frim
Date: Thu, 24 Nov 2022 15:53:32 +0100
Subject: [PATCH 28/29] Add a skip for the test using from_dataframe(), which was added in pandas 1.5.0

---
 python/pyarrow/tests/interchange/test_extra.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/pyarrow/tests/interchange/test_extra.py b/python/pyarrow/tests/interchange/test_extra.py
index e655c2ace2b..4181b117be6 100644
--- a/python/pyarrow/tests/interchange/test_extra.py
+++ b/python/pyarrow/tests/interchange/test_extra.py
@@ -17,6 +17,7 @@
 
 from datetime import datetime as dt
 import pyarrow as pa
+from pyarrow.vendored.version import Version
 import pytest
 
 from pyarrow.interchange.column import (
@@ -70,6 +71,9 @@ def test_array_to_pyarrowcolumn(test_data, kind):
 
 
 @pytest.mark.pandas
 def test_offset_of_sliced_array():
+    if Version(pd.__version__) < Version("1.5.0"):
+        pytest.skip("__dataframe__ added
to pandas in 1.5.0") + arr = pa.array([1, 2, 3, 4]) arr_sliced = arr.slice(2, 2) From f177b15f9aa2f129175ec6b3a81bf814531bcda6 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Mon, 28 Nov 2022 08:37:29 +0100 Subject: [PATCH 29/29] Make changes to the from_dataframe.py skeleton --- python/pyarrow/interchange/from_dataframe.py | 95 ++++++++------------ 1 file changed, 37 insertions(+), 58 deletions(-) diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index 81c4dd115be..a7746371129 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -32,14 +32,21 @@ import pyarrow as pa -def from_dataframe(df, allow_copy=True) -> pa.Table: +# A typing protocol could be added later to let Mypy validate code using +# `from_dataframe` better. +DataFrameObject = Any +ColumnObject = Any +BufferObject = Any + + +def from_dataframe(df: DataFrameObject, allow_copy=True) -> pa.Table: """ Build a ``pa.Table`` from any DataFrame supporting the interchange protocol. Parameters ---------- - df : _PyArrowDataFrame + df : DataFrameObject Object supporting the interchange protocol, i.e. `__dataframe__` method. allow_copy : bool, default: True @@ -58,12 +65,12 @@ def from_dataframe(df, allow_copy=True) -> pa.Table: return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) -def _from_dataframe(df: _PyArrowDataFrame, allow_copy=True): +def _from_dataframe(df: DataFrameObject, allow_copy=True): """ Build a ``pa.Table`` from the DataFrame interchange object. Parameters ---------- - df : _PyArrowDataFrame + df : DataFrameObject Object supporting the interchange protocol, i.e. `__dataframe__` method. allow_copy : bool, default: True @@ -76,12 +83,12 @@ def _from_dataframe(df: _PyArrowDataFrame, allow_copy=True): pass -def protocol_df_chunk_to_pyarrow(df: _PyArrowDataFrame) -> pa.Table: +def protocol_df_chunk_to_pyarrow(df: DataFrameObject) -> pa.Table: """ Convert interchange protocol chunk to ``pd.DataFrame``. Parameters ---------- - df : _PyArrowDataFrame + df : DataFrameObject Returns ------- pa.Table @@ -118,49 +125,49 @@ def protocol_df_chunk_to_pyarrow(df: _PyArrowDataFrame) -> pa.Table: pass -def primitive_column_to_ndarray(col: _PyArrowColumn) -> tuple[np.ndarray, Any]: +def primitive_column_to_array(col: ColumnObject) -> tuple[pa.Array, Any]: """ - Convert a column holding one of the primitive dtypes to a NumPy array. + Convert a column holding one of the primitive dtypes to a PyArrow array. A primitive type is one of: int, uint, float, bool. Parameters ---------- - col : Column + col : ColumnObject Returns ------- tuple - Tuple of np.ndarray holding the data and the memory owner object + Tuple of pa.Array holding the data and the memory owner object that keeps the memory alive. """ pass def categorical_column_to_dictionary( - col: _PyArrowColumn -) -> tuple[pa.ChunkedArray, Any]: + col: ColumnObject +) -> tuple[pa.Array, Any]: """ Convert a column holding categorical data to a pandas Series. Parameters ---------- - col : Column + col : ColumnObject Returns ------- tuple - Tuple of pa.ChunkedArray holding the data and the memory owner object + Tuple of pa.Array holding the data and the memory owner object that keeps the memory alive. """ pass -def string_column_to_ndarray(col: _PyArrowColumn) -> tuple[np.ndarray, Any]: +def string_column_to_array(col: ColumnObject) -> tuple[pa.Array, Any]: """ Convert a column holding string data to a NumPy array. 
Parameters ---------- - col : Column + col : ColumnObject Returns ------- tuple - Tuple of np.ndarray holding the data and the memory owner object + Tuple of pa.Array holding the data and the memory owner object that keeps the memory alive. """ pass @@ -171,33 +178,33 @@ def parse_datetime_format_str(format_str, data): pass -def datetime_column_to_ndarray(col: _PyArrowColumn) -> tuple[np.ndarray, Any]: +def datetime_column_to_array(col: ColumnObject) -> tuple[pa.Array, Any]: """ Convert a column holding DateTime data to a NumPy array. Parameters ---------- - col : Column + col : ColumnObject Returns ------- tuple - Tuple of np.ndarray holding the data and the memory owner object + Tuple of pa.Array holding the data and the memory owner object that keeps the memory alive. """ pass -def buffer_to_ndarray( - buffer: _PyArrowBuffer, +def buffer_to_array( + buffer: BufferObject, dtype: tuple[DtypeKind, int, str, str], offset: int = 0, length: int | None = None, -) -> np.ndarray: +) -> pa.Array: """ Build a NumPy array from the passed buffer. Parameters ---------- - buffer : Buffer - Buffer to build a NumPy array from. + buffer : BufferObject + Buffer to build a PyArrow array from. dtype : tuple Data type of the buffer conforming protocol dtypes format. offset : int, default: 0 @@ -207,7 +214,8 @@ def buffer_to_ndarray( from the buffer. Has no effect otherwise. Returns ------- - np.ndarray + pa.Array + Notes ----- The returned array doesn't own the memory. The caller of this function @@ -217,9 +225,9 @@ def buffer_to_ndarray( pass -def bitmask_to_bool_ndarray( +def bitmask_to_bool_array( bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0 -) -> np.ndarray: +) -> pa.Array: """ Convert bit-mask to a boolean NumPy array. Parameters @@ -232,35 +240,6 @@ def bitmask_to_bool_ndarray( Number of elements to offset from the start of the first byte. Returns ------- - np.ndarray[bool] - """ - pass - - -def set_nulls( - data: np.ndarray | pa.Array | pa.ChunkedArray, - col: _PyArrowColumn, - validity: tuple[_PyArrowBuffer, tuple[DtypeKind, int, str, str]] | None, - allow_modify_inplace: bool = True, -): - """ - Set null values for the data according to the column null kind. - Parameters - ---------- - data : np.ndarray, pa.Array or pa.ChunkedArray, - Data to set nulls in. - col : _PyArrowColumn - Column object that describes the `data`. - validity : tuple(_PyArrowBuffer, dtype) or None - The return value of ``col.buffers()``. We do not access the - ``col.buffers()`` here to not take the ownership of the memory - of buffer objects. - allow_modify_inplace : bool, default: True - Whether to modify the `data` inplace when zero-copy is possible - (True) or always modify a copy of the `data` (False). - Returns - ------- - np.ndarray, pa.Array or pa.ChunkedArray, - Data with the nulls being set. + pa.Array[bool] """ pass
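The conversion helpers in from_dataframe.py are left as stubs by the last patch. As a rough sketch of the direction buffer_to_array could take for primitive, fixed-width columns (assuming DtypeKind can be imported from pyarrow.interchange.dataframe_protocol, that the caller supplies the element count, and that the caller keeps the protocol buffer object alive), one possibility is:

# Sketch only, not part of the patches above; validity bitmaps and
# non-primitive kinds are deliberately left out.
import pyarrow as pa
from pyarrow.interchange.dataframe_protocol import DtypeKind  # assumed import path

_PROTOCOL_TO_ARROW = {
    (DtypeKind.INT, 32): pa.int32(),
    (DtypeKind.INT, 64): pa.int64(),
    (DtypeKind.UINT, 8): pa.uint8(),
    (DtypeKind.FLOAT, 32): pa.float32(),
    (DtypeKind.FLOAT, 64): pa.float64(),
}


def buffer_to_array_sketch(buffer, dtype, length, offset=0):
    """Wrap an interchange buffer as a pa.Array without copying the data."""
    kind, bit_width, _, _ = dtype
    arrow_type = _PROTOCOL_TO_ARROW[(kind, bit_width)]
    # The protocol buffer object owns the memory; passing it as ``base``
    # keeps it alive for as long as the resulting Arrow buffer is used.
    data = pa.foreign_buffer(buffer.ptr, buffer.bufsize, base=buffer)
    # No validity buffer is passed, so all values are treated as non-null.
    arr = pa.Array.from_buffers(arrow_type, length, [None, data], offset=offset)
    return arr, buffer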
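Once the stubs are filled in, the round trip that the tests build towards could be exercised end to end roughly as follows (illustrative only; the module path is the one introduced in this series):

import pyarrow as pa
from pyarrow.interchange.from_dataframe import from_dataframe

table = pa.table({"ints": [0, 1, 2], "floats": [0.5, 1.5, 2.5]})
# from_dataframe() calls table.__dataframe__() internally and rebuilds
# a pa.Table from the interchange-protocol columns and buffers.
result = from_dataframe(table)
assert result.equals(table)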