From 854e114f45ec52869cfec2b3bc14f53377b91a9a Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 25 Oct 2022 11:51:28 +0200 Subject: [PATCH 01/29] Initial sceleton for interchange package --- python/pyarrow/interchange/__init__.py | 16 + python/pyarrow/interchange/buffer.py | 72 +++ python/pyarrow/interchange/column.py | 193 +++++++ python/pyarrow/interchange/dataframe.py | 89 ++++ .../pyarrow/interchange/dataframe_protocol.py | 493 ++++++++++++++++++ python/pyarrow/interchange/from_dataframe.py | 253 +++++++++ 6 files changed, 1116 insertions(+) create mode 100644 python/pyarrow/interchange/__init__.py create mode 100644 python/pyarrow/interchange/buffer.py create mode 100644 python/pyarrow/interchange/column.py create mode 100644 python/pyarrow/interchange/dataframe.py create mode 100644 python/pyarrow/interchange/dataframe_protocol.py create mode 100644 python/pyarrow/interchange/from_dataframe.py diff --git a/python/pyarrow/interchange/__init__.py b/python/pyarrow/interchange/__init__.py new file mode 100644 index 00000000000..d216be4ddc9 --- /dev/null +++ b/python/pyarrow/interchange/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. \ No newline at end of file diff --git a/python/pyarrow/interchange/buffer.py b/python/pyarrow/interchange/buffer.py new file mode 100644 index 00000000000..06803aaf299 --- /dev/null +++ b/python/pyarrow/interchange/buffer.py @@ -0,0 +1,72 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from dataframe_protocol import ( + Buffer, + DlpackDeviceType, +) +import numpy as np + +class PyArrowBuffer(Buffer): + """ + Data in the buffer is guaranteed to be contiguous in memory. + """ + + def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: + """ + Handle only regular columns (= numpy arrays) for now. + """ + pass + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + pass + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. 
+ """ + pass + + def __dlpack__(self): + """ + Represent this structure as DLPack interface. + """ + pass + + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides. + """ + pass + + def __repr__(self) -> str: + return ( + "PyArrowBuffer(" + + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "device": self.__dlpack_device__()[0].name, + } + ) + + ")" + ) \ No newline at end of file diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py new file mode 100644 index 00000000000..eb63f014187 --- /dev/null +++ b/python/pyarrow/interchange/column.py @@ -0,0 +1,193 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any + +import pyarrow as pa + +from buffer import PyArrowBuffer +from dataframe_protocol import ( + Column, + ColumnBuffers, + DtypeKind, +) + +class PyArrowColumn(Column): + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + + def __init__(self, column: pa.Array, allow_copy: bool = True) -> None: + """ + Note: doesn't deal with extension arrays yet, just assume a regular + Series/ndarray for now. + """ + pass + + def size(self) -> int: + """ + Size of the column, in elements. + """ + pass + + @property + def offset(self) -> int: + """ + Offset of first element. Always zero. + """ + pass + + @property + def dtype(self) -> tuple[DtypeKind, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for bit + masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the future + we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. 
+ - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, decimal, + and nested (list, struct, map, union) dtypes. + """ + pass + + def _dtype_from_arrowdtype(self, dtype) -> tuple[DtypeKind, int, str, str]: + """ + See `self.dtype` for details. + """ + # Note: 'c' (complex) not handled yet (not in array spec v1). + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled + # datetime and timedelta both map to datetime (is timedelta handled?) + + pass + + @property + def describe_categorical(self): + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate non-categorical Column encoding for categorical values. + Raises TypeError if the dtype is not categorical + Content of returned dict: + - "is_ordered" : bool, whether the ordering of dictionary indices is + semantically meaningful. + - "is_dictionary" : bool, whether a dictionary-style mapping of + categorical values to other objects exists + - "categories" : Column representing the (implicit) mapping of indices to + category values (e.g. an array of cat1, cat2, ...). + None if not a dictionary-style categorical. + """ + pass + + @property + def describe_null(self): + pass + + @property + def null_count(self) -> int: + """ + Number of null elements. Should always be known. + """ + pass + + @property + def metadata(self) -> dict[str, pd.Index]: + """ + Store specific metadata of the column. + """ + pass + + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + pass + + def get_chunks(self, n_chunks: int | None = None): + """ + Return an iterator yielding the chunks. + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + pass + + def get_buffers(self) -> ColumnBuffers: + """ + Return a dictionary containing the underlying buffers. + The returned dictionary has the following contents: + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + pass + + def _get_data_buffer( + self, + ) -> tuple[PyArrowBuffer, Any]: # Any is for self.dtype tuple + """ + Return the buffer containing the data and the buffer's associated dtype. + """ + pass + + def _get_validity_buffer(self) -> tuple[PyArrowBuffer, Any]: + """ + Return the buffer containing the mask values indicating missing data and + the buffer's associated dtype. + Raises NoBufferPresent if null representation is not a bit or byte mask. + """ + pass + + def _get_offsets_buffer(self) -> tuple[PyArrowBuffer, Any]: + """ + Return the buffer containing the offset values for variable-size binary + data (e.g., variable-length strings) and the buffer's associated dtype. 
+ Raises NoBufferPresent if the data buffer does not have an associated + offsets buffer. + """ + pass diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py new file mode 100644 index 00000000000..c14ae49758a --- /dev/null +++ b/python/pyarrow/interchange/dataframe.py @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pyarrow as pa + +from column import PandasColumn +from dataframe_protocol import DataFrame as DataFrameXchg + +class PyArrowTableXchg(DataFrameXchg): + """ + A data frame class, with only the methods required by the interchange + protocol defined. + Instances of this (private) class are returned from + ``pd.DataFrame.__dataframe__`` as objects with the methods and + attributes defined on this class. + """ + + def __init__( + self, df: pa.Table, nan_as_null: bool = False, allow_copy: bool = True + ) -> None: + """ + Constructor - an instance of this (private) class is returned from + `pd.DataFrame.__dataframe__`. + """ + self._df = df + # ``nan_as_null`` is a keyword intended for the consumer to tell the + # producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + # This currently has no effect; once support for nullable extension + # dtypes is added, this value should be propagated to columns. + self._nan_as_null = nan_as_null + self._allow_copy = allow_copy + + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> PyArrowTableXchg: + return PyArrowTableXchg(self._df, nan_as_null, allow_copy) + + @property + def metadata(self) -> dict[str, Index]: + # `index` isn't a regular column, and the protocol doesn't support row + # labels - so we export it as Pandas-specific metadata here. + pass + + def num_columns(self) -> int: + pass + + def num_rows(self) -> int: + pass + + def num_chunks(self) -> int: + pass + + def column_names(self) -> Index: + pass + + def get_column(self, i: int) -> PyArrowColumn: + pass + + def get_column_by_name(self, name: str) -> PyArrowColumn: + pass + + def get_columns(self) -> list[PyArrowColumn]: + pass + + def select_columns(self, indices) -> PyArrowTableFrameXchg: + pass + + def select_columns_by_name(self, names) -> PyArrowTableFrameXchg: + pass + + def get_chunks(self, n_chunks=None): + """ + Return an iterator yielding the chunks. + """ + pass \ No newline at end of file diff --git a/python/pyarrow/interchange/dataframe_protocol.py b/python/pyarrow/interchange/dataframe_protocol.py new file mode 100644 index 00000000000..f12ae4e7ae8 --- /dev/null +++ b/python/pyarrow/interchange/dataframe_protocol.py @@ -0,0 +1,493 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Copy of the spec from https://github.com/data-apis/dataframe-api +""" + +from abc import ( + ABC, + abstractmethod, +) +import enum +from typing import ( + Any, + Dict, + Iterable, + Optional, + Sequence, + Tuple, + TypedDict, +) + + +class DlpackDeviceType(enum.IntEnum): + """Integer enum for device type codes matching DLPack.""" + + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype + + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: Tuple["Buffer", Dtype] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + validity: Optional[Tuple["Buffer", Dtype]] + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: Optional[Tuple["Buffer", Dtype]] + + +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. + categories: Optional[Column] + + +class Buffer(ABC): + """ + Data in the buffer is guaranteed to be contiguous in memory. 
+ Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. + This distinction is useful to support both data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. + """ + + @property + @abstractmethod + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + pass + + @property + @abstractmethod + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + pass + + @abstractmethod + def __dlpack__(self): + """ + Produce DLPack capsule (see array API standard). + Raises: + - TypeError : if the buffer contains unsupported dtypes. + - NotImplementedError : if DLPack support is not implemented + Useful to have to connect to array libraries. Support optional because + it's not completely trivial to implement for a Python-only library. + """ + raise NotImplementedError("__dlpack__") + + @abstractmethod + def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]: + """ + Device type and device ID for where the data in the buffer resides. + Uses device type codes matching DLPack. + Note: must be implemented even if ``__dlpack__`` is not. + """ + pass + + +class Column(ABC): + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. 
+ """ + + @abstractmethod + def size(self) -> int: + """ + Size of the column, in elements. + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. + """ + pass + + @property + @abstractmethod + def offset(self) -> int: + """ + Offset of first element. + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + pass + + @property + @abstractmethod + def dtype(self) -> Dtype: + """ + Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for bit + masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the future + we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, decimal, + and nested (list, struct, map, union) dtypes. + """ + pass + + @property + @abstractmethod + def describe_categorical(self) -> CategoricalDescription: + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate non-categorical Column encoding categorical values. + Raises TypeError if the dtype is not categorical + Returns the dictionary with description on how to interpret the data buffer: + - "is_ordered" : bool, whether the ordering of dictionary indices is + semantically meaningful. + - "is_dictionary" : bool, whether a mapping of + categorical values to other objects exists + - "categories" : Column representing the (implicit) mapping of indices to + category values (e.g. an array of cat1, cat2, ...). + None if not a dictionary-style categorical. + TBD: are there any other in-memory representations that are needed? + """ + pass + + @property + @abstractmethod + def describe_null(self) -> Tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. None + otherwise. + """ + pass + + @property + @abstractmethod + def null_count(self) -> Optional[int]: + """ + Number of null elements, if known. + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. + """ + pass + + @property + @abstractmethod + def metadata(self) -> Dict[str, Any]: + """ + The metadata for the column. 
See `DataFrame.metadata` for more details. + """ + pass + + @abstractmethod + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + pass + + @abstractmethod + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: + """ + Return an iterator yielding the chunks. + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + pass + + @abstractmethod + def get_buffers(self) -> ColumnBuffers: + """ + Return a dictionary containing the underlying buffers. + The returned dictionary has the following contents: + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + pass + + +# def get_children(self) -> Iterable[Column]: +# """ +# Children columns underneath the column, each object in this iterator +# must adhere to the column specification. +# """ +# pass + + +class DataFrame(ABC): + """ + A data frame class, with only the methods required by the interchange + protocol defined. + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. + Columns may be accessed by name or by position. + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. + """ + + version = 0 # version of the protocol + + @abstractmethod + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> "DataFrame": + """ + Construct a new exchange object, potentially changing the parameters. + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN``. + It is intended for cases where the consumer does not support the bit + mask or byte mask that is the producer's native representation. + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this protocol + specifies contiguous buffers. + """ + pass + + @property + @abstractmethod + def metadata(self) -> Dict[str, Any]: + """ + The metadata for the data frame, as a dictionary with string keys. The + contents of `metadata` may be anything, they are meant for a library + to store information that it needs to, e.g., roundtrip losslessly or + for two implementations to share data that is not (yet) part of the + interchange protocol specification. For avoiding collisions with other + entries, please add name the keys with the name of the library + followed by a period and the desired name, e.g, ``pandas.indexcol``. + """ + pass + + @abstractmethod + def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. 
+ """ + pass + + @abstractmethod + def num_rows(self) -> Optional[int]: + # TODO: not happy with Optional, but need to flag it may be expensive + # why include it if it may be None - what do we expect consumers + # to do here? + """ + Return the number of rows in the DataFrame, if available. + """ + pass + + @abstractmethod + def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of. + """ + pass + + @abstractmethod + def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + """ + pass + + @abstractmethod + def get_column(self, i: int) -> Column: + """ + Return the column at the indicated position. + """ + pass + + @abstractmethod + def get_column_by_name(self, name: str) -> Column: + """ + Return the column whose name is the indicated name. + """ + pass + + @abstractmethod + def get_columns(self) -> Iterable[Column]: + """ + Return an iterator yielding the columns. + """ + pass + + @abstractmethod + def select_columns(self, indices: Sequence[int]) -> "DataFrame": + """ + Create a new DataFrame by selecting a subset of columns by index. + """ + pass + + @abstractmethod + def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": + """ + Create a new DataFrame by selecting a subset of columns by name. + """ + pass + + @abstractmethod + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["DataFrame"]: + """ + Return an iterator yielding the chunks. + By default (None), yields the chunks that the data is stored as by the + producer. If given, ``n_chunks`` must be a multiple of + ``self.num_chunks()``, meaning the producer must subdivide each chunk + before yielding it. + Note that the producer must ensure that all columns are chunked the + same way. + """ + pass diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py new file mode 100644 index 00000000000..a0d5179a053 --- /dev/null +++ b/python/pyarrow/interchange/from_dataframe.py @@ -0,0 +1,253 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from column import PyArrowColumn +from dataframe_protocol import ( + Buffer, + Column, + ColumnNullType, + DataFrame as DataFrameXchg, + DtypeKind, +) +import pyarrow as pa + + +def from_dataframe(df, allow_copy=True) -> pd.DataFrame: + """ + Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol. + Parameters + ---------- + df : DataFrameXchg + Object supporting the interchange protocol, i.e. `__dataframe__` method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). 
+ Returns + ------- + pd.DataFrame + """ + if isinstance(df, pa.Table): + return df + + if not hasattr(df, "__dataframe__"): + raise ValueError("`df` does not support __dataframe__") + + return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) + + +def _from_dataframe(df: DataFrameXchg, allow_copy=True): + """ + Build a ``pd.DataFrame`` from the DataFrame interchange object. + Parameters + ---------- + df : DataFrameXchg + Object supporting the interchange protocol, i.e. `__dataframe__` method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + Returns + ------- + pd.DataFrame + """ + pass + + +def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: + """ + Convert interchange protocol chunk to ``pd.DataFrame``. + Parameters + ---------- + df : DataFrameXchg + Returns + ------- + pd.DataFrame + """ + # We need a dict of columns here, with each column being a NumPy array (at + # least for now, deal with non-NumPy dtypes later). + columns: dict[str, Any] = {} + buffers = [] # hold on to buffers, keeps memory alive + for name in df.column_names(): + if not isinstance(name, str): + raise ValueError(f"Column {name} is not a string") + if name in columns: + raise ValueError(f"Column {name} is not unique") + col = df.get_column_by_name(name) + dtype = col.dtype[0] + if dtype in ( + DtypeKind.INT, + DtypeKind.UINT, + DtypeKind.FLOAT, + DtypeKind.BOOL, + ): + columns[name], buf = primitive_column_to_ndarray(col) + elif dtype == DtypeKind.CATEGORICAL: + columns[name], buf = categorical_column_to_series(col) + elif dtype == DtypeKind.STRING: + columns[name], buf = string_column_to_ndarray(col) + elif dtype == DtypeKind.DATETIME: + columns[name], buf = datetime_column_to_ndarray(col) + else: + raise NotImplementedError(f"Data type {dtype} not handled yet") + + buffers.append(buf) + + pass + + +def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: + """ + Convert a column holding one of the primitive dtypes to a NumPy array. + A primitive type is one of: int, uint, float, bool. + Parameters + ---------- + col : Column + Returns + ------- + tuple + Tuple of np.ndarray holding the data and the memory owner object + that keeps the memory alive. + """ + pass + + +def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: + """ + Convert a column holding categorical data to a pandas Series. + Parameters + ---------- + col : Column + Returns + ------- + tuple + Tuple of pd.Series holding the data and the memory owner object + that keeps the memory alive. + """ + pass + + +def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: + """ + Convert a column holding string data to a NumPy array. + Parameters + ---------- + col : Column + Returns + ------- + tuple + Tuple of np.ndarray holding the data and the memory owner object + that keeps the memory alive. + """ + pass + + +def parse_datetime_format_str(format_str, data): + """Parse datetime `format_str` to interpret the `data`.""" + pass + + +def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: + """ + Convert a column holding DateTime data to a NumPy array. + Parameters + ---------- + col : Column + Returns + ------- + tuple + Tuple of np.ndarray holding the data and the memory owner object + that keeps the memory alive. 
+ """ + pass + + +def buffer_to_ndarray( + buffer: Buffer, + dtype: tuple[DtypeKind, int, str, str], + offset: int = 0, + length: int | None = None, +) -> np.ndarray: + """ + Build a NumPy array from the passed buffer. + Parameters + ---------- + buffer : Buffer + Buffer to build a NumPy array from. + dtype : tuple + Data type of the buffer conforming protocol dtypes format. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + length : int, optional + If the buffer is a bit-mask, specifies a number of bits to read + from the buffer. Has no effect otherwise. + Returns + ------- + np.ndarray + Notes + ----- + The returned array doesn't own the memory. The caller of this function is + responsible for keeping the memory owner object alive as long as + the returned NumPy array is being used. + """ + pass + + +def bitmask_to_bool_ndarray( + bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0 +) -> np.ndarray: + """ + Convert bit-mask to a boolean NumPy array. + Parameters + ---------- + bitmask : np.ndarray[uint8] + NumPy array of uint8 dtype representing the bitmask. + mask_length : int + Number of elements in the mask to interpret. + first_byte_offset : int, default: 0 + Number of elements to offset from the start of the first byte. + Returns + ------- + np.ndarray[bool] + """ + pass + + +def set_nulls( + data: np.ndarray | pd.Series, + col: Column, + validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None, + allow_modify_inplace: bool = True, +): + """ + Set null values for the data according to the column null kind. + Parameters + ---------- + data : np.ndarray or pd.Series + Data to set nulls in. + col : Column + Column object that describes the `data`. + validity : tuple(Buffer, dtype) or None + The return value of ``col.buffers()``. We do not access the ``col.buffers()`` + here to not take the ownership of the memory of buffer objects. + allow_modify_inplace : bool, default: True + Whether to modify the `data` inplace when zero-copy is possible (True) or always + modify a copy of the `data` (False). + Returns + ------- + np.ndarray or pd.Series + Data with the nulls being set. + """ + pass From 010d9a8eab42d1709880b55f5a2c38bd77f85bbb Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 26 Oct 2022 11:44:53 +0200 Subject: [PATCH 02/29] Add a dataframe (PyArrowTableXchg) class methods --- python/pyarrow/interchange/dataframe.py | 64 +++++++++++++++++-------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index c14ae49758a..fef614e27ec 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -15,11 +15,24 @@ # specific language governing permissions and limitations # under the License. +import chunk +from typing import ( + Any, + Dict, + Iterable, + Optional, + Sequence, + Tuple, + TypedDict, +) + import pyarrow as pa +import warnings from column import PandasColumn from dataframe_protocol import DataFrame as DataFrameXchg + class PyArrowTableXchg(DataFrameXchg): """ A data frame class, with only the methods required by the interchange @@ -50,40 +63,53 @@ def __dataframe__( return PyArrowTableXchg(self._df, nan_as_null, allow_copy) @property - def metadata(self) -> dict[str, Index]: - # `index` isn't a regular column, and the protocol doesn't support row - # labels - so we export it as Pandas-specific metadata here. 
- pass + def metadata(self) -> dict[str, Any]: + # The metadata for the data frame, as a dictionary with string keys. + # Add schema metadata here (pandas metadata, ot custom metadata) + schema_metadata = {k.decode('utf8'): v.decode('utf8') for k, v in self.schema.metadata.items()} + return schema_metadata def num_columns(self) -> int: - pass + return self.num_columns def num_rows(self) -> int: - pass + return self.num_rows def num_chunks(self) -> int: - pass + return self.column(0).num_chunks - def column_names(self) -> Index: - pass + def column_names(self) -> Iterable[str]: + return self.column_names def get_column(self, i: int) -> PyArrowColumn: - pass + return self.column(i) def get_column_by_name(self, name: str) -> PyArrowColumn: - pass + return self.column(name) - def get_columns(self) -> list[PyArrowColumn]: - pass + def get_columns(self) -> Iterable[PyArrowColumn]: + return self.columns - def select_columns(self, indices) -> PyArrowTableFrameXchg: - pass + def select_columns(self, indices: Sequence[int]) -> PyArrowTableFrameXchg: + return self.select(indices) - def select_columns_by_name(self, names) -> PyArrowTableFrameXchg: - pass + def select_columns_by_name(self, names: Sequence[str]) -> PyArrowTableFrameXchg: + return self.select(names) - def get_chunks(self, n_chunks=None): + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[PyArrowTableFrameXchg]: """ Return an iterator yielding the chunks. """ - pass \ No newline at end of file + if n_chunks: + if n_chunks%self.num_chunks == 0: + chunk_size = self.num_rows//n_chunks + if self.num_rows%n_chunks != 0: + warnings.warn("Converting dataframe into smaller chunks") + batches = self.to_batches(max_chunksize = chunk_size) + else: + warnings.warn("``n_chunks`` must be a multiple of ``self.num_chunks()``") + else: + batches = self.to_batches() + + iterator_tables = [pa.Table.from_batches([batch]) for batch in batches] + return iterator_tables From c0af309ae109c4050a7c6beffdd725ffd3035091 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 26 Oct 2022 15:35:38 +0200 Subject: [PATCH 03/29] Add a subpackage for testing interchange protocol, add a test for Table.__dataframe__ and do some minor corrections --- python/pyarrow/interchange/__init__.py | 2 +- python/pyarrow/interchange/buffer.py | 23 +++---- python/pyarrow/interchange/column.py | 63 ++++++++++--------- python/pyarrow/interchange/dataframe.py | 60 ++++++++++-------- .../pyarrow/interchange/dataframe_protocol.py | 12 ++-- python/pyarrow/interchange/from_dataframe.py | 31 +++++---- python/pyarrow/table.pxi | 30 +++++++++ python/pyarrow/tests/interchange/__init__.py | 16 +++++ .../interchange/test_interchange_spec.py | 34 ++++++++++ 9 files changed, 185 insertions(+), 86 deletions(-) create mode 100644 python/pyarrow/tests/interchange/__init__.py create mode 100644 python/pyarrow/tests/interchange/test_interchange_spec.py diff --git a/python/pyarrow/interchange/__init__.py b/python/pyarrow/interchange/__init__.py index d216be4ddc9..13a83393a91 100644 --- a/python/pyarrow/interchange/__init__.py +++ b/python/pyarrow/interchange/__init__.py @@ -13,4 +13,4 @@ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations -# under the License. \ No newline at end of file +# under the License. 
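For orientation, the following is a consumer-side sketch (not part of the diff) of how the ``Table.__dataframe__`` entry point added by this patch series might be exercised once the series is applied; it mirrors the spec test introduced later in this patch, and the table contents are purely illustrative.

import pyarrow as pa

# Build a small table and hand it to the interchange protocol
# (assumes this patch series is applied, so Table has __dataframe__).
table = pa.table({"n_legs": [2, 2, 4, 4, 5, 100],
                  "animals": ["Flamingo", "Parrot", "Cow",
                              "Horse", "Brittle stars", "Centipede"]})
xchg = table.__dataframe__()            # interchange object (TableXchg)

print(xchg.num_columns())               # 2
print(xchg.num_rows())                  # 6
print(list(xchg.column_names()))        # ['n_legs', 'animals']

# Column selection returns another interchange object.
animals = xchg.select_columns_by_name(["animals"])
print(list(animals.column_names()))     # ['animals']
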
diff --git a/python/pyarrow/interchange/buffer.py b/python/pyarrow/interchange/buffer.py index 06803aaf299..07b21190039 100644 --- a/python/pyarrow/interchange/buffer.py +++ b/python/pyarrow/interchange/buffer.py @@ -15,12 +15,13 @@ # specific language governing permissions and limitations # under the License. -from dataframe_protocol import ( +from pyarrow.interchange.dataframe_protocol import ( Buffer, DlpackDeviceType, ) import numpy as np + class PyArrowBuffer(Buffer): """ Data in the buffer is guaranteed to be contiguous in memory. @@ -52,21 +53,21 @@ def __dlpack__(self): """ pass - def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: - """ - Device type and device ID for where the data in the buffer resides. - """ - pass + # def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + # """ + # Device type and device ID for where the data in the buffer resides. + # """ + # pass def __repr__(self) -> str: return ( - "PyArrowBuffer(" - + str( + "PyArrowBuffer(" + + str( { "bufsize": self.bufsize, "ptr": self.ptr, "device": self.__dlpack_device__()[0].name, } - ) - + ")" - ) \ No newline at end of file + ) + + ")" + ) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index eb63f014187..d19a7f7f725 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -15,17 +15,18 @@ # specific language governing permissions and limitations # under the License. -from typing import Any +from typing import (Dict, Any) import pyarrow as pa -from buffer import PyArrowBuffer -from dataframe_protocol import ( +from pyarrow.interchange.buffer import PyArrowBuffer +from pyarrow.interchange.dataframe_protocol import ( Column, ColumnBuffers, DtypeKind, ) + class PyArrowColumn(Column): """ A column object, with only the methods and properties required by the @@ -61,19 +62,20 @@ def offset(self) -> int: @property def dtype(self) -> tuple[DtypeKind, int, str, str]: """ - Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. Bit-width : the number of bits as an integer - Format string : data type description format string in Apache Arrow C - Data Interface format. + Format string : data type description format string in Apache Arrow + C Data Interface format. Endianness : current only native endianness (``=``) is supported Notes: - - Kind specifiers are aligned with DLPack where possible (hence the - jump to 20, leave enough room for future extension) - - Masks must be specified as boolean with either bit width 1 (for bit - masks) or 8 (for byte masks). + - Kind specifiers are aligned with DLPack where possible (hence + the jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 + (for bit masks) or 8 (for byte masks). - Dtype width in bits was preferred over bytes - - Endianness isn't too useful, but included now in case in the future - we need to support non-native endianness + - Endianness isn't too useful, but included now in case in the + future we need to support non-native endianness - Went with Apache Arrow format strings over NumPy format strings because they're more complete from a dataframe perspective - Format strings are mostly useful for datetime specification, and @@ -82,8 +84,8 @@ def dtype(self) -> tuple[DtypeKind, int, str, str]: categorical in the data buffer. In case of a separate encoding of the categorical (e.g. 
an integer to string mapping), this can be derived from ``self.describe_categorical``. - - Data types not included: complex, Arrow-style null, binary, decimal, - and nested (list, struct, map, union) dtypes. + - Data types not included: complex, Arrow-style null, binary, + decimal, and nested (list, struct, map, union) dtypes. """ pass @@ -92,8 +94,9 @@ def _dtype_from_arrowdtype(self, dtype) -> tuple[DtypeKind, int, str, str]: See `self.dtype` for details. """ # Note: 'c' (complex) not handled yet (not in array spec v1). - # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled - # datetime and timedelta both map to datetime (is timedelta handled?) + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not + # handled datetime and timedelta both map to datetime + # (is timedelta handled?) pass @@ -102,16 +105,18 @@ def describe_categorical(self): """ If the dtype is categorical, there are two options: - There are only values in the data buffer. - - There is a separate non-categorical Column encoding for categorical values. + - There is a separate non-categorical Column encoding for categorical + values. Raises TypeError if the dtype is not categorical Content of returned dict: - - "is_ordered" : bool, whether the ordering of dictionary indices is - semantically meaningful. + - "is_ordered" : bool, whether the ordering of dictionary indices + is semantically meaningful. - "is_dictionary" : bool, whether a dictionary-style mapping of categorical values to other objects exists - - "categories" : Column representing the (implicit) mapping of indices to - category values (e.g. an array of cat1, cat2, ...). - None if not a dictionary-style categorical. + - "categories" : Column representing the (implicit) mapping of + indices to category values (e.g. an array of + cat1, cat2, ...). None if not a dictionary-style + categorical. """ pass @@ -127,7 +132,7 @@ def null_count(self) -> int: pass @property - def metadata(self) -> dict[str, pd.Index]: + def metadata(self) -> Dict[str, Any]: """ Store specific metadata of the column. """ @@ -139,12 +144,12 @@ def num_chunks(self) -> int: """ pass - def get_chunks(self, n_chunks: int | None = None): - """ - Return an iterator yielding the chunks. - See `DataFrame.get_chunks` for details on ``n_chunks``. - """ - pass + # def get_chunks(self, n_chunks: int | None = None): + # """ + # Return an iterator yielding the chunks. + # See `DataFrame.get_chunks` for details on ``n_chunks``. + # """ + # pass def get_buffers(self) -> ColumnBuffers: """ diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index fef614e27ec..a4dafceae1f 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -import chunk +from __future__ import annotations from typing import ( Any, Dict, @@ -29,11 +29,11 @@ import pyarrow as pa import warnings -from column import PandasColumn -from dataframe_protocol import DataFrame as DataFrameXchg +from pyarrow.interchange.column import PyArrowColumn +from pyarrow.interchange.dataframe_protocol import DataFrame as DataFrameXchg -class PyArrowTableXchg(DataFrameXchg): +class TableXchg(DataFrameXchg): """ A data frame class, with only the methods required by the interchange protocol defined. @@ -47,7 +47,7 @@ def __init__( ) -> None: """ Constructor - an instance of this (private) class is returned from - `pd.DataFrame.__dataframe__`. 
+ `pa.Table.__dataframe__`. """ self._df = df # ``nan_as_null`` is a keyword intended for the consumer to tell the @@ -59,57 +59,63 @@ def __init__( def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True - ) -> PyArrowTableXchg: - return PyArrowTableXchg(self._df, nan_as_null, allow_copy) + ) -> TableXchg: + return TableXchg(self._df, nan_as_null, allow_copy) @property def metadata(self) -> dict[str, Any]: # The metadata for the data frame, as a dictionary with string keys. # Add schema metadata here (pandas metadata, ot custom metadata) - schema_metadata = {k.decode('utf8'): v.decode('utf8') for k, v in self.schema.metadata.items()} + schema_metadata = {k.decode('utf8'): v.decode('utf8') + for k, v in self._df.schema.metadata.items()} return schema_metadata def num_columns(self) -> int: - return self.num_columns + return self._df.num_columns def num_rows(self) -> int: - return self.num_rows + return self._df.num_rows def num_chunks(self) -> int: - return self.column(0).num_chunks + return self._df.column(0).num_chunks def column_names(self) -> Iterable[str]: - return self.column_names + return self._df.column_names def get_column(self, i: int) -> PyArrowColumn: - return self.column(i) + return self._df.column(i) def get_column_by_name(self, name: str) -> PyArrowColumn: - return self.column(name) + return self._df.column(name) def get_columns(self) -> Iterable[PyArrowColumn]: - return self.columns + return self._df.columns - def select_columns(self, indices: Sequence[int]) -> PyArrowTableFrameXchg: - return self.select(indices) + def select_columns(self, indices: Sequence[int]) -> TableXchg: + return TableXchg( + self._df.select(indices), self._nan_as_null, self._allow_copy + ) - def select_columns_by_name(self, names: Sequence[str]) -> PyArrowTableFrameXchg: - return self.select(names) + def select_columns_by_name(self, names: Sequence[str]) -> TableXchg: + return TableXchg( + self._df.select(names), self._nan_as_null, self._allow_copy + ) - def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[PyArrowTableFrameXchg]: + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[TableXchg]: """ Return an iterator yielding the chunks. 
""" if n_chunks: - if n_chunks%self.num_chunks == 0: - chunk_size = self.num_rows//n_chunks - if self.num_rows%n_chunks != 0: + if n_chunks % self._df.num_chunks == 0: + chunk_size = self._df.num_rows // n_chunks + if self.num_rows %n_chunks != 0: warnings.warn("Converting dataframe into smaller chunks") - batches = self.to_batches(max_chunksize = chunk_size) + batches = self._df.to_batches(max_chunksize=chunk_size) else: - warnings.warn("``n_chunks`` must be a multiple of ``self.num_chunks()``") + warnings.warn( + "``n_chunks`` must be a multiple of ``self.num_chunks()``") else: - batches = self.to_batches() - + batches = self._df.to_batches() + iterator_tables = [pa.Table.from_batches([batch]) for batch in batches] return iterator_tables diff --git a/python/pyarrow/interchange/dataframe_protocol.py b/python/pyarrow/interchange/dataframe_protocol.py index f12ae4e7ae8..de30ede8d63 100644 --- a/python/pyarrow/interchange/dataframe_protocol.py +++ b/python/pyarrow/interchange/dataframe_protocol.py @@ -125,11 +125,12 @@ class ColumnBuffers(TypedDict): class CategoricalDescription(TypedDict): # whether the ordering of dictionary indices is semantically meaningful is_ordered: bool - # whether a dictionary-style mapping of categorical values to other objects exists + # whether a dictionary-style mapping of categorical values to other objects + # exists is_dictionary: bool # Python-level only (e.g. ``{int: str}``). # None if not a dictionary-style categorical. - categories: Optional[Column] + # categories: Optional[Column] class Buffer(ABC): @@ -246,7 +247,8 @@ def offset(self) -> int: @abstractmethod def dtype(self) -> Dtype: """ - Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. Bit-width : the number of bits as an integer Format string : data type description format string in Apache Arrow C Data Interface format. @@ -285,8 +287,8 @@ def describe_categorical(self) -> CategoricalDescription: semantically meaningful. - "is_dictionary" : bool, whether a mapping of categorical values to other objects exists - - "categories" : Column representing the (implicit) mapping of indices to - category values (e.g. an array of cat1, cat2, ...). + - "categories" : Column representing the (implicit) mapping of indices + to category values (e.g. an array of cat1, cat2, ...). None if not a dictionary-style categorical. TBD: are there any other in-memory representations that are needed? """ diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index a0d5179a053..884c2c2c3f8 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. -from column import PyArrowColumn -from dataframe_protocol import ( +from pyarrow.interchange.column import PyArrowColumn +from pyarrow.interchange.dataframe_protocol import ( Buffer, Column, ColumnNullType, @@ -28,11 +28,14 @@ def from_dataframe(df, allow_copy=True) -> pd.DataFrame: """ - Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol. + Build a ``pd.DataFrame`` from any DataFrame supporting the interchange + protocol. + Parameters ---------- df : DataFrameXchg - Object supporting the interchange protocol, i.e. `__dataframe__` method. + Object supporting the interchange protocol, i.e. `__dataframe__` + method. 
allow_copy : bool, default: True Whether to allow copying the memory to perform the conversion (if false then zero-copy approach is requested). @@ -55,7 +58,8 @@ def _from_dataframe(df: DataFrameXchg, allow_copy=True): Parameters ---------- df : DataFrameXchg - Object supporting the interchange protocol, i.e. `__dataframe__` method. + Object supporting the interchange protocol, i.e. `__dataframe__` + method. allow_copy : bool, default: True Whether to allow copying the memory to perform the conversion (if false then zero-copy approach is requested). @@ -76,8 +80,8 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: ------- pd.DataFrame """ - # We need a dict of columns here, with each column being a NumPy array (at - # least for now, deal with non-NumPy dtypes later). + # We need a dict of columns here, with each column being a NumPy array + # (at least for now, deal with non-NumPy dtypes later). columns: dict[str, Any] = {} buffers = [] # hold on to buffers, keeps memory alive for name in df.column_names(): @@ -198,8 +202,8 @@ def buffer_to_ndarray( np.ndarray Notes ----- - The returned array doesn't own the memory. The caller of this function is - responsible for keeping the memory owner object alive as long as + The returned array doesn't own the memory. The caller of this function + is responsible for keeping the memory owner object alive as long as the returned NumPy array is being used. """ pass @@ -240,11 +244,12 @@ def set_nulls( col : Column Column object that describes the `data`. validity : tuple(Buffer, dtype) or None - The return value of ``col.buffers()``. We do not access the ``col.buffers()`` - here to not take the ownership of the memory of buffer objects. + The return value of ``col.buffers()``. We do not access the + ``col.buffers()`` here to not take the ownership of the memory + of buffer objects. allow_modify_inplace : bool, default: True - Whether to modify the `data` inplace when zero-copy is possible (True) or always - modify a copy of the `data` (False). + Whether to modify the `data` inplace when zero-copy is possible + (True) or always modify a copy of the `data` (False). Returns ------- np.ndarray or pd.Series diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 5c58ae61f19..25f498cb0dd 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -2809,6 +2809,36 @@ cdef class Table(_PandasConvertible): return self.column(key) + # ---------------------------------------------------------------------- + def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): + """ + Return the dataframe interchange object implementing the interchange protocol. + Parameters + ---------- + nan_as_null : bool, default False + Whether to tell the DataFrame to overwrite null values in the data + with ``NaN`` (or ``NaT``). + allow_copy : bool, default True + Whether to allow memory copying when exporting. If set to False + it would cause non-zero-copy exports to fail. + Returns + ------- + DataFrame interchange object + The object which consuming library can use to ingress the dataframe. + Notes + ----- + Details on the interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. 
+ """ + + from pyarrow.interchange.dataframe import TableXchg + + return TableXchg(self, nan_as_null, allow_copy) + + # ---------------------------------------------------------------------- + def slice(self, offset=0, length=None): """ Compute zero-copy slice of this Table. diff --git a/python/pyarrow/tests/interchange/__init__.py b/python/pyarrow/tests/interchange/__init__.py new file mode 100644 index 00000000000..13a83393a91 --- /dev/null +++ b/python/pyarrow/tests/interchange/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py new file mode 100644 index 00000000000..49bc8e65a44 --- /dev/null +++ b/python/pyarrow/tests/interchange/test_interchange_spec.py @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import pyarrow as pa + + +def test_dataframe(): + n = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + a = pa.chunked_array([["Flamingo", "Parrot", "Cow"], + ["Horse", "Brittle stars", "Centipede"]]) + table = pa.Table.from_arrays([n, a], names=['n_legs', 'animals']) + df = table.__dataframe__() + + assert df.num_columns() == 2 + assert df.num_rows() == 6 + assert df.num_chunks() == 2 + assert list(df.column_names()) == ['n_legs', 'animals'] + assert list(df.select_columns([1]).column_names()) == list( + df.select_columns_by_name(["animals"]).column_names() + ) From 842ba3e1a26dff479629000c5e350fbd41acd8ec Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Mon, 7 Nov 2022 11:29:16 +0100 Subject: [PATCH 04/29] Add column (PyArrowColumn) class methods --- python/pyarrow/interchange/column.py | 152 ++++++++++++++---- python/pyarrow/interchange/dataframe.py | 13 +- .../interchange/test_interchange_spec.py | 27 +++- 3 files changed, 155 insertions(+), 37 deletions(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index d19a7f7f725..40e535e90e6 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -from typing import (Dict, Any) +from typing import (Dict, Tuple, Any) import pyarrow as pa @@ -23,10 +23,49 @@ from pyarrow.interchange.dataframe_protocol import ( Column, ColumnBuffers, + ColumnNullType, DtypeKind, ) +_PYARROW_KINDS = { + pa.int8(): (DtypeKind.INT, "c"), + pa.int16(): (DtypeKind.INT, "s"), + pa.int32(): (DtypeKind.INT, "i"), + pa.int64(): (DtypeKind.INT, "l"), + pa.uint8(): (DtypeKind.UINT, "C"), + pa.uint16(): (DtypeKind.UINT, "S"), + pa.uint32(): (DtypeKind.UINT, "I"), + pa.uint64(): (DtypeKind.UINT, "L"), + pa.float16(): (DtypeKind.FLOAT, "e"), + pa.float32(): (DtypeKind.FLOAT, "f"), + pa.float64(): (DtypeKind.FLOAT, "g"), + pa.bool_(): (DtypeKind.BOOL, "b"), + pa.string(): (DtypeKind.STRING, "u"), # utf-8 + pa.large_string(): (DtypeKind.STRING, "U"), + # Resoulution: + # - seconds -> 's' + # - milliseconds -> 'm' + # - microseconds -> 'u' + # - nanoseconds -> 'n' + pa.timestamp(): (DtypeKind.DATETIME, "ts{resolution}:{tz}"), + pa.dictionary(): (DtypeKind.CATEGORICAL, "L") +} + + +class Endianness: + """Enum indicating the byte-order of a data-type.""" + + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + + +class NoBufferPresent(Exception): + """Exception to signal that there is no requested buffer.""" + + class PyArrowColumn(Column): """ A column object, with only the methods and properties required by the @@ -44,20 +83,22 @@ def __init__(self, column: pa.Array, allow_copy: bool = True) -> None: Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. """ - pass + # Store the column as a private attribute + self._col = column + self._allow_copy = allow_copy def size(self) -> int: """ Size of the column, in elements. """ - pass + return self._col.to_numpy().size @property def offset(self) -> int: """ - Offset of first element. Always zero. + Offset of first element. """ - pass + return self._col.offset @property def dtype(self) -> tuple[DtypeKind, int, str, str]: @@ -87,18 +128,13 @@ def dtype(self) -> tuple[DtypeKind, int, str, str]: - Data types not included: complex, Arrow-style null, binary, decimal, and nested (list, struct, map, union) dtypes. 
""" - pass + dtype = self._col.type + kind, f_string = _PYARROW_KINDS.get(dtype, (None, None)) + if kind is None: + raise ValueError(f"Data type {dtype} not supported by interchange protocol") + bit_width = self._col.nbytes * 8 - def _dtype_from_arrowdtype(self, dtype) -> tuple[DtypeKind, int, str, str]: - """ - See `self.dtype` for details. - """ - # Note: 'c' (complex) not handled yet (not in array spec v1). - # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not - # handled datetime and timedelta both map to datetime - # (is timedelta handled?) - - pass + return kind, bit_width, f_string, Endianness.NATIVE @property def describe_categorical(self): @@ -118,18 +154,27 @@ def describe_categorical(self): cat1, cat2, ...). None if not a dictionary-style categorical. """ - pass + if pa.types.is_dictionary(self._col.type): + raise TypeError( + "describe_categorical only works on a column with categorical dtype!" + ) + + return { + "is_ordered": True, + "is_dictionary": True, + "categories": PyArrowColumn(self._col.dictionary), + } @property def describe_null(self): - pass + return ColumnNullType.USE_BYTEMASK, 0 @property def null_count(self) -> int: """ Number of null elements. Should always be known. """ - pass + return self._col.null_count @property def metadata(self) -> Dict[str, Any]: @@ -142,14 +187,24 @@ def num_chunks(self) -> int: """ Return the number of chunks the column consists of. """ - pass + return 1 - # def get_chunks(self, n_chunks: int | None = None): - # """ - # Return an iterator yielding the chunks. - # See `DataFrame.get_chunks` for details on ``n_chunks``. - # """ - # pass + def get_chunks(self, n_chunks: int | None = None): + """ + Return an iterator yielding the chunks. + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + if n_chunks and n_chunks > 1: + size = self.size() + step = size // n_chunks + if size % n_chunks != 0: + step += 1 + for start in range(0, step * n_chunks, step): + yield PyArrowColumn( + self._col.slice(start,step), self._allow_copy + ) + else: + yield self def get_buffers(self) -> ColumnBuffers: """ @@ -170,7 +225,23 @@ def get_buffers(self) -> ColumnBuffers: if the data buffer does not have an associated offsets buffer. """ - pass + buffers: ColumnBuffers = { + "data": self._get_data_buffer(), + "validity": None, + "offsets": None, + } + + try: + buffers["validity"] = self._get_validity_buffer() + except NoBufferPresent: + pass + + try: + buffers["offsets"] = self._get_offsets_buffer() + except NoBufferPresent: + pass + + return buffers def _get_data_buffer( self, @@ -178,7 +249,11 @@ def _get_data_buffer( """ Return the buffer containing the data and the buffer's associated dtype. """ - pass + len = len(self._col.buffers()) + if len == 2: + return PyArrowBuffer(self._col.buffers()[1]), self.dtype + elif len == 3: + return PyArrowBuffer(self._col.buffers()[2]), self.dtype def _get_validity_buffer(self) -> tuple[PyArrowBuffer, Any]: """ @@ -186,7 +261,15 @@ def _get_validity_buffer(self) -> tuple[PyArrowBuffer, Any]: the buffer's associated dtype. Raises NoBufferPresent if null representation is not a bit or byte mask. 
""" - pass + # Define the dtype of the returned buffer + dtype = (DtypeKind.BOOL, 8, "b", Endianness.NATIVE) + buff = self._col.buffers()[0] + if buff: + return PyArrowBuffer(buff), dtype + + raise NoBufferPresent( + "There are no missing values so " + "does not have a separate mask") def _get_offsets_buffer(self) -> tuple[PyArrowBuffer, Any]: """ @@ -195,4 +278,13 @@ def _get_offsets_buffer(self) -> tuple[PyArrowBuffer, Any]: Raises NoBufferPresent if the data buffer does not have an associated offsets buffer. """ - pass + len = len(self._col.buffers()) + if len == 2: + raise NoBufferPresent( + "This column has a fixed-length dtype so " + "it does not have an offsets buffer" + ) + elif len == 3: + # Define the dtype of the returned buffer + dtype = (DtypeKind.INT, 64, "L", Endianness.NATIVE) + return PyArrowBuffer(self._col.buffers()[2]), dtype diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index a4dafceae1f..43e4f03c6b7 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -83,22 +83,25 @@ def column_names(self) -> Iterable[str]: return self._df.column_names def get_column(self, i: int) -> PyArrowColumn: - return self._df.column(i) + return PyArrowColumn(self._df.column(i), + allow_copy=self._allow_copy) def get_column_by_name(self, name: str) -> PyArrowColumn: - return self._df.column(name) + return PyArrowColumn(self._df.column(name), + allow_copy=self._allow_copy) def get_columns(self) -> Iterable[PyArrowColumn]: - return self._df.columns + return PyArrowColumn(self._df.columns, + allow_copy=self._allow_copy) def select_columns(self, indices: Sequence[int]) -> TableXchg: return TableXchg( - self._df.select(indices), self._nan_as_null, self._allow_copy + self._df.select(list(indices)), self._nan_as_null, self._allow_copy ) def select_columns_by_name(self, names: Sequence[str]) -> TableXchg: return TableXchg( - self._df.select(names), self._nan_as_null, self._allow_copy + self._df.select(list(names)), self._nan_as_null, self._allow_copy ) def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[TableXchg]: diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py index 49bc8e65a44..1dd971067a5 100644 --- a/python/pyarrow/tests/interchange/test_interchange_spec.py +++ b/python/pyarrow/tests/interchange/test_interchange_spec.py @@ -16,6 +16,29 @@ # under the License. 
import pyarrow as pa +import pytest + +@pytest.mark.parametrize( + "test_data", + [ + {"a": ["foo", "bar"], "b": ["baz", "qux"]}, + {"a": [1.5, 2.5, 3.5], "b": [9.2, 10.5, 11.8]}, + {"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, + ], + ids=["str_data", "float_data", "int_data"], +) +def test_only_one_dtype(test_data): + columns = list(test_data.keys()) + table = pa.Table.from_pylist([test_data]) + df = table.__dataframe__() + + column_size = len(test_data[columns[0]]) + for column in columns: + null_count = df.get_column_by_name(column).null_count + assert null_count == 0 + assert isinstance(null_count, int) + assert df.get_column_by_name(column).size() == column_size + assert df.get_column_by_name(column).offset == 0 def test_dataframe(): @@ -29,6 +52,6 @@ def test_dataframe(): assert df.num_rows() == 6 assert df.num_chunks() == 2 assert list(df.column_names()) == ['n_legs', 'animals'] - assert list(df.select_columns([1]).column_names()) == list( - df.select_columns_by_name(["animals"]).column_names() + assert list(df.select_columns((1,)).column_names()) == list( + df.select_columns_by_name(("animals",)).column_names() ) From 61eb00fb4c3a686dc7176b1e6434aa143965e9e7 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 8 Nov 2022 12:02:01 +0100 Subject: [PATCH 05/29] Add buffer (PyArrowBuffer) class methods, some changes and main tests --- python/pyarrow/interchange/buffer.py | 29 ++-- python/pyarrow/interchange/column.py | 145 +++++++++++++----- python/pyarrow/interchange/dataframe.py | 26 +++- .../interchange/test_interchange_spec.py | 126 ++++++++++++++- 4 files changed, 262 insertions(+), 64 deletions(-) diff --git a/python/pyarrow/interchange/buffer.py b/python/pyarrow/interchange/buffer.py index 07b21190039..ff0caf85dbb 100644 --- a/python/pyarrow/interchange/buffer.py +++ b/python/pyarrow/interchange/buffer.py @@ -15,11 +15,15 @@ # specific language governing permissions and limitations # under the License. +from __future__ import annotations + +import numpy as np +import pyarrow as pa + from pyarrow.interchange.dataframe_protocol import ( Buffer, DlpackDeviceType, ) -import numpy as np class PyArrowBuffer(Buffer): @@ -27,37 +31,40 @@ class PyArrowBuffer(Buffer): Data in the buffer is guaranteed to be contiguous in memory. """ - def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: + def __init__(self, x: pa.Buffer, allow_copy: bool = True) -> None: """ Handle only regular columns (= numpy arrays) for now. """ - pass + self._x = x @property def bufsize(self) -> int: """ Buffer size in bytes. """ - pass + return self._x.size @property def ptr(self) -> int: """ Pointer to start of the buffer as an integer. """ - pass + return self._x.address def __dlpack__(self): """ Represent this structure as DLPack interface. """ - pass + raise NotImplementedError("__dlpack__") - # def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: - # """ - # Device type and device ID for where the data in the buffer resides. - # """ - # pass + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides. 
+ """ + if self._x.is_cpu: + return (DlpackDeviceType.CPU, None) + else: + raise NotImplementedError("__dlpack_device__") def __repr__(self) -> str: return ( diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 40e535e90e6..37bba874a82 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -15,18 +15,15 @@ # specific language governing permissions and limitations # under the License. -from typing import (Dict, Tuple, Any) +from __future__ import annotations -import pyarrow as pa +import warnings +from typing import Any, Dict, Iterable, Tuple +import pyarrow as pa from pyarrow.interchange.buffer import PyArrowBuffer -from pyarrow.interchange.dataframe_protocol import ( - Column, - ColumnBuffers, - ColumnNullType, - DtypeKind, -) - +from pyarrow.interchange.dataframe_protocol import (Column, ColumnBuffers, + ColumnNullType, DtypeKind) _PYARROW_KINDS = { pa.int8(): (DtypeKind.INT, "c"), @@ -43,13 +40,6 @@ pa.bool_(): (DtypeKind.BOOL, "b"), pa.string(): (DtypeKind.STRING, "u"), # utf-8 pa.large_string(): (DtypeKind.STRING, "U"), - # Resoulution: - # - seconds -> 's' - # - milliseconds -> 'm' - # - microseconds -> 'u' - # - nanoseconds -> 'n' - pa.timestamp(): (DtypeKind.DATETIME, "ts{resolution}:{tz}"), - pa.dictionary(): (DtypeKind.CATEGORICAL, "L") } @@ -78,7 +68,7 @@ class PyArrowColumn(Column): doesn't need its own version or ``__column__`` protocol. """ - def __init__(self, column: pa.Array, allow_copy: bool = True) -> None: + def __init__(self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True) -> None: """ Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. @@ -91,14 +81,18 @@ def size(self) -> int: """ Size of the column, in elements. """ - return self._col.to_numpy().size + if isinstance(self._col, pa.Array): + len = self._col.to_numpy().size + else: + len = self._col.length() + return len @property def offset(self) -> int: """ Offset of first element. """ - return self._col.offset + return 0 @property def dtype(self) -> tuple[DtypeKind, int, str, str]: @@ -129,10 +123,34 @@ def dtype(self) -> tuple[DtypeKind, int, str, str]: decimal, and nested (list, struct, map, union) dtypes. """ dtype = self._col.type + try: + bit_width = dtype.bit_width + except: # in case of a non-fixed width type (string) + bit_width = 8 + + if pa.types.is_timestamp(dtype): + kind = DtypeKind.DATETIME + f_string = "ts{dtype.unit}:{dtype.tz}" + return kind, bit_width, f_string, Endianness.NATIVE + elif pa.types.is_dictionary(dtype): + kind = DtypeKind.CATEGORICAL + f_string = "L" + return kind, bit_width, f_string, Endianness.NATIVE + else: + return self._dtype_from_arrowdtype(dtype, bit_width) + + + def _dtype_from_arrowdtype(self, dtype, bit_width) -> tuple[DtypeKind, int, str, str]: + """ + See `self.dtype` for details. + """ + # Note: 'c' (complex) not handled yet (not in array spec v1). + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled + # datetime and timedelta both map to datetime (is timedelta handled?) + kind, f_string = _PYARROW_KINDS.get(dtype, (None, None)) if kind is None: raise ValueError(f"Data type {dtype} not supported by interchange protocol") - bit_width = self._col.nbytes * 8 return kind, bit_width, f_string, Endianness.NATIVE @@ -154,15 +172,20 @@ def describe_categorical(self): cat1, cat2, ...). None if not a dictionary-style categorical. 
""" - if pa.types.is_dictionary(self._col.type): + if isinstance(self._col, pa.ChunkedArray): + arr = self._col.combine_chunks() + else: + arr = self._col + + if not pa.types.is_dictionary(arr.type): raise TypeError( "describe_categorical only works on a column with categorical dtype!" ) return { - "is_ordered": True, + "is_ordered": self._col.type.ordered, "is_dictionary": True, - "categories": PyArrowColumn(self._col.dictionary), + "categories": PyArrowColumn(arr.dictionary), } @property @@ -187,7 +210,11 @@ def num_chunks(self) -> int: """ Return the number of chunks the column consists of. """ - return 1 + if isinstance(self._col, pa.Array): + n_chunks = 1 + else: + n_chunks = self._col.num_chunks + return n_chunks def get_chunks(self, n_chunks: int | None = None): """ @@ -195,16 +222,37 @@ def get_chunks(self, n_chunks: int | None = None): See `DataFrame.get_chunks` for details on ``n_chunks``. """ if n_chunks and n_chunks > 1: - size = self.size() - step = size // n_chunks - if size % n_chunks != 0: - step += 1 - for start in range(0, step * n_chunks, step): - yield PyArrowColumn( - self._col.slice(start,step), self._allow_copy - ) + if n_chunks % self.num_chunks() == 0: + chunk_size = self.size() // n_chunks + if self.size() % n_chunks != 0: + chunk_size += 1 + + if isinstance(self._col, pa.ChunkedArray): + array = self._col.combine_chunks() + else: + array = self._col + + i = 0 + for start in range(0, chunk_size * n_chunks, chunk_size): + yield PyArrowColumn( + array.slice(start,chunk_size), self._allow_copy + ) + i +=1 + # In case when the size of the chunk is such that the resulting + # list is one less chunk then n_chunks -> append an empty chunk + if i == n_chunks - 1: + yield PyArrowColumn(pa.array([]), self._allow_copy) + else: + warnings.warn( + "``n_chunks`` must be a multiple of ``self.num_chunks()``") + elif isinstance(self._col, pa.ChunkedArray): + return [ + PyArrowColumn(chunk, self._allow_copy) + for chunk in self._col.chunks + ] else: yield self + def get_buffers(self) -> ColumnBuffers: """ @@ -249,11 +297,16 @@ def _get_data_buffer( """ Return the buffer containing the data and the buffer's associated dtype. """ - len = len(self._col.buffers()) - if len == 2: - return PyArrowBuffer(self._col.buffers()[1]), self.dtype - elif len == 3: - return PyArrowBuffer(self._col.buffers()[2]), self.dtype + if isinstance(self._col, pa.ChunkedArray): + array = self._col.combine_chunks() + else: + array = self._col + n = len(array.buffers()) + if n == 2: + return PyArrowBuffer(array.buffers()[1]), self.dtype + elif n == 3: + return PyArrowBuffer(array.buffers()[2]), self.dtype + def _get_validity_buffer(self) -> tuple[PyArrowBuffer, Any]: """ @@ -263,7 +316,11 @@ def _get_validity_buffer(self) -> tuple[PyArrowBuffer, Any]: """ # Define the dtype of the returned buffer dtype = (DtypeKind.BOOL, 8, "b", Endianness.NATIVE) - buff = self._col.buffers()[0] + if isinstance(self._col, pa.ChunkedArray): + array = self._col.combine_chunks() + else: + array = self._col + buff = array.buffers()[0] if buff: return PyArrowBuffer(buff), dtype @@ -278,13 +335,17 @@ def _get_offsets_buffer(self) -> tuple[PyArrowBuffer, Any]: Raises NoBufferPresent if the data buffer does not have an associated offsets buffer. 
""" - len = len(self._col.buffers()) - if len == 2: + if isinstance(self._col, pa.ChunkedArray): + array = self._col.combine_chunks() + else: + array = self._col + n = len(array.buffers()) + if n == 2: raise NoBufferPresent( "This column has a fixed-length dtype so " "it does not have an offsets buffer" ) - elif len == 3: + elif n == 3: # Define the dtype of the returned buffer dtype = (DtypeKind.INT, 64, "L", Endianness.NATIVE) - return PyArrowBuffer(self._col.buffers()[2]), dtype + return PyArrowBuffer(array.buffers()[2]), dtype diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index 43e4f03c6b7..f5c7335a283 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -91,8 +91,10 @@ def get_column_by_name(self, name: str) -> PyArrowColumn: allow_copy=self._allow_copy) def get_columns(self) -> Iterable[PyArrowColumn]: - return PyArrowColumn(self._df.columns, - allow_copy=self._allow_copy) + return [ + PyArrowColumn(col, allow_copy=self._allow_copy) + for col in self._df.columns + ] def select_columns(self, indices: Sequence[int]) -> TableXchg: return TableXchg( @@ -108,17 +110,25 @@ def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[TableXchg]: """ Return an iterator yielding the chunks. """ - if n_chunks: - if n_chunks % self._df.num_chunks == 0: - chunk_size = self._df.num_rows // n_chunks - if self.num_rows %n_chunks != 0: - warnings.warn("Converting dataframe into smaller chunks") + if n_chunks and n_chunks > 1: + if n_chunks % self.num_chunks() == 0: + chunk_size = self.num_rows() // n_chunks + if self.num_rows() % n_chunks != 0: + chunk_size += 1 batches = self._df.to_batches(max_chunksize=chunk_size) + # In case when the size of the chunk is such that the resulting + # list is one less chunk then n_chunks -> append an empty chunk + if len(batches) == n_chunks - 1: + batches.append(pa.record_batch([])) else: warnings.warn( "``n_chunks`` must be a multiple of ``self.num_chunks()``") else: batches = self._df.to_batches() - iterator_tables = [pa.Table.from_batches([batch]) for batch in batches] + iterator_tables = [TableXchg( + pa.Table.from_batches([batch]), self._nan_as_null, self._allow_copy + ) + for batch in batches + ] return iterator_tables diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py index 1dd971067a5..97e9302c700 100644 --- a/python/pyarrow/tests/interchange/test_interchange_spec.py +++ b/python/pyarrow/tests/interchange/test_interchange_spec.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. 
+import ctypes + import pyarrow as pa import pytest @@ -25,11 +27,10 @@ {"a": [1.5, 2.5, 3.5], "b": [9.2, 10.5, 11.8]}, {"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, ], - ids=["str_data", "float_data", "int_data"], ) def test_only_one_dtype(test_data): columns = list(test_data.keys()) - table = pa.Table.from_pylist([test_data]) + table = pa.table(test_data) df = table.__dataframe__() column_size = len(test_data[columns[0]]) @@ -41,11 +42,69 @@ def test_only_one_dtype(test_data): assert df.get_column_by_name(column).offset == 0 +def test_mixed_dtypes(): + table = pa.table( + { + "a": [1, 2, 3], # dtype kind INT = 0 + "b": [3, 4, 5], # dtype kind INT = 0 + "c": [1.5, 2.5, 3.5], # dtype kind FLOAT = 2 + "d": [9, 10, 11], # dtype kind INT = 0 + "e": [True, False, True], # dtype kind BOOLEAN = 20 + "f": ["a", "", "c"], # dtype kind STRING = 21 + } + ) + df = table.__dataframe__() + # for meanings of dtype[0] see the spec; we cannot import the spec here as this + # file is expected to be vendored *anywhere*; + # values for dtype[0] are explained above + columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21} + + for column, kind in columns.items(): + col = df.get_column_by_name(column) + assert col.null_count == 0 + assert isinstance(col.null_count, int) + assert col.size() == 3 + assert col.offset == 0 + + assert col.dtype[0] == kind + + assert df.get_column_by_name("c").dtype[1] == 64 + + +def test_na_float(): + table = pa.table({"a": [1.0, None, 2.0]}) + df = table.__dataframe__() + col = df.get_column_by_name("a") + assert col.null_count == 1 + assert isinstance(col.null_count, int) + + +def test_noncategorical(): + table = pa.table({"a": [1, 2, 3]}) + df = table.__dataframe__() + col = df.get_column_by_name("a") + with pytest.raises(TypeError, match=".*categorical.*"): + col.describe_categorical + + +def test_categorical(): + import pyarrow as pa + arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"] + table = pa.table( + {"weekday": pa.array(arr).dictionary_encode()} + ) + + col = table.__dataframe__().get_column_by_name("weekday") + categorical = col.describe_categorical + assert isinstance(categorical["is_ordered"], bool) + assert isinstance(categorical["is_dictionary"], bool) + + def test_dataframe(): n = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) a = pa.chunked_array([["Flamingo", "Parrot", "Cow"], ["Horse", "Brittle stars", "Centipede"]]) - table = pa.Table.from_arrays([n, a], names=['n_legs', 'animals']) + table = pa.table([n, a], names=['n_legs', 'animals']) df = table.__dataframe__() assert df.num_columns() == 2 @@ -55,3 +114,64 @@ def test_dataframe(): assert list(df.select_columns((1,)).column_names()) == list( df.select_columns_by_name(("animals",)).column_names() ) + + +@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) +def test_df_get_chunks(size, n_chunks): + table = pa.table({"x": list(range(size))}) + df = table.__dataframe__() + chunks = list(df.get_chunks(n_chunks)) + assert len(chunks) == n_chunks + assert sum(chunk.num_rows() for chunk in chunks) == size + + +@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) +def test_column_get_chunks(size, n_chunks): + table = pa.table({"x": list(range(size))}) + df = table.__dataframe__() + chunks = list(df.get_column(0).get_chunks(n_chunks)) + assert len(chunks) == n_chunks + assert sum(chunk.size() for chunk in chunks) == size + + +def test_get_columns(): + table = pa.table({"a": [0, 1], "b": [2.5, 3.5]}) + df = table.__dataframe__() + for col in df.get_columns(): 
+ assert col.size() == 2 + assert col.num_chunks() == 1 + # for meanings of dtype[0] see the spec; we cannot import the spec here as this + # file is expected to be vendored *anywhere* + assert df.get_column(0).dtype[0] == 0 # INT + assert df.get_column(1).dtype[0] == 2 # FLOAT + + +def test_buffer(): + arr = [0, 1, -1] + table = pa.table({"a": arr}) + df = table.__dataframe__() + col = df.get_column(0) + buf = col.get_buffers() + + dataBuf, dataDtype = buf["data"] + + assert dataBuf.bufsize > 0 + assert dataBuf.ptr != 0 + device, _ = dataBuf.__dlpack_device__() + + # for meanings of dtype[0] see the spec; we cannot import the spec here as this + # file is expected to be vendored *anywhere* + assert dataDtype[0] == 0 # INT + + if device == 1: # CPU-only as we're going to directly read memory here + bitwidth = dataDtype[1] + ctype = { + 8: ctypes.c_int8, + 16: ctypes.c_int16, + 32: ctypes.c_int32, + 64: ctypes.c_int64, + }[bitwidth] + + for idx, truth in enumerate(arr): + val = ctype.from_address(dataBuf.ptr + idx * (bitwidth // 8)).value + assert val == truth, f"Buffer at index {idx} mismatch" From 027012d2ea56ab3ad6ab0c87ba43664c3c27eb9d Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 8 Nov 2022 12:43:39 +0100 Subject: [PATCH 06/29] Make changes to buffer, column and dataframe classes --- python/pyarrow/interchange/buffer.py | 2 +- python/pyarrow/interchange/column.py | 82 +++++++++++++------------ python/pyarrow/interchange/dataframe.py | 20 +++--- 3 files changed, 51 insertions(+), 53 deletions(-) diff --git a/python/pyarrow/interchange/buffer.py b/python/pyarrow/interchange/buffer.py index ff0caf85dbb..6c183c034ce 100644 --- a/python/pyarrow/interchange/buffer.py +++ b/python/pyarrow/interchange/buffer.py @@ -33,7 +33,7 @@ class PyArrowBuffer(Buffer): def __init__(self, x: pa.Buffer, allow_copy: bool = True) -> None: """ - Handle only regular columns (= numpy arrays) for now. + Handle PyArrow Buffers. """ self._x = x diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 37bba874a82..4c1aa2f6608 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -18,12 +18,19 @@ from __future__ import annotations import warnings -from typing import Any, Dict, Iterable, Tuple +from typing import ( + Any, + Dict, + Iterable, + Optional, + Tuple, +) import pyarrow as pa from pyarrow.interchange.buffer import PyArrowBuffer from pyarrow.interchange.dataframe_protocol import (Column, ColumnBuffers, - ColumnNullType, DtypeKind) + ColumnNullType, DtypeKind, + CategoricalDescription) _PYARROW_KINDS = { pa.int8(): (DtypeKind.INT, "c"), @@ -70,8 +77,7 @@ class PyArrowColumn(Column): def __init__(self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True) -> None: """ - Note: doesn't deal with extension arrays yet, just assume a regular - Series/ndarray for now. + Handles PyArrow Arrays and ChunkedArrays. """ # Store the column as a private attribute self._col = column @@ -95,7 +101,7 @@ def offset(self) -> int: return 0 @property - def dtype(self) -> tuple[DtypeKind, int, str, str]: + def dtype(self) -> Tuple[DtypeKind, int, str, str]: """ Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. 
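Concretely, the tuple returned by this property is built from the ``_PYARROW_KINDS`` mapping introduced earlier: an int64 column comes out as kind ``INT``, width 64, Arrow format character ``"l"`` and native (``=``) byte order. A quick sketch (string and dictionary columns are still being reworked in these commits, so only the fixed-width case is shown):

    import pyarrow as pa

    col = pa.table({"a": [1, 2, 3]}).__dataframe__().get_column_by_name("a")
    kind, bit_width, fmt, byteorder = col.dtype
    print(kind, bit_width, fmt, byteorder)   # roughly: DtypeKind.INT 64 l =
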
@@ -125,8 +131,8 @@ def dtype(self) -> tuple[DtypeKind, int, str, str]: dtype = self._col.type try: bit_width = dtype.bit_width - except: # in case of a non-fixed width type (string) - bit_width = 8 + except: # in case of a variable-length strings + bit_width = None if pa.types.is_timestamp(dtype): kind = DtypeKind.DATETIME @@ -140,7 +146,7 @@ def dtype(self) -> tuple[DtypeKind, int, str, str]: return self._dtype_from_arrowdtype(dtype, bit_width) - def _dtype_from_arrowdtype(self, dtype, bit_width) -> tuple[DtypeKind, int, str, str]: + def _dtype_from_arrowdtype(self, dtype, bit_width) -> Tuple[DtypeKind, int, str, str]: """ See `self.dtype` for details. """ @@ -155,7 +161,7 @@ def _dtype_from_arrowdtype(self, dtype, bit_width) -> tuple[DtypeKind, int, str, return kind, bit_width, f_string, Endianness.NATIVE @property - def describe_categorical(self): + def describe_categorical(self) -> CategoricalDescription: """ If the dtype is categorical, there are two options: - There are only values in the data buffer. @@ -189,7 +195,7 @@ def describe_categorical(self): } @property - def describe_null(self): + def describe_null(self) -> Tuple[ColumnNullType, Any]: return ColumnNullType.USE_BYTEMASK, 0 @property @@ -216,35 +222,31 @@ def num_chunks(self) -> int: n_chunks = self._col.num_chunks return n_chunks - def get_chunks(self, n_chunks: int | None = None): + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: """ Return an iterator yielding the chunks. See `DataFrame.get_chunks` for details on ``n_chunks``. """ if n_chunks and n_chunks > 1: - if n_chunks % self.num_chunks() == 0: - chunk_size = self.size() // n_chunks - if self.size() % n_chunks != 0: - chunk_size += 1 - - if isinstance(self._col, pa.ChunkedArray): - array = self._col.combine_chunks() - else: - array = self._col - - i = 0 - for start in range(0, chunk_size * n_chunks, chunk_size): - yield PyArrowColumn( - array.slice(start,chunk_size), self._allow_copy - ) - i +=1 - # In case when the size of the chunk is such that the resulting - # list is one less chunk then n_chunks -> append an empty chunk - if i == n_chunks - 1: - yield PyArrowColumn(pa.array([]), self._allow_copy) + chunk_size = self.size() // n_chunks + if self.size() % n_chunks != 0: + chunk_size += 1 + + if isinstance(self._col, pa.ChunkedArray): + array = self._col.combine_chunks() else: - warnings.warn( - "``n_chunks`` must be a multiple of ``self.num_chunks()``") + array = self._col + + i = 0 + for start in range(0, chunk_size * n_chunks, chunk_size): + yield PyArrowColumn( + array.slice(start,chunk_size), self._allow_copy + ) + i +=1 + # In case when the size of the chunk is such that the resulting + # list is one less chunk then n_chunks -> append an empty chunk + if i == n_chunks - 1: + yield PyArrowColumn(pa.array([]), self._allow_copy) elif isinstance(self._col, pa.ChunkedArray): return [ PyArrowColumn(chunk, self._allow_copy) @@ -293,7 +295,7 @@ def get_buffers(self) -> ColumnBuffers: def _get_data_buffer( self, - ) -> tuple[PyArrowBuffer, Any]: # Any is for self.dtype tuple + ) -> Tuple[PyArrowBuffer, Any]: # Any is for self.dtype tuple """ Return the buffer containing the data and the buffer's associated dtype. 
""" @@ -308,7 +310,7 @@ def _get_data_buffer( return PyArrowBuffer(array.buffers()[2]), self.dtype - def _get_validity_buffer(self) -> tuple[PyArrowBuffer, Any]: + def _get_validity_buffer(self) -> Tuple[PyArrowBuffer, Any]: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. @@ -323,12 +325,12 @@ def _get_validity_buffer(self) -> tuple[PyArrowBuffer, Any]: buff = array.buffers()[0] if buff: return PyArrowBuffer(buff), dtype + else: + raise NoBufferPresent( + "There are no missing values so " + "does not have a separate mask") - raise NoBufferPresent( - "There are no missing values so " - "does not have a separate mask") - - def _get_offsets_buffer(self) -> tuple[PyArrowBuffer, Any]: + def _get_offsets_buffer(self) -> Tuple[PyArrowBuffer, Any]: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) and the buffer's associated dtype. diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index f5c7335a283..1ebf87a7d2d 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -111,18 +111,14 @@ def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[TableXchg]: Return an iterator yielding the chunks. """ if n_chunks and n_chunks > 1: - if n_chunks % self.num_chunks() == 0: - chunk_size = self.num_rows() // n_chunks - if self.num_rows() % n_chunks != 0: - chunk_size += 1 - batches = self._df.to_batches(max_chunksize=chunk_size) - # In case when the size of the chunk is such that the resulting - # list is one less chunk then n_chunks -> append an empty chunk - if len(batches) == n_chunks - 1: - batches.append(pa.record_batch([])) - else: - warnings.warn( - "``n_chunks`` must be a multiple of ``self.num_chunks()``") + chunk_size = self.num_rows() // n_chunks + if self.num_rows() % n_chunks != 0: + chunk_size += 1 + batches = self._df.to_batches(max_chunksize=chunk_size) + # In case when the size of the chunk is such that the resulting + # list is one less chunk then n_chunks -> append an empty chunk + if len(batches) == n_chunks - 1: + batches.append(pa.record_batch([])) else: batches = self._df.to_batches() From 6f746fb42f3d73e679db1d3468197e1436915f36 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 8 Nov 2022 12:50:02 +0100 Subject: [PATCH 07/29] Make changes to from_dataframe.py skeleton --- python/pyarrow/interchange/from_dataframe.py | 24 ++++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index 884c2c2c3f8..5865280c543 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -26,9 +26,9 @@ import pyarrow as pa -def from_dataframe(df, allow_copy=True) -> pd.DataFrame: +def from_dataframe(df, allow_copy=True) -> pa.Table: """ - Build a ``pd.DataFrame`` from any DataFrame supporting the interchange + Build a ``pa.Table`` from any DataFrame supporting the interchange protocol. Parameters @@ -41,7 +41,7 @@ def from_dataframe(df, allow_copy=True) -> pd.DataFrame: (if false then zero-copy approach is requested). Returns ------- - pd.DataFrame + pa.Table """ if isinstance(df, pa.Table): return df @@ -54,7 +54,7 @@ def from_dataframe(df, allow_copy=True) -> pd.DataFrame: def _from_dataframe(df: DataFrameXchg, allow_copy=True): """ - Build a ``pd.DataFrame`` from the DataFrame interchange object. 
+ Build a ``pa.Table`` from the DataFrame interchange object. Parameters ---------- df : DataFrameXchg @@ -65,12 +65,12 @@ def _from_dataframe(df: DataFrameXchg, allow_copy=True): (if false then zero-copy approach is requested). Returns ------- - pd.DataFrame + pa.Table """ pass -def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: +def protocol_df_chunk_to_pyarrow(df: DataFrameXchg) -> pa.Table: """ Convert interchange protocol chunk to ``pd.DataFrame``. Parameters @@ -78,7 +78,7 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: df : DataFrameXchg Returns ------- - pd.DataFrame + pa.Table """ # We need a dict of columns here, with each column being a NumPy array # (at least for now, deal with non-NumPy dtypes later). @@ -128,7 +128,7 @@ def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: pass -def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: +def categorical_column_to_dictionary(col: Column) -> tuple[pa.ChunkedArray, Any]: """ Convert a column holding categorical data to a pandas Series. Parameters @@ -137,7 +137,7 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: Returns ------- tuple - Tuple of pd.Series holding the data and the memory owner object + Tuple of pa.ChunkedArray holding the data and the memory owner object that keeps the memory alive. """ pass @@ -230,7 +230,7 @@ def bitmask_to_bool_ndarray( def set_nulls( - data: np.ndarray | pd.Series, + data: np.ndarray | pa.Array | pa.ChunkedArray, col: Column, validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None, allow_modify_inplace: bool = True, @@ -239,7 +239,7 @@ def set_nulls( Set null values for the data according to the column null kind. Parameters ---------- - data : np.ndarray or pd.Series + data : np.ndarray, pa.Array or pa.ChunkedArray, Data to set nulls in. col : Column Column object that describes the `data`. @@ -252,7 +252,7 @@ def set_nulls( (True) or always modify a copy of the `data` (False). Returns ------- - np.ndarray or pd.Series + np.ndarray, pa.Array or pa.ChunkedArray, Data with the nulls being set. """ pass From 1669224ac8cbb7cdea3dbf06868bcbfcfc7d2661 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 8 Nov 2022 14:50:03 +0100 Subject: [PATCH 08/29] Add extra tests and make minor corrections --- python/pyarrow/interchange/buffer.py | 1 - python/pyarrow/interchange/column.py | 2 +- python/pyarrow/interchange/dataframe.py | 1 - .../pyarrow/tests/interchange/test_extra.py | 63 +++++++++++++++++++ 4 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 python/pyarrow/tests/interchange/test_extra.py diff --git a/python/pyarrow/interchange/buffer.py b/python/pyarrow/interchange/buffer.py index 6c183c034ce..d09fc793810 100644 --- a/python/pyarrow/interchange/buffer.py +++ b/python/pyarrow/interchange/buffer.py @@ -17,7 +17,6 @@ from __future__ import annotations -import numpy as np import pyarrow as pa from pyarrow.interchange.dataframe_protocol import ( diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 4c1aa2f6608..085a7dd2294 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -88,7 +88,7 @@ def size(self) -> int: Size of the column, in elements. 
""" if isinstance(self._col, pa.Array): - len = self._col.to_numpy().size + len = self._col.to_numpy(zero_copy_only=False).size else: len = self._col.length() return len diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index 1ebf87a7d2d..319c54e31a8 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -27,7 +27,6 @@ ) import pyarrow as pa -import warnings from pyarrow.interchange.column import PyArrowColumn from pyarrow.interchange.dataframe_protocol import DataFrame as DataFrameXchg diff --git a/python/pyarrow/tests/interchange/test_extra.py b/python/pyarrow/tests/interchange/test_extra.py new file mode 100644 index 00000000000..5046cecea60 --- /dev/null +++ b/python/pyarrow/tests/interchange/test_extra.py @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import ctypes + +import pandas as pd +import pyarrow as pa +import pytest + +from pyarrow.interchange.column import PyArrowColumn +from pyarrow.interchange.dataframe_protocol import ( + ColumnNullType, + DtypeKind, +) + +def test_datetime(): + df = pd.DataFrame({"A": [pd.Timestamp("2022-01-01"), pd.NaT]}) + table = pa.table(df) + col = table.__dataframe__().get_column_by_name("A") + + assert col.size() == 2 + assert col.null_count == 1 + assert col.dtype[0] == DtypeKind.DATETIME + assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0) + + +@pytest.mark.parametrize( + ["test_data", "kind"], + [ + (["foo", "bar"], 21), + ([1.5, 2.5, 3.5], 2), + ([1, 2, 3, 4], 0), + ], +) +def test_array_to_pyarrowcolumn(test_data, kind): + arr = pa.array(test_data) + arr_column = PyArrowColumn(arr) + + assert arr_column._col == arr + assert arr_column.size() == len(test_data) + assert arr_column.dtype[0] == kind + assert arr_column.num_chunks() == 1 + assert arr_column.null_count == 0 + assert arr_column.get_buffers()["validity"] == None + assert len(list(arr_column.get_chunks())) == 1 + + for chunk in arr_column.get_chunks(): + assert chunk == arr_column + From 473414e49c31188c09cdcc416fbb0a059a3913f1 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 8 Nov 2022 18:18:59 +0100 Subject: [PATCH 09/29] Run linter --- python/pyarrow/interchange/column.py | 41 +++++++++++-------- python/pyarrow/interchange/dataframe.py | 18 ++++---- .../pyarrow/interchange/dataframe_protocol.py | 41 +++++++++++-------- python/pyarrow/interchange/from_dataframe.py | 14 +++++-- .../pyarrow/tests/interchange/test_extra.py | 8 ++-- .../interchange/test_interchange_spec.py | 13 +++--- 6 files changed, 75 insertions(+), 60 deletions(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 085a7dd2294..e4110e51fb4 100644 --- a/python/pyarrow/interchange/column.py +++ 
b/python/pyarrow/interchange/column.py @@ -17,7 +17,6 @@ from __future__ import annotations -import warnings from typing import ( Any, Dict, @@ -75,7 +74,9 @@ class PyArrowColumn(Column): doesn't need its own version or ``__column__`` protocol. """ - def __init__(self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True) -> None: + def __init__( + self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True + ) -> None: """ Handles PyArrow Arrays and ChunkedArrays. """ @@ -131,7 +132,7 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: dtype = self._col.type try: bit_width = dtype.bit_width - except: # in case of a variable-length strings + except ValueError: # in case of a variable-length strings bit_width = None if pa.types.is_timestamp(dtype): @@ -145,18 +146,21 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: else: return self._dtype_from_arrowdtype(dtype, bit_width) - - def _dtype_from_arrowdtype(self, dtype, bit_width) -> Tuple[DtypeKind, int, str, str]: + def _dtype_from_arrowdtype( + self, dtype, bit_width + ) -> Tuple[DtypeKind, int, str, str]: """ See `self.dtype` for details. """ # Note: 'c' (complex) not handled yet (not in array spec v1). - # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled - # datetime and timedelta both map to datetime (is timedelta handled?) + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) + # not handled datetime and timedelta both map to datetime + # (is timedelta handled?) kind, f_string = _PYARROW_KINDS.get(dtype, (None, None)) if kind is None: - raise ValueError(f"Data type {dtype} not supported by interchange protocol") + raise ValueError( + f"Data type {dtype} not supported by interchange protocol") return kind, bit_width, f_string, Endianness.NATIVE @@ -181,11 +185,12 @@ def describe_categorical(self) -> CategoricalDescription: if isinstance(self._col, pa.ChunkedArray): arr = self._col.combine_chunks() else: - arr = self._col + arr = self._col if not pa.types.is_dictionary(arr.type): raise TypeError( - "describe_categorical only works on a column with categorical dtype!" + "describe_categorical only works on a column with " + "categorical dtype!" ) return { @@ -240,9 +245,9 @@ def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: i = 0 for start in range(0, chunk_size * n_chunks, chunk_size): yield PyArrowColumn( - array.slice(start,chunk_size), self._allow_copy + array.slice(start, chunk_size), self._allow_copy ) - i +=1 + i += 1 # In case when the size of the chunk is such that the resulting # list is one less chunk then n_chunks -> append an empty chunk if i == n_chunks - 1: @@ -254,7 +259,6 @@ def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: ] else: yield self - def get_buffers(self) -> ColumnBuffers: """ @@ -297,7 +301,8 @@ def _get_data_buffer( self, ) -> Tuple[PyArrowBuffer, Any]: # Any is for self.dtype tuple """ - Return the buffer containing the data and the buffer's associated dtype. + Return the buffer containing the data and the buffer's + associated dtype. """ if isinstance(self._col, pa.ChunkedArray): array = self._col.combine_chunks() @@ -309,12 +314,12 @@ def _get_data_buffer( elif n == 3: return PyArrowBuffer(array.buffers()[2]), self.dtype - def _get_validity_buffer(self) -> Tuple[PyArrowBuffer, Any]: """ - Return the buffer containing the mask values indicating missing data and - the buffer's associated dtype. - Raises NoBufferPresent if null representation is not a bit or byte mask. 
+ Return the buffer containing the mask values indicating missing data + and the buffer's associated dtype. + Raises NoBufferPresent if null representation is not a bit or byte + mask. """ # Define the dtype of the returned buffer dtype = (DtypeKind.BOOL, 8, "b", Endianness.NATIVE) diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index 319c54e31a8..3c9b0cb1f74 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -18,12 +18,9 @@ from __future__ import annotations from typing import ( Any, - Dict, Iterable, Optional, Sequence, - Tuple, - TypedDict, ) import pyarrow as pa @@ -50,9 +47,10 @@ def __init__( """ self._df = df # ``nan_as_null`` is a keyword intended for the consumer to tell the - # producer to overwrite null values in the data with ``NaN`` (or ``NaT``). - # This currently has no effect; once support for nullable extension - # dtypes is added, this value should be propagated to columns. + # producer to overwrite null values in the data with ``NaN`` (or + # ``NaT``). This currently has no effect; once support for nullable + # extension dtypes is added, this value should be propagated to + # columns. self._nan_as_null = nan_as_null self._allow_copy = allow_copy @@ -105,7 +103,9 @@ def select_columns_by_name(self, names: Sequence[str]) -> TableXchg: self._df.select(list(names)), self._nan_as_null, self._allow_copy ) - def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[TableXchg]: + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable[TableXchg]: """ Return an iterator yielding the chunks. """ @@ -122,8 +122,8 @@ def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[TableXchg]: batches = self._df.to_batches() iterator_tables = [TableXchg( - pa.Table.from_batches([batch]), self._nan_as_null, self._allow_copy - ) + pa.Table.from_batches([batch]), self._nan_as_null, self._allow_copy + ) for batch in batches ] return iterator_tables diff --git a/python/pyarrow/interchange/dataframe_protocol.py b/python/pyarrow/interchange/dataframe_protocol.py index de30ede8d63..13a5d337c2d 100644 --- a/python/pyarrow/interchange/dataframe_protocol.py +++ b/python/pyarrow/interchange/dataframe_protocol.py @@ -256,11 +256,11 @@ def dtype(self) -> Dtype: Notes: - Kind specifiers are aligned with DLPack where possible (hence the jump to 20, leave enough room for future extension) - - Masks must be specified as boolean with either bit width 1 (for bit - masks) or 8 (for byte masks). + - Masks must be specified as boolean with either bit width 1 (for + bit masks) or 8 (for byte masks). - Dtype width in bits was preferred over bytes - - Endianness isn't too useful, but included now in case in the future - we need to support non-native endianness + - Endianness isn't too useful, but included now in case in the + future we need to support non-native endianness - Went with Apache Arrow format strings over NumPy format strings because they're more complete from a dataframe perspective - Format strings are mostly useful for datetime specification, and @@ -269,8 +269,8 @@ def dtype(self) -> Dtype: categorical in the data buffer. In case of a separate encoding of the categorical (e.g. an integer to string mapping), this can be derived from ``self.describe_categorical``. - - Data types not included: complex, Arrow-style null, binary, decimal, - and nested (list, struct, map, union) dtypes. 
+ - Data types not included: complex, Arrow-style null, binary, + decimal, and nested (list, struct, map, union) dtypes. """ pass @@ -280,16 +280,19 @@ def describe_categorical(self) -> CategoricalDescription: """ If the dtype is categorical, there are two options: - There are only values in the data buffer. - - There is a separate non-categorical Column encoding categorical values. + - There is a separate non-categorical Column encoding categorical + values. Raises TypeError if the dtype is not categorical - Returns the dictionary with description on how to interpret the data buffer: - - "is_ordered" : bool, whether the ordering of dictionary indices is - semantically meaningful. + Returns the dictionary with description on how to interpret the + data buffer: + - "is_ordered" : bool, whether the ordering of dictionary indices + is semantically meaningful. - "is_dictionary" : bool, whether a mapping of categorical values to other objects exists - - "categories" : Column representing the (implicit) mapping of indices - to category values (e.g. an array of cat1, cat2, ...). - None if not a dictionary-style categorical. + - "categories" : Column representing the (implicit) mapping of + indices to category values (e.g. an array of + cat1, cat2, ...). None if not a dictionary-style + categorical. TBD: are there any other in-memory representations that are needed? """ pass @@ -301,8 +304,8 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]: Return the missing value (or "null") representation the column dtype uses, as a tuple ``(kind, value)``. Value : if kind is "sentinel value", the actual value. If kind is a bit - mask or a byte mask, the value (0 or 1) indicating a missing value. None - otherwise. + mask or a byte mask, the value (0 or 1) indicating a missing value. + None otherwise. """ pass @@ -396,8 +399,8 @@ def __dataframe__( mask or byte mask that is the producer's native representation. ``allow_copy`` is a keyword that defines whether or not the library is allowed to make a copy of the data. For example, copying data would be - necessary if a library supports strided buffers, given that this protocol - specifies contiguous buffers. + necessary if a library supports strided buffers, given that this + protocol specifies contiguous buffers. """ pass @@ -482,7 +485,9 @@ def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": pass @abstractmethod - def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["DataFrame"]: + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable["DataFrame"]: """ Return an iterator yielding the chunks. By default (None), yields the chunks that the data is stored as by the diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index 5865280c543..9d05c7ba00d 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -15,14 +15,18 @@ # specific language governing permissions and limitations # under the License. 
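The ``(kind, value)`` pair that ``describe_null`` documents above is easiest to read with concrete cases in mind. The values below illustrate the protocol's options in general; they are not a statement of what the Arrow-backed column in this series ultimately reports:

    from pyarrow.interchange.dataframe_protocol import ColumnNullType

    # Arrow-style validity bitmaps: a cleared bit marks a missing value.
    bitmask_null = (ColumnNullType.USE_BITMASK, 0)
    # Sentinel encodings: the second element is the sentinel itself.
    sentinel_null = (ColumnNullType.USE_SENTINEL, -1)
    # Columns that cannot hold missing values at all.
    non_nullable = (ColumnNullType.NON_NULLABLE, None)
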
-from pyarrow.interchange.column import PyArrowColumn +from typing import ( + Any, +) + from pyarrow.interchange.dataframe_protocol import ( Buffer, Column, - ColumnNullType, DataFrame as DataFrameXchg, DtypeKind, ) + +import numpy as np import pyarrow as pa @@ -99,7 +103,7 @@ def protocol_df_chunk_to_pyarrow(df: DataFrameXchg) -> pa.Table: ): columns[name], buf = primitive_column_to_ndarray(col) elif dtype == DtypeKind.CATEGORICAL: - columns[name], buf = categorical_column_to_series(col) + columns[name], buf = categorical_column_to_dictionary(col) elif dtype == DtypeKind.STRING: columns[name], buf = string_column_to_ndarray(col) elif dtype == DtypeKind.DATETIME: @@ -128,7 +132,9 @@ def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: pass -def categorical_column_to_dictionary(col: Column) -> tuple[pa.ChunkedArray, Any]: +def categorical_column_to_dictionary( + col: Column +) -> tuple[pa.ChunkedArray, Any]: """ Convert a column holding categorical data to a pandas Series. Parameters diff --git a/python/pyarrow/tests/interchange/test_extra.py b/python/pyarrow/tests/interchange/test_extra.py index 5046cecea60..8fd683c8bf5 100644 --- a/python/pyarrow/tests/interchange/test_extra.py +++ b/python/pyarrow/tests/interchange/test_extra.py @@ -15,8 +15,6 @@ # specific language governing permissions and limitations # under the License. -import ctypes - import pandas as pd import pyarrow as pa import pytest @@ -27,6 +25,7 @@ DtypeKind, ) + def test_datetime(): df = pd.DataFrame({"A": [pd.Timestamp("2022-01-01"), pd.NaT]}) table = pa.table(df) @@ -55,9 +54,8 @@ def test_array_to_pyarrowcolumn(test_data, kind): assert arr_column.dtype[0] == kind assert arr_column.num_chunks() == 1 assert arr_column.null_count == 0 - assert arr_column.get_buffers()["validity"] == None + assert arr_column.get_buffers()["validity"] is None assert len(list(arr_column.get_chunks())) == 1 - + for chunk in arr_column.get_chunks(): assert chunk == arr_column - diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py index 97e9302c700..c292c9eefab 100644 --- a/python/pyarrow/tests/interchange/test_interchange_spec.py +++ b/python/pyarrow/tests/interchange/test_interchange_spec.py @@ -20,6 +20,7 @@ import pyarrow as pa import pytest + @pytest.mark.parametrize( "test_data", [ @@ -54,8 +55,8 @@ def test_mixed_dtypes(): } ) df = table.__dataframe__() - # for meanings of dtype[0] see the spec; we cannot import the spec here as this - # file is expected to be vendored *anywhere*; + # for meanings of dtype[0] see the spec; we cannot import the + # spec here as this file is expected to be vendored *anywhere*; # values for dtype[0] are explained above columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21} @@ -140,8 +141,8 @@ def test_get_columns(): for col in df.get_columns(): assert col.size() == 2 assert col.num_chunks() == 1 - # for meanings of dtype[0] see the spec; we cannot import the spec here as this - # file is expected to be vendored *anywhere* + # for meanings of dtype[0] see the spec; we cannot import the + # spec here as this file is expected to be vendored *anywhere* assert df.get_column(0).dtype[0] == 0 # INT assert df.get_column(1).dtype[0] == 2 # FLOAT @@ -159,8 +160,8 @@ def test_buffer(): assert dataBuf.ptr != 0 device, _ = dataBuf.__dlpack_device__() - # for meanings of dtype[0] see the spec; we cannot import the spec here as this - # file is expected to be vendored *anywhere* + # for meanings of dtype[0] see the spec; 
we cannot import the spec + # here as this file is expected to be vendored *anywhere* assert dataDtype[0] == 0 # INT if device == 1: # CPU-only as we're going to directly read memory here From cba43740e2cdc7d72e34ee8d362e94938489317d Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 9 Nov 2022 08:30:56 +0100 Subject: [PATCH 10/29] Make changes to the code to make pa.Table -> pd.DataFrame work for int, float with missing values --- python/pyarrow/interchange/column.py | 6 +++++- python/pyarrow/interchange/dataframe.py | 7 +++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index e4110e51fb4..9953a30eb2e 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -133,7 +133,11 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: try: bit_width = dtype.bit_width except ValueError: # in case of a variable-length strings - bit_width = None + bit_width = 8 + # In case of bool data type, bit_width is 1 and has to be multiplied + # by 8 (why is that not the case for other dtypes?) + if pa.types.is_boolean(dtype): + bit_width *= 8 if pa.types.is_timestamp(dtype): kind = DtypeKind.DATETIME diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index 3c9b0cb1f74..c5b4be9b539 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -63,9 +63,12 @@ def __dataframe__( def metadata(self) -> dict[str, Any]: # The metadata for the data frame, as a dictionary with string keys. # Add schema metadata here (pandas metadata, ot custom metadata) - schema_metadata = {k.decode('utf8'): v.decode('utf8') + if self._df.schema.metadata: + schema_metadata = {k.decode('utf8'): v.decode('utf8') for k, v in self._df.schema.metadata.items()} - return schema_metadata + return schema_metadata + else: + return {} def num_columns(self) -> int: return self._df.num_columns From c02145152ccbdb5c6f8ace6801b9479d31e2c547 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 9 Nov 2022 09:42:48 +0100 Subject: [PATCH 11/29] Correct linter error and add a check for TypedDict import --- python/pyarrow/interchange/dataframe.py | 2 +- python/pyarrow/interchange/dataframe_protocol.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index c5b4be9b539..5eb93abacc3 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -65,7 +65,7 @@ def metadata(self) -> dict[str, Any]: # Add schema metadata here (pandas metadata, ot custom metadata) if self._df.schema.metadata: schema_metadata = {k.decode('utf8'): v.decode('utf8') - for k, v in self._df.schema.metadata.items()} + for k, v in self._df.schema.metadata.items()} return schema_metadata else: return {} diff --git a/python/pyarrow/interchange/dataframe_protocol.py b/python/pyarrow/interchange/dataframe_protocol.py index 13a5d337c2d..aa3637239d1 100644 --- a/python/pyarrow/interchange/dataframe_protocol.py +++ b/python/pyarrow/interchange/dataframe_protocol.py @@ -34,6 +34,13 @@ TypedDict, ) +import sys + +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + class DlpackDeviceType(enum.IntEnum): """Integer enum for device type codes matching DLPack.""" From 7e1e6bd692a61ef87379f564b3a6e10788116929 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 9 Nov 2022 10:21:15 
+0100 Subject: [PATCH 12/29] Use len(...) for the size of the pa.Array/pa.ChunkedArray --- python/pyarrow/interchange/column.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 9953a30eb2e..f6de4de732d 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -88,11 +88,7 @@ def size(self) -> int: """ Size of the column, in elements. """ - if isinstance(self._col, pa.Array): - len = self._col.to_numpy(zero_copy_only=False).size - else: - len = self._col.length() - return len + return len(self._col) @property def offset(self) -> int: From df9b24bc9bfdfa625711c0260ee688ba8d0678af Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Wed, 9 Nov 2022 11:58:09 +0100 Subject: [PATCH 13/29] Add missing annotations import and remove TypedDict leftover --- python/pyarrow/interchange/dataframe_protocol.py | 1 - python/pyarrow/interchange/from_dataframe.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/interchange/dataframe_protocol.py b/python/pyarrow/interchange/dataframe_protocol.py index aa3637239d1..d83dec30495 100644 --- a/python/pyarrow/interchange/dataframe_protocol.py +++ b/python/pyarrow/interchange/dataframe_protocol.py @@ -31,7 +31,6 @@ Optional, Sequence, Tuple, - TypedDict, ) import sys diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index 9d05c7ba00d..f4fcfe45855 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +from __future__ import annotations + from typing import ( Any, ) From 494ffbc6dfcd251cb897dee318ad5fb434304a1d Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 10 Nov 2022 12:31:40 +0100 Subject: [PATCH 14/29] Remove bool bit_width check --- python/pyarrow/interchange/column.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index f6de4de732d..1a35f79c905 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -130,10 +130,6 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: bit_width = dtype.bit_width except ValueError: # in case of a variable-length strings bit_width = 8 - # In case of bool data type, bit_width is 1 and has to be multiplied - # by 8 (why is that not the case for other dtypes?) 
- if pa.types.is_boolean(dtype): - bit_width *= 8 if pa.types.is_timestamp(dtype): kind = DtypeKind.DATETIME From 784d178f59351ffdfe44dfe7b6a342232a2bec71 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Mon, 14 Nov 2022 20:40:04 +0100 Subject: [PATCH 15/29] Change buffer representation of boolean arrays --- python/pyarrow/interchange/column.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 1a35f79c905..ef130765f4e 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -26,6 +26,7 @@ ) import pyarrow as pa +import pyarrow.compute as pc from pyarrow.interchange.buffer import PyArrowBuffer from pyarrow.interchange.dataframe_protocol import (Column, ColumnBuffers, ColumnNullType, DtypeKind, @@ -197,7 +198,7 @@ def describe_categorical(self) -> CategoricalDescription: @property def describe_null(self) -> Tuple[ColumnNullType, Any]: - return ColumnNullType.USE_BYTEMASK, 0 + return ColumnNullType.USE_BITMASK, 0 @property def null_count(self) -> int: @@ -304,11 +305,19 @@ def _get_data_buffer( array = self._col.combine_chunks() else: array = self._col + dtype = self.dtype + + # In case of boolean arrays, cast to uint8 array + # as bit packed buffers are not supported + if pa.types.is_boolean(array.type): + array = pc.cast(array, pa.uint8()) + dtype = PyArrowColumn(array).dtype + n = len(array.buffers()) if n == 2: - return PyArrowBuffer(array.buffers()[1]), self.dtype + return PyArrowBuffer(array.buffers()[1]), dtype elif n == 3: - return PyArrowBuffer(array.buffers()[2]), self.dtype + return PyArrowBuffer(array.buffers()[2]), dtype def _get_validity_buffer(self) -> Tuple[PyArrowBuffer, Any]: """ From 33784dad41b1395cb4b4989ef0ce31c5679ce291 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 15 Nov 2022 11:55:31 +0100 Subject: [PATCH 16/29] Remove dataframe protocol abstract classes and move the docstrings and necessary defenitions to separate implementation files --- python/pyarrow/interchange/buffer.py | 36 +- python/pyarrow/interchange/column.py | 192 ++++++- python/pyarrow/interchange/dataframe.py | 104 +++- .../pyarrow/interchange/dataframe_protocol.py | 506 ------------------ python/pyarrow/table.pxi | 4 +- 5 files changed, 276 insertions(+), 566 deletions(-) delete mode 100644 python/pyarrow/interchange/dataframe_protocol.py diff --git a/python/pyarrow/interchange/buffer.py b/python/pyarrow/interchange/buffer.py index d09fc793810..9f30f2b99e3 100644 --- a/python/pyarrow/interchange/buffer.py +++ b/python/pyarrow/interchange/buffer.py @@ -16,18 +16,35 @@ # under the License. from __future__ import annotations +import enum import pyarrow as pa -from pyarrow.interchange.dataframe_protocol import ( - Buffer, - DlpackDeviceType, -) +class DlpackDeviceType(enum.IntEnum): + """Integer enum for device type codes matching DLPack.""" -class PyArrowBuffer(Buffer): + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + +class _PyArrowBuffer: """ Data in the buffer is guaranteed to be contiguous in memory. + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. 
+ This distinction is useful to support both data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. """ def __init__(self, x: pa.Buffer, allow_copy: bool = True) -> None: @@ -52,13 +69,20 @@ def ptr(self) -> int: def __dlpack__(self): """ - Represent this structure as DLPack interface. + Produce DLPack capsule (see array API standard). + Raises: + - TypeError : if the buffer contains unsupported dtypes. + - NotImplementedError : if DLPack support is not implemented + Useful to have to connect to array libraries. Support optional because + it's not completely trivial to implement for a Python-only library. """ raise NotImplementedError("__dlpack__") def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: """ Device type and device ID for where the data in the buffer resides. + Uses device type codes matching DLPack. + Note: must be implemented even if ``__dlpack__`` is not. """ if self._x.is_cpu: return (DlpackDeviceType.CPU, None) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index ef130765f4e..10c3f7c8c5b 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -17,6 +17,7 @@ from __future__ import annotations +import enum from typing import ( Any, Dict, @@ -25,12 +26,49 @@ Tuple, ) +import sys +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + import pyarrow as pa import pyarrow.compute as pc -from pyarrow.interchange.buffer import PyArrowBuffer -from pyarrow.interchange.dataframe_protocol import (Column, ColumnBuffers, - ColumnNullType, DtypeKind, - CategoricalDescription) +from pyarrow.interchange.buffer import _PyArrowBuffer + + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype + _PYARROW_KINDS = { pa.int8(): (DtypeKind.INT, "c"), @@ -50,6 +88,58 @@ } +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: Tuple[_PyArrowBuffer, Dtype] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. 
+ # None if the null representation is not a bit or byte mask + validity: Optional[Tuple[_PyArrowBuffer, Dtype]] + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: Optional[Tuple[_PyArrowBuffer, Dtype]] + + +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects + # exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. + # categories: Optional[Column] + + class Endianness: """Enum indicating the byte-order of a data-type.""" @@ -63,7 +153,7 @@ class NoBufferPresent(Exception): """Exception to signal that there is no requested buffer.""" -class PyArrowColumn(Column): +class _PyArrowColumn: """ A column object, with only the methods and properties required by the interchange protocol defined. @@ -71,6 +161,31 @@ class PyArrowColumn(Column): buffers - a data buffer, a mask buffer (depending on null representation), and an offsets buffer (if variable-size binary; e.g., variable-length strings). + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? Note: this Column object can only be produced by ``__dataframe__``, so doesn't need its own version or ``__column__`` protocol. """ @@ -88,6 +203,10 @@ def __init__( def size(self) -> int: """ Size of the column, in elements. + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. """ return len(self._col) @@ -95,6 +214,9 @@ def size(self) -> int: def offset(self) -> int: """ Offset of first element. 
+ May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. """ return 0 @@ -104,14 +226,14 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. Bit-width : the number of bits as an integer - Format string : data type description format string in Apache Arrow - C Data Interface format. + Format string : data type description format string in Apache Arrow C + Data Interface format. Endianness : current only native endianness (``=``) is supported Notes: - - Kind specifiers are aligned with DLPack where possible (hence - the jump to 20, leave enough room for future extension) - - Masks must be specified as boolean with either bit width 1 - (for bit masks) or 8 (for byte masks). + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for + bit masks) or 8 (for byte masks). - Dtype width in bits was preferred over bytes - Endianness isn't too useful, but included now in case in the future we need to support non-native endianness @@ -166,18 +288,20 @@ def describe_categorical(self) -> CategoricalDescription: """ If the dtype is categorical, there are two options: - There are only values in the data buffer. - - There is a separate non-categorical Column encoding for categorical + - There is a separate non-categorical Column encoding categorical values. Raises TypeError if the dtype is not categorical - Content of returned dict: + Returns the dictionary with description on how to interpret the + data buffer: - "is_ordered" : bool, whether the ordering of dictionary indices is semantically meaningful. - - "is_dictionary" : bool, whether a dictionary-style mapping of + - "is_dictionary" : bool, whether a mapping of categorical values to other objects exists - "categories" : Column representing the (implicit) mapping of indices to category values (e.g. an array of cat1, cat2, ...). None if not a dictionary-style categorical. + TBD: are there any other in-memory representations that are needed? """ if isinstance(self._col, pa.ChunkedArray): arr = self._col.combine_chunks() @@ -193,24 +317,32 @@ def describe_categorical(self) -> CategoricalDescription: return { "is_ordered": self._col.type.ordered, "is_dictionary": True, - "categories": PyArrowColumn(arr.dictionary), + "categories": _PyArrowColumn(arr.dictionary), } @property def describe_null(self) -> Tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. + None otherwise. + """ return ColumnNullType.USE_BITMASK, 0 @property def null_count(self) -> int: """ - Number of null elements. Should always be known. + Number of null elements, if known. + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. """ return self._col.null_count @property def metadata(self) -> Dict[str, Any]: """ - Store specific metadata of the column. + The metadata for the column. See `DataFrame.metadata` for more details. 
""" pass @@ -224,7 +356,9 @@ def num_chunks(self) -> int: n_chunks = self._col.num_chunks return n_chunks - def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable[_PyArrowColumn]: """ Return an iterator yielding the chunks. See `DataFrame.get_chunks` for details on ``n_chunks``. @@ -241,17 +375,17 @@ def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: i = 0 for start in range(0, chunk_size * n_chunks, chunk_size): - yield PyArrowColumn( + yield _PyArrowColumn( array.slice(start, chunk_size), self._allow_copy ) i += 1 # In case when the size of the chunk is such that the resulting # list is one less chunk then n_chunks -> append an empty chunk if i == n_chunks - 1: - yield PyArrowColumn(pa.array([]), self._allow_copy) + yield _PyArrowColumn(pa.array([]), self._allow_copy) elif isinstance(self._col, pa.ChunkedArray): return [ - PyArrowColumn(chunk, self._allow_copy) + _PyArrowColumn(chunk, self._allow_copy) for chunk in self._col.chunks ] else: @@ -296,7 +430,7 @@ def get_buffers(self) -> ColumnBuffers: def _get_data_buffer( self, - ) -> Tuple[PyArrowBuffer, Any]: # Any is for self.dtype tuple + ) -> Tuple[_PyArrowBuffer, Any]: # Any is for self.dtype tuple """ Return the buffer containing the data and the buffer's associated dtype. @@ -311,15 +445,15 @@ def _get_data_buffer( # as bit packed buffers are not supported if pa.types.is_boolean(array.type): array = pc.cast(array, pa.uint8()) - dtype = PyArrowColumn(array).dtype + dtype = _PyArrowColumn(array).dtype n = len(array.buffers()) if n == 2: - return PyArrowBuffer(array.buffers()[1]), dtype + return _PyArrowBuffer(array.buffers()[1]), dtype elif n == 3: - return PyArrowBuffer(array.buffers()[2]), dtype + return _PyArrowBuffer(array.buffers()[2]), dtype - def _get_validity_buffer(self) -> Tuple[PyArrowBuffer, Any]: + def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. @@ -334,13 +468,13 @@ def _get_validity_buffer(self) -> Tuple[PyArrowBuffer, Any]: array = self._col buff = array.buffers()[0] if buff: - return PyArrowBuffer(buff), dtype + return _PyArrowBuffer(buff), dtype else: raise NoBufferPresent( "There are no missing values so " "does not have a separate mask") - def _get_offsets_buffer(self) -> Tuple[PyArrowBuffer, Any]: + def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) and the buffer's associated dtype. 
@@ -360,4 +494,4 @@ def _get_offsets_buffer(self) -> Tuple[PyArrowBuffer, Any]: elif n == 3: # Define the dtype of the returned buffer dtype = (DtypeKind.INT, 64, "L", Endianness.NATIVE) - return PyArrowBuffer(array.buffers()[2]), dtype + return _PyArrowBuffer(array.buffers()[2]), dtype diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py index 5eb93abacc3..965432dd938 100644 --- a/python/pyarrow/interchange/dataframe.py +++ b/python/pyarrow/interchange/dataframe.py @@ -25,17 +25,20 @@ import pyarrow as pa -from pyarrow.interchange.column import PyArrowColumn -from pyarrow.interchange.dataframe_protocol import DataFrame as DataFrameXchg +from pyarrow.interchange.column import _PyArrowColumn -class TableXchg(DataFrameXchg): +class _PyArrowDataFrame: """ A data frame class, with only the methods required by the interchange protocol defined. - Instances of this (private) class are returned from - ``pd.DataFrame.__dataframe__`` as objects with the methods and - attributes defined on this class. + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. + Columns may be accessed by name or by position. + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. """ def __init__( @@ -56,13 +59,33 @@ def __init__( def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True - ) -> TableXchg: - return TableXchg(self._df, nan_as_null, allow_copy) + ) -> _PyArrowDataFrame: + """ + Construct a new exchange object, potentially changing the parameters. + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN``. + It is intended for cases where the consumer does not support the bit + mask or byte mask that is the producer's native representation. + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this + protocol specifies contiguous buffers. + """ + return _PyArrowDataFrame(self._df, nan_as_null, allow_copy) @property def metadata(self) -> dict[str, Any]: + """ + The metadata for the data frame, as a dictionary with string keys. The + contents of `metadata` may be anything, they are meant for a library + to store information that it needs to, e.g., roundtrip losslessly or + for two implementations to share data that is not (yet) part of the + interchange protocol specification. For avoiding collisions with other + entries, please add name the keys with the name of the library + followed by a period and the desired name, e.g, ``pandas.indexcol``. + """ # The metadata for the data frame, as a dictionary with string keys. - # Add schema metadata here (pandas metadata, ot custom metadata) + # Add schema metadata here (pandas metadata or custom metadata) if self._df.schema.metadata: schema_metadata = {k.decode('utf8'): v.decode('utf8') for k, v in self._df.schema.metadata.items()} @@ -71,46 +94,81 @@ def metadata(self) -> dict[str, Any]: return {} def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. + """ return self._df.num_columns def num_rows(self) -> int: + """ + Return the number of rows in the DataFrame, if available. 
+ """ return self._df.num_rows def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of. + """ return self._df.column(0).num_chunks def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + """ return self._df.column_names - def get_column(self, i: int) -> PyArrowColumn: - return PyArrowColumn(self._df.column(i), - allow_copy=self._allow_copy) + def get_column(self, i: int) -> _PyArrowColumn: + """ + Return the column at the indicated position. + """ + return _PyArrowColumn(self._df.column(i), + allow_copy=self._allow_copy) - def get_column_by_name(self, name: str) -> PyArrowColumn: - return PyArrowColumn(self._df.column(name), - allow_copy=self._allow_copy) + def get_column_by_name(self, name: str) -> _PyArrowColumn: + """ + Return the column whose name is the indicated name. + """ + return _PyArrowColumn(self._df.column(name), + allow_copy=self._allow_copy) - def get_columns(self) -> Iterable[PyArrowColumn]: + def get_columns(self) -> Iterable[_PyArrowColumn]: + """ + Return an iterator yielding the columns. + """ return [ - PyArrowColumn(col, allow_copy=self._allow_copy) + _PyArrowColumn(col, allow_copy=self._allow_copy) for col in self._df.columns ] - def select_columns(self, indices: Sequence[int]) -> TableXchg: - return TableXchg( + def select_columns(self, indices: Sequence[int]) -> _PyArrowDataFrame: + """ + Create a new DataFrame by selecting a subset of columns by index. + """ + return _PyArrowDataFrame( self._df.select(list(indices)), self._nan_as_null, self._allow_copy ) - def select_columns_by_name(self, names: Sequence[str]) -> TableXchg: - return TableXchg( + def select_columns_by_name( + self, names: Sequence[str] + ) -> _PyArrowDataFrame: + """ + Create a new DataFrame by selecting a subset of columns by name. + """ + return _PyArrowDataFrame( self._df.select(list(names)), self._nan_as_null, self._allow_copy ) def get_chunks( self, n_chunks: Optional[int] = None - ) -> Iterable[TableXchg]: + ) -> Iterable[_PyArrowDataFrame]: """ Return an iterator yielding the chunks. + By default (None), yields the chunks that the data is stored as by the + producer. If given, ``n_chunks`` must be a multiple of + ``self.num_chunks()``, meaning the producer must subdivide each chunk + before yielding it. + Note that the producer must ensure that all columns are chunked the + same way. """ if n_chunks and n_chunks > 1: chunk_size = self.num_rows() // n_chunks @@ -124,7 +182,7 @@ def get_chunks( else: batches = self._df.to_batches() - iterator_tables = [TableXchg( + iterator_tables = [_PyArrowDataFrame( pa.Table.from_batches([batch]), self._nan_as_null, self._allow_copy ) for batch in batches diff --git a/python/pyarrow/interchange/dataframe_protocol.py b/python/pyarrow/interchange/dataframe_protocol.py deleted file mode 100644 index d83dec30495..00000000000 --- a/python/pyarrow/interchange/dataframe_protocol.py +++ /dev/null @@ -1,506 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Copy of the spec from https://github.com/data-apis/dataframe-api -""" - -from abc import ( - ABC, - abstractmethod, -) -import enum -from typing import ( - Any, - Dict, - Iterable, - Optional, - Sequence, - Tuple, -) - -import sys - -if sys.version_info >= (3, 8): - from typing import TypedDict -else: - from typing_extensions import TypedDict - - -class DlpackDeviceType(enum.IntEnum): - """Integer enum for device type codes matching DLPack.""" - - CPU = 1 - CUDA = 2 - CPU_PINNED = 3 - OPENCL = 4 - VULKAN = 7 - METAL = 8 - VPI = 9 - ROCM = 10 - - -class DtypeKind(enum.IntEnum): - """ - Integer enum for data types. - Attributes - ---------- - INT : int - Matches to signed integer data type. - UINT : int - Matches to unsigned integer data type. - FLOAT : int - Matches to floating point data type. - BOOL : int - Matches to boolean data type. - STRING : int - Matches to string data type (UTF-8 encoded). - DATETIME : int - Matches to datetime data type. - CATEGORICAL : int - Matches to categorical data type. - """ - - INT = 0 - UINT = 1 - FLOAT = 2 - BOOL = 20 - STRING = 21 # UTF-8 - DATETIME = 22 - CATEGORICAL = 23 - - -Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype - - -class ColumnNullType(enum.IntEnum): - """ - Integer enum for null type representation. - Attributes - ---------- - NON_NULLABLE : int - Non-nullable column. - USE_NAN : int - Use explicit float NaN value. - USE_SENTINEL : int - Sentinel value besides NaN. - USE_BITMASK : int - The bit is set/unset representing a null on a certain position. - USE_BYTEMASK : int - The byte is set/unset representing a null on a certain position. - """ - - NON_NULLABLE = 0 - USE_NAN = 1 - USE_SENTINEL = 2 - USE_BITMASK = 3 - USE_BYTEMASK = 4 - - -class ColumnBuffers(TypedDict): - # first element is a buffer containing the column data; - # second element is the data buffer's associated dtype - data: Tuple["Buffer", Dtype] - - # first element is a buffer containing mask values indicating missing data; - # second element is the mask value buffer's associated dtype. - # None if the null representation is not a bit or byte mask - validity: Optional[Tuple["Buffer", Dtype]] - - # first element is a buffer containing the offset values for - # variable-size binary data (e.g., variable-length strings); - # second element is the offsets buffer's associated dtype. - # None if the data buffer does not have an associated offsets buffer - offsets: Optional[Tuple["Buffer", Dtype]] - - -class CategoricalDescription(TypedDict): - # whether the ordering of dictionary indices is semantically meaningful - is_ordered: bool - # whether a dictionary-style mapping of categorical values to other objects - # exists - is_dictionary: bool - # Python-level only (e.g. ``{int: str}``). - # None if not a dictionary-style categorical. - # categories: Optional[Column] - - -class Buffer(ABC): - """ - Data in the buffer is guaranteed to be contiguous in memory. - Note that there is no dtype attribute present, a buffer can be thought of - as simply a block of memory. 
However, if the column that the buffer is - attached to has a dtype that's supported by DLPack and ``__dlpack__`` is - implemented, then that dtype information will be contained in the return - value from ``__dlpack__``. - This distinction is useful to support both data exchange via DLPack on a - buffer and (b) dtypes like variable-length strings which do not have a - fixed number of bytes per element. - """ - - @property - @abstractmethod - def bufsize(self) -> int: - """ - Buffer size in bytes. - """ - pass - - @property - @abstractmethod - def ptr(self) -> int: - """ - Pointer to start of the buffer as an integer. - """ - pass - - @abstractmethod - def __dlpack__(self): - """ - Produce DLPack capsule (see array API standard). - Raises: - - TypeError : if the buffer contains unsupported dtypes. - - NotImplementedError : if DLPack support is not implemented - Useful to have to connect to array libraries. Support optional because - it's not completely trivial to implement for a Python-only library. - """ - raise NotImplementedError("__dlpack__") - - @abstractmethod - def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]: - """ - Device type and device ID for where the data in the buffer resides. - Uses device type codes matching DLPack. - Note: must be implemented even if ``__dlpack__`` is not. - """ - pass - - -class Column(ABC): - """ - A column object, with only the methods and properties required by the - interchange protocol defined. - A column can contain one or more chunks. Each chunk can contain up to three - buffers - a data buffer, a mask buffer (depending on null representation), - and an offsets buffer (if variable-size binary; e.g., variable-length - strings). - TBD: Arrow has a separate "null" dtype, and has no separate mask concept. - Instead, it seems to use "children" for both columns with a bit mask, - and for nested dtypes. Unclear whether this is elegant or confusing. - This design requires checking the null representation explicitly. - The Arrow design requires checking: - 1. the ARROW_FLAG_NULLABLE (for sentinel values) - 2. if a column has two children, combined with one of those children - having a null dtype. - Making the mask concept explicit seems useful. One null dtype would - not be enough to cover both bit and byte masks, so that would mean - even more checking if we did it the Arrow way. - TBD: there's also the "chunk" concept here, which is implicit in Arrow as - multiple buffers per array (= column here). Semantically it may make - sense to have both: chunks were meant for example for lazy evaluation - of data which doesn't fit in memory, while multiple buffers per column - could also come from doing a selection operation on a single - contiguous buffer. - Given these concepts, one would expect chunks to be all of the same - size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), - while multiple buffers could have data-dependent lengths. Not an issue - in pandas if one column is backed by a single NumPy array, but in - Arrow it seems possible. - Are multiple chunks *and* multiple buffers per column necessary for - the purposes of this interchange protocol, or must producers either - reuse the chunk concept for this or copy the data? - Note: this Column object can only be produced by ``__dataframe__``, so - doesn't need its own version or ``__column__`` protocol. - """ - - @abstractmethod - def size(self) -> int: - """ - Size of the column, in elements. 
- Corresponds to DataFrame.num_rows() if column is a single chunk; - equal to size of this current chunk otherwise. - Is a method rather than a property because it may cause a (potentially - expensive) computation for some dataframe implementations. - """ - pass - - @property - @abstractmethod - def offset(self) -> int: - """ - Offset of first element. - May be > 0 if using chunks; for example for a column with N chunks of - equal size M (only the last chunk may be shorter), - ``offset = n * M``, ``n = 0 .. N-1``. - """ - pass - - @property - @abstractmethod - def dtype(self) -> Dtype: - """ - Dtype description as a tuple ``(kind, bit-width, format string, - endianness)``. - Bit-width : the number of bits as an integer - Format string : data type description format string in Apache Arrow C - Data Interface format. - Endianness : current only native endianness (``=``) is supported - Notes: - - Kind specifiers are aligned with DLPack where possible (hence the - jump to 20, leave enough room for future extension) - - Masks must be specified as boolean with either bit width 1 (for - bit masks) or 8 (for byte masks). - - Dtype width in bits was preferred over bytes - - Endianness isn't too useful, but included now in case in the - future we need to support non-native endianness - - Went with Apache Arrow format strings over NumPy format strings - because they're more complete from a dataframe perspective - - Format strings are mostly useful for datetime specification, and - for categoricals. - - For categoricals, the format string describes the type of the - categorical in the data buffer. In case of a separate encoding of - the categorical (e.g. an integer to string mapping), this can - be derived from ``self.describe_categorical``. - - Data types not included: complex, Arrow-style null, binary, - decimal, and nested (list, struct, map, union) dtypes. - """ - pass - - @property - @abstractmethod - def describe_categorical(self) -> CategoricalDescription: - """ - If the dtype is categorical, there are two options: - - There are only values in the data buffer. - - There is a separate non-categorical Column encoding categorical - values. - Raises TypeError if the dtype is not categorical - Returns the dictionary with description on how to interpret the - data buffer: - - "is_ordered" : bool, whether the ordering of dictionary indices - is semantically meaningful. - - "is_dictionary" : bool, whether a mapping of - categorical values to other objects exists - - "categories" : Column representing the (implicit) mapping of - indices to category values (e.g. an array of - cat1, cat2, ...). None if not a dictionary-style - categorical. - TBD: are there any other in-memory representations that are needed? - """ - pass - - @property - @abstractmethod - def describe_null(self) -> Tuple[ColumnNullType, Any]: - """ - Return the missing value (or "null") representation the column dtype - uses, as a tuple ``(kind, value)``. - Value : if kind is "sentinel value", the actual value. If kind is a bit - mask or a byte mask, the value (0 or 1) indicating a missing value. - None otherwise. - """ - pass - - @property - @abstractmethod - def null_count(self) -> Optional[int]: - """ - Number of null elements, if known. - Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. - """ - pass - - @property - @abstractmethod - def metadata(self) -> Dict[str, Any]: - """ - The metadata for the column. See `DataFrame.metadata` for more details. 
- """ - pass - - @abstractmethod - def num_chunks(self) -> int: - """ - Return the number of chunks the column consists of. - """ - pass - - @abstractmethod - def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: - """ - Return an iterator yielding the chunks. - See `DataFrame.get_chunks` for details on ``n_chunks``. - """ - pass - - @abstractmethod - def get_buffers(self) -> ColumnBuffers: - """ - Return a dictionary containing the underlying buffers. - The returned dictionary has the following contents: - - "data": a two-element tuple whose first element is a buffer - containing the data and whose second element is the data - buffer's associated dtype. - - "validity": a two-element tuple whose first element is a buffer - containing mask values indicating missing data and - whose second element is the mask value buffer's - associated dtype. None if the null representation is - not a bit or byte mask. - - "offsets": a two-element tuple whose first element is a buffer - containing the offset values for variable-size binary - data (e.g., variable-length strings) and whose second - element is the offsets buffer's associated dtype. None - if the data buffer does not have an associated offsets - buffer. - """ - pass - - -# def get_children(self) -> Iterable[Column]: -# """ -# Children columns underneath the column, each object in this iterator -# must adhere to the column specification. -# """ -# pass - - -class DataFrame(ABC): - """ - A data frame class, with only the methods required by the interchange - protocol defined. - A "data frame" represents an ordered collection of named columns. - A column's "name" must be a unique string. - Columns may be accessed by name or by position. - This could be a public data frame class, or an object with the methods and - attributes defined on this DataFrame class could be returned from the - ``__dataframe__`` method of a public data frame class in a library adhering - to the dataframe interchange protocol specification. - """ - - version = 0 # version of the protocol - - @abstractmethod - def __dataframe__( - self, nan_as_null: bool = False, allow_copy: bool = True - ) -> "DataFrame": - """ - Construct a new exchange object, potentially changing the parameters. - ``nan_as_null`` is a keyword intended for the consumer to tell the - producer to overwrite null values in the data with ``NaN``. - It is intended for cases where the consumer does not support the bit - mask or byte mask that is the producer's native representation. - ``allow_copy`` is a keyword that defines whether or not the library is - allowed to make a copy of the data. For example, copying data would be - necessary if a library supports strided buffers, given that this - protocol specifies contiguous buffers. - """ - pass - - @property - @abstractmethod - def metadata(self) -> Dict[str, Any]: - """ - The metadata for the data frame, as a dictionary with string keys. The - contents of `metadata` may be anything, they are meant for a library - to store information that it needs to, e.g., roundtrip losslessly or - for two implementations to share data that is not (yet) part of the - interchange protocol specification. For avoiding collisions with other - entries, please add name the keys with the name of the library - followed by a period and the desired name, e.g, ``pandas.indexcol``. - """ - pass - - @abstractmethod - def num_columns(self) -> int: - """ - Return the number of columns in the DataFrame. 
- """ - pass - - @abstractmethod - def num_rows(self) -> Optional[int]: - # TODO: not happy with Optional, but need to flag it may be expensive - # why include it if it may be None - what do we expect consumers - # to do here? - """ - Return the number of rows in the DataFrame, if available. - """ - pass - - @abstractmethod - def num_chunks(self) -> int: - """ - Return the number of chunks the DataFrame consists of. - """ - pass - - @abstractmethod - def column_names(self) -> Iterable[str]: - """ - Return an iterator yielding the column names. - """ - pass - - @abstractmethod - def get_column(self, i: int) -> Column: - """ - Return the column at the indicated position. - """ - pass - - @abstractmethod - def get_column_by_name(self, name: str) -> Column: - """ - Return the column whose name is the indicated name. - """ - pass - - @abstractmethod - def get_columns(self) -> Iterable[Column]: - """ - Return an iterator yielding the columns. - """ - pass - - @abstractmethod - def select_columns(self, indices: Sequence[int]) -> "DataFrame": - """ - Create a new DataFrame by selecting a subset of columns by index. - """ - pass - - @abstractmethod - def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": - """ - Create a new DataFrame by selecting a subset of columns by name. - """ - pass - - @abstractmethod - def get_chunks( - self, n_chunks: Optional[int] = None - ) -> Iterable["DataFrame"]: - """ - Return an iterator yielding the chunks. - By default (None), yields the chunks that the data is stored as by the - producer. If given, ``n_chunks`` must be a multiple of - ``self.num_chunks()``, meaning the producer must subdivide each chunk - before yielding it. - Note that the producer must ensure that all columns are chunked the - same way. - """ - pass diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 25f498cb0dd..94d8b6b6487 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -2833,9 +2833,9 @@ cdef class Table(_PandasConvertible): dtypes is added, this value should be propagated to columns. """ - from pyarrow.interchange.dataframe import TableXchg + from pyarrow.interchange.dataframe import _PyArrowDataFrame - return TableXchg(self, nan_as_null, allow_copy) + return _PyArrowDataFrame(self, nan_as_null, allow_copy) # ---------------------------------------------------------------------- From 2860911ab671368f1f8004e3f4a9ab3cbd4cec12 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 15 Nov 2022 13:06:31 +0100 Subject: [PATCH 17/29] Add missing changes to the class names and references --- python/pyarrow/interchange/from_dataframe.py | 36 +++++++++---------- .../pyarrow/tests/interchange/test_extra.py | 6 ++-- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index f4fcfe45855..6c5d576d467 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -21,10 +21,10 @@ Any, ) -from pyarrow.interchange.dataframe_protocol import ( - Buffer, - Column, - DataFrame as DataFrameXchg, +from pyarrow.interchange.buffer import _PyArrowBuffer +from pyarrow.interchange.column import _PyArrowColumn +from pyarrow.interchange.dataframe import ( + _PyArrowDataFrame, DtypeKind, ) @@ -39,7 +39,7 @@ def from_dataframe(df, allow_copy=True) -> pa.Table: Parameters ---------- - df : DataFrameXchg + df : _PyArrowDataFrame Object supporting the interchange protocol, i.e. `__dataframe__` method. 
allow_copy : bool, default: True @@ -58,12 +58,12 @@ def from_dataframe(df, allow_copy=True) -> pa.Table: return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) -def _from_dataframe(df: DataFrameXchg, allow_copy=True): +def _from_dataframe(df: _PyArrowDataFrame, allow_copy=True): """ Build a ``pa.Table`` from the DataFrame interchange object. Parameters ---------- - df : DataFrameXchg + df : _PyArrowDataFrame Object supporting the interchange protocol, i.e. `__dataframe__` method. allow_copy : bool, default: True @@ -76,12 +76,12 @@ def _from_dataframe(df: DataFrameXchg, allow_copy=True): pass -def protocol_df_chunk_to_pyarrow(df: DataFrameXchg) -> pa.Table: +def protocol_df_chunk_to_pyarrow(df: _PyArrowDataFrame) -> pa.Table: """ Convert interchange protocol chunk to ``pd.DataFrame``. Parameters ---------- - df : DataFrameXchg + df : _PyArrowDataFrame Returns ------- pa.Table @@ -118,7 +118,7 @@ def protocol_df_chunk_to_pyarrow(df: DataFrameXchg) -> pa.Table: pass -def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: +def primitive_column_to_ndarray(col: _PyArrowColumn) -> tuple[np.ndarray, Any]: """ Convert a column holding one of the primitive dtypes to a NumPy array. A primitive type is one of: int, uint, float, bool. @@ -135,7 +135,7 @@ def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: def categorical_column_to_dictionary( - col: Column + col: _PyArrowColumn ) -> tuple[pa.ChunkedArray, Any]: """ Convert a column holding categorical data to a pandas Series. @@ -151,7 +151,7 @@ def categorical_column_to_dictionary( pass -def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: +def string_column_to_ndarray(col: _PyArrowColumn) -> tuple[np.ndarray, Any]: """ Convert a column holding string data to a NumPy array. Parameters @@ -171,7 +171,7 @@ def parse_datetime_format_str(format_str, data): pass -def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: +def datetime_column_to_ndarray(col: _PyArrowColumn) -> tuple[np.ndarray, Any]: """ Convert a column holding DateTime data to a NumPy array. Parameters @@ -187,7 +187,7 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: def buffer_to_ndarray( - buffer: Buffer, + buffer: _PyArrowBuffer, dtype: tuple[DtypeKind, int, str, str], offset: int = 0, length: int | None = None, @@ -239,8 +239,8 @@ def bitmask_to_bool_ndarray( def set_nulls( data: np.ndarray | pa.Array | pa.ChunkedArray, - col: Column, - validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None, + col: _PyArrowColumn, + validity: tuple[_PyArrowBuffer, tuple[DtypeKind, int, str, str]] | None, allow_modify_inplace: bool = True, ): """ @@ -249,9 +249,9 @@ def set_nulls( ---------- data : np.ndarray, pa.Array or pa.ChunkedArray, Data to set nulls in. - col : Column + col : _PyArrowColumn Column object that describes the `data`. - validity : tuple(Buffer, dtype) or None + validity : tuple(_PyArrowBuffer, dtype) or None The return value of ``col.buffers()``. We do not access the ``col.buffers()`` here to not take the ownership of the memory of buffer objects. 
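[Illustrative sketch, not part of the patch: the consumer-side entry point these stubs build towards. It assumes ``from_dataframe`` ends up exported as ``pyarrow.interchange.from_dataframe`` and that the conversion helpers above are eventually filled in; pandas is used here only as an example producer of ``__dataframe__`` objects.]

import pandas as pd
import pyarrow.interchange as pi

pandas_df = pd.DataFrame({"ints": [1, 2, None], "strs": ["a", "b", "c"]})
# Rebuild any object exposing __dataframe__ as a pyarrow Table
table = pi.from_dataframe(pandas_df, allow_copy=True)
print(table.schema)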
diff --git a/python/pyarrow/tests/interchange/test_extra.py b/python/pyarrow/tests/interchange/test_extra.py index 8fd683c8bf5..8028abac118 100644 --- a/python/pyarrow/tests/interchange/test_extra.py +++ b/python/pyarrow/tests/interchange/test_extra.py @@ -19,8 +19,8 @@ import pyarrow as pa import pytest -from pyarrow.interchange.column import PyArrowColumn -from pyarrow.interchange.dataframe_protocol import ( +from pyarrow.interchange.column import ( + _PyArrowColumn, ColumnNullType, DtypeKind, ) @@ -47,7 +47,7 @@ def test_datetime(): ) def test_array_to_pyarrowcolumn(test_data, kind): arr = pa.array(test_data) - arr_column = PyArrowColumn(arr) + arr_column = _PyArrowColumn(arr) assert arr_column._col == arr assert arr_column.size() == len(test_data) From 92a176597064710dc64474ebd2ea8b61c93c50e3 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 15 Nov 2022 13:23:22 +0100 Subject: [PATCH 18/29] Add ColumnNullType = non nullable for columns without missing values --- python/pyarrow/interchange/column.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 10c3f7c8c5b..5de4f658e6d 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -329,7 +329,13 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]: mask or a byte mask, the value (0 or 1) indicating a missing value. None otherwise. """ - return ColumnNullType.USE_BITMASK, 0 + # In case of no missing values, we need to set ColumnNullType to + # non nullable as in the current __dataframe__ protocol bit/byte masks + # can not be None + if self.null_count == 0: + return ColumnNullType.NON_NULLABLE, None + else: + return ColumnNullType.USE_BITMASK, 0 @property def null_count(self) -> int: From 95f7f45324dd5bd481af46c773a74f4f26136f0a Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 15 Nov 2022 13:27:00 +0100 Subject: [PATCH 19/29] Correct test error after describe_null() change --- python/pyarrow/tests/interchange/test_extra.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/interchange/test_extra.py b/python/pyarrow/tests/interchange/test_extra.py index 8028abac118..d9eeb761866 100644 --- a/python/pyarrow/tests/interchange/test_extra.py +++ b/python/pyarrow/tests/interchange/test_extra.py @@ -34,7 +34,7 @@ def test_datetime(): assert col.size() == 2 assert col.null_count == 1 assert col.dtype[0] == DtypeKind.DATETIME - assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0) + assert col.describe_null == (ColumnNullType.USE_BITMASK, 0) @pytest.mark.parametrize( From 964e9da8cae15ddbfe8da30a48d40ee536390310 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Tue, 15 Nov 2022 14:20:47 +0100 Subject: [PATCH 20/29] Change DtypeKind to be imported from column.py --- python/pyarrow/interchange/from_dataframe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index 6c5d576d467..81c4dd115be 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -22,11 +22,11 @@ ) from pyarrow.interchange.buffer import _PyArrowBuffer -from pyarrow.interchange.column import _PyArrowColumn -from pyarrow.interchange.dataframe import ( - _PyArrowDataFrame, +from pyarrow.interchange.column import ( + _PyArrowColumn, DtypeKind, ) +from pyarrow.interchange.dataframe import _PyArrowDataFrame import numpy as np 
import pyarrow as pa From 3658088bb2647411666a33bf17a3af4b4ec74d9f Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 17 Nov 2022 15:13:58 +0100 Subject: [PATCH 21/29] Add change for string dtype and bitmask - not sure about it though --- python/pyarrow/interchange/column.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 5de4f658e6d..5bcf9affb9a 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -200,6 +200,7 @@ def __init__( self._col = column self._allow_copy = allow_copy + @property def size(self) -> int: """ Size of the column, in elements. @@ -467,7 +468,7 @@ def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]: mask. """ # Define the dtype of the returned buffer - dtype = (DtypeKind.BOOL, 8, "b", Endianness.NATIVE) + dtype = (DtypeKind.BOOL, 1, "b", Endianness.NATIVE) if isinstance(self._col, pa.ChunkedArray): array = self._col.combine_chunks() else: @@ -499,5 +500,5 @@ def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]: ) elif n == 3: # Define the dtype of the returned buffer - dtype = (DtypeKind.INT, 64, "L", Endianness.NATIVE) - return _PyArrowBuffer(array.buffers()[2]), dtype + dtype = (DtypeKind.INT, 32, "i", Endianness.NATIVE) + return _PyArrowBuffer(array.buffers()[1]), dtype From caefeed1d651d6c194adb2f0f87c44405898ee34 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 17 Nov 2022 15:38:34 +0100 Subject: [PATCH 22/29] Add a change for dictionary arrays --- python/pyarrow/interchange/column.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 5bcf9affb9a..2f9d3f9b513 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -453,6 +453,12 @@ def _get_data_buffer( if pa.types.is_boolean(array.type): array = pc.cast(array, pa.uint8()) dtype = _PyArrowColumn(array).dtype + # In case of dictionary arrays, use indices + # to define a buffer, codes are transferred through + # describe_categorical() + if pa.types.is_dictionary(array.type): + array = array.indices + dtype = _PyArrowColumn(array).dtype n = len(array.buffers()) if n == 2: From 8871d117da9f8db9b49282dc1ff3a4f0ce3bb08e Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 17 Nov 2022 15:56:13 +0100 Subject: [PATCH 23/29] Add corrections for timestamp dtype --- python/pyarrow/interchange/column.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index 2f9d3f9b513..4864aabf984 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -257,7 +257,9 @@ def dtype(self) -> Tuple[DtypeKind, int, str, str]: if pa.types.is_timestamp(dtype): kind = DtypeKind.DATETIME - f_string = "ts{dtype.unit}:{dtype.tz}" + ts = dtype.unit[0] + tz = dtype.tz if dtype.tz else "" + f_string = "ts{ts}:{tz}".format(ts=ts, tz=tz) return kind, bit_width, f_string, Endianness.NATIVE elif pa.types.is_dictionary(dtype): kind = DtypeKind.CATEGORICAL From ad9b2e8c4041ce47cff99d31c6bcac7118f719ee Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 17 Nov 2022 17:22:20 +0100 Subject: [PATCH 24/29] Change size() to size --- python/pyarrow/interchange/column.py | 4 ++-- python/pyarrow/tests/interchange/test_extra.py | 4 ++-- python/pyarrow/tests/interchange/test_interchange_spec.py | 8 ++++---- 3 files changed, 8 insertions(+), 8 
diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py
index 4864aabf984..5ee5b173155 100644
--- a/python/pyarrow/interchange/column.py
+++ b/python/pyarrow/interchange/column.py
@@ -373,8 +373,8 @@ def get_chunks(
         See `DataFrame.get_chunks` for details on ``n_chunks``.
         """
         if n_chunks and n_chunks > 1:
-            chunk_size = self.size() // n_chunks
-            if self.size() % n_chunks != 0:
+            chunk_size = self.size // n_chunks
+            if self.size % n_chunks != 0:
                 chunk_size += 1
 
             if isinstance(self._col, pa.ChunkedArray):
diff --git a/python/pyarrow/tests/interchange/test_extra.py b/python/pyarrow/tests/interchange/test_extra.py
index d9eeb761866..00adce005dc 100644
--- a/python/pyarrow/tests/interchange/test_extra.py
+++ b/python/pyarrow/tests/interchange/test_extra.py
@@ -31,7 +31,7 @@ def test_datetime():
     table = pa.table(df)
     col = table.__dataframe__().get_column_by_name("A")
 
-    assert col.size() == 2
+    assert col.size == 2
     assert col.null_count == 1
     assert col.dtype[0] == DtypeKind.DATETIME
     assert col.describe_null == (ColumnNullType.USE_BITMASK, 0)
@@ -50,7 +50,7 @@ def test_array_to_pyarrowcolumn(test_data, kind):
     arr_column = _PyArrowColumn(arr)
 
     assert arr_column._col == arr
-    assert arr_column.size() == len(test_data)
+    assert arr_column.size == len(test_data)
    assert arr_column.dtype[0] == kind
     assert arr_column.num_chunks() == 1
     assert arr_column.null_count == 0
diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py
index c292c9eefab..425e8f7f95d 100644
--- a/python/pyarrow/tests/interchange/test_interchange_spec.py
+++ b/python/pyarrow/tests/interchange/test_interchange_spec.py
@@ -39,7 +39,7 @@ def test_only_one_dtype(test_data):
         null_count = df.get_column_by_name(column).null_count
         assert null_count == 0
         assert isinstance(null_count, int)
-        assert df.get_column_by_name(column).size() == column_size
+        assert df.get_column_by_name(column).size == column_size
         assert df.get_column_by_name(column).offset == 0
 
 
@@ -64,7 +64,7 @@ def test_mixed_dtypes():
         col = df.get_column_by_name(column)
         assert col.null_count == 0
         assert isinstance(col.null_count, int)
-        assert col.size() == 3
+        assert col.size == 3
         assert col.offset == 0
         assert col.dtype[0] == kind
 
@@ -132,14 +132,14 @@ def test_column_get_chunks(size, n_chunks):
     df = table.__dataframe__()
     chunks = list(df.get_column(0).get_chunks(n_chunks))
     assert len(chunks) == n_chunks
-    assert sum(chunk.size() for chunk in chunks) == size
+    assert sum(chunk.size for chunk in chunks) == size
 
 
 def test_get_columns():
     table = pa.table({"a": [0, 1], "b": [2.5, 3.5]})
     df = table.__dataframe__()
     for col in df.get_columns():
-        assert col.size() == 2
+        assert col.size == 2
         assert col.num_chunks() == 1
     # for meanings of dtype[0] see the spec; we cannot import the
     # spec here as this file is expected to be vendored *anywhere*

From 2b83dd8cf188b1b38914cfcda5b75da080558207 Mon Sep 17 00:00:00 2001
From: Alenka Frim
Date: Tue, 22 Nov 2022 14:10:01 +0100
Subject: [PATCH 25/29] Add schema to empty record batch and keep the number of chunks fixed to n_chunks

---
 python/pyarrow/interchange/column.py    | 5 +----
 python/pyarrow/interchange/dataframe.py | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py
index 5ee5b173155..dbdb63193fa 100644
--- a/python/pyarrow/interchange/column.py
+++ b/python/pyarrow/interchange/column.py
@@ -388,10 +388,7 @@ def get_chunks(
                     array.slice(start, chunk_size), self._allow_copy
                 )
                 i += 1
-            # In case when the size of the chunk is such that the resulting
-            # list is one less chunk then n_chunks -> append an empty chunk
-            if i == n_chunks - 1:
-                yield _PyArrowColumn(pa.array([]), self._allow_copy)
+
         elif isinstance(self._col, pa.ChunkedArray):
             return [
                 _PyArrowColumn(chunk, self._allow_copy)
diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py
index 965432dd938..36b669f4358 100644
--- a/python/pyarrow/interchange/dataframe.py
+++ b/python/pyarrow/interchange/dataframe.py
@@ -178,7 +178,7 @@ def get_chunks(
             # In case when the size of the chunk is such that the resulting
             # list is one less chunk then n_chunks -> append an empty chunk
             if len(batches) == n_chunks - 1:
-                batches.append(pa.record_batch([]))
+                batches.append(pa.record_batch([[]], schema = self._df.schema))
         else:
             batches = self._df.to_batches()
 

From 4f150efcf930b4527034a572899322d29dfca3c4 Mon Sep 17 00:00:00 2001
From: Alenka Frim
Date: Tue, 22 Nov 2022 14:37:53 +0100
Subject: [PATCH 26/29] Add offset for sliced array with a test and use datetime instead of pandas timestamp in the tests

---
 python/pyarrow/interchange/column.py          |  7 ++++-
 .../pyarrow/tests/interchange/test_extra.py   | 26 +++++++++++++++++--
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py
index dbdb63193fa..a5c9d4a2833 100644
--- a/python/pyarrow/interchange/column.py
+++ b/python/pyarrow/interchange/column.py
@@ -219,7 +219,12 @@ def offset(self) -> int:
         equal size M (only the last chunk may be shorter),
         ``offset = n * M``, ``n = 0 .. N-1``.
         """
-        return 0
+        if isinstance(self._col, pa.Array):
+            return self._col.offset
+        else:
+            # ChunkedArray gets copied with `combine_chunks` so the offset will
+            # always be 0
+            return 0
 
     @property
     def dtype(self) -> Tuple[DtypeKind, int, str, str]:
diff --git a/python/pyarrow/tests/interchange/test_extra.py b/python/pyarrow/tests/interchange/test_extra.py
index 00adce005dc..68368ebaf2d 100644
--- a/python/pyarrow/tests/interchange/test_extra.py
+++ b/python/pyarrow/tests/interchange/test_extra.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
-import pandas as pd
+from datetime import datetime as dt
 
 import pyarrow as pa
 import pytest
@@ -25,9 +25,16 @@
     DtypeKind,
 )
 
+try:
+    import pandas as pd
+    import pandas.testing as tm
+    from pandas.core.interchange.from_dataframe import from_dataframe
+except ImportError:
+    pass
+
 
 def test_datetime():
-    df = pd.DataFrame({"A": [pd.Timestamp("2022-01-01"), pd.NaT]})
+    df = pd.DataFrame({"A": [dt(2007, 7, 13), None]})
     table = pa.table(df)
     col = table.__dataframe__().get_column_by_name("A")
 
@@ -59,3 +66,18 @@ def test_array_to_pyarrowcolumn(test_data, kind):
 
     for chunk in arr_column.get_chunks():
         assert chunk == arr_column
+
+
+@pytest.mark.pandas
+def test_offset_of_sliced_array():
+    arr = pa.array([1, 2, 3, 4])
+    arr_sliced = arr.slice(2, 2)
+
+    table = pa.table([arr], names = ["arr"])
+    table_sliced = pa.table([arr_sliced], names = ["arr_sliced"])
+
+    df = from_dataframe(table)
+    df_sliced = from_dataframe(table_sliced)
+
+    tm.assert_series_equal(df["arr"][2:4], df_sliced["arr_sliced"],
+                           check_index=False, check_names=False)

From 1a456fee50f6ffbc10d880ee1dd980fd233f2d00 Mon Sep 17 00:00:00 2001
From: Alenka Frim
Date: Thu, 24 Nov 2022 15:30:18 +0100
Subject: [PATCH 27/29] Fix linter errors

---
 python/pyarrow/interchange/dataframe.py        | 2 +-
 python/pyarrow/tests/interchange/test_extra.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py
index 36b669f4358..278ee102ec9 100644
--- a/python/pyarrow/interchange/dataframe.py
+++ b/python/pyarrow/interchange/dataframe.py
@@ -178,7 +178,7 @@ def get_chunks(
             # In case when the size of the chunk is such that the resulting
             # list is one less chunk then n_chunks -> append an empty chunk
             if len(batches) == n_chunks - 1:
-                batches.append(pa.record_batch([[]], schema = self._df.schema))
+                batches.append(pa.record_batch([[]], schema=self._df.schema))
         else:
             batches = self._df.to_batches()
 
diff --git a/python/pyarrow/tests/interchange/test_extra.py b/python/pyarrow/tests/interchange/test_extra.py
index 68368ebaf2d..e655c2ace2b 100644
--- a/python/pyarrow/tests/interchange/test_extra.py
+++ b/python/pyarrow/tests/interchange/test_extra.py
@@ -73,8 +73,8 @@ def test_offset_of_sliced_array():
     arr = pa.array([1, 2, 3, 4])
     arr_sliced = arr.slice(2, 2)
 
-    table = pa.table([arr], names = ["arr"])
-    table_sliced = pa.table([arr_sliced], names = ["arr_sliced"])
+    table = pa.table([arr], names=["arr"])
+    table_sliced = pa.table([arr_sliced], names=["arr_sliced"])
 
     df = from_dataframe(table)
     df_sliced = from_dataframe(table_sliced)

From 2632c558bf144c2244a9cd72415eb2a7375b6e4c Mon Sep 17 00:00:00 2001
From: Alenka Frim
Date: Thu, 24 Nov 2022 15:53:32 +0100
Subject: [PATCH 28/29] Add a skip for the test using from_dataframe(), which was added in pandas 1.5.0

---
 python/pyarrow/tests/interchange/test_extra.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/pyarrow/tests/interchange/test_extra.py b/python/pyarrow/tests/interchange/test_extra.py
index e655c2ace2b..4181b117be6 100644
--- a/python/pyarrow/tests/interchange/test_extra.py
+++ b/python/pyarrow/tests/interchange/test_extra.py
@@ -17,6 +17,7 @@
 
 from datetime import datetime as dt
 import pyarrow as pa
+from pyarrow.vendored.version import Version
 import pytest
 
 from pyarrow.interchange.column import (
@@ -70,6 +71,9 @@ def test_array_to_pyarrowcolumn(test_data, kind):
 
 
 @pytest.mark.pandas
 def test_offset_of_sliced_array():
+    if Version(pd.__version__) < Version("1.5.0"):
+        pytest.skip("__dataframe__ added
to pandas in 1.5.0") + arr = pa.array([1, 2, 3, 4]) arr_sliced = arr.slice(2, 2) From f177b15f9aa2f129175ec6b3a81bf814531bcda6 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Mon, 28 Nov 2022 08:37:29 +0100 Subject: [PATCH 29/29] Make changes to the from_dataframe.py skeleton --- python/pyarrow/interchange/from_dataframe.py | 95 ++++++++------------ 1 file changed, 37 insertions(+), 58 deletions(-) diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index 81c4dd115be..a7746371129 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -32,14 +32,21 @@ import pyarrow as pa -def from_dataframe(df, allow_copy=True) -> pa.Table: +# A typing protocol could be added later to let Mypy validate code using +# `from_dataframe` better. +DataFrameObject = Any +ColumnObject = Any +BufferObject = Any + + +def from_dataframe(df: DataFrameObject, allow_copy=True) -> pa.Table: """ Build a ``pa.Table`` from any DataFrame supporting the interchange protocol. Parameters ---------- - df : _PyArrowDataFrame + df : DataFrameObject Object supporting the interchange protocol, i.e. `__dataframe__` method. allow_copy : bool, default: True @@ -58,12 +65,12 @@ def from_dataframe(df, allow_copy=True) -> pa.Table: return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) -def _from_dataframe(df: _PyArrowDataFrame, allow_copy=True): +def _from_dataframe(df: DataFrameObject, allow_copy=True): """ Build a ``pa.Table`` from the DataFrame interchange object. Parameters ---------- - df : _PyArrowDataFrame + df : DataFrameObject Object supporting the interchange protocol, i.e. `__dataframe__` method. allow_copy : bool, default: True @@ -76,12 +83,12 @@ def _from_dataframe(df: _PyArrowDataFrame, allow_copy=True): pass -def protocol_df_chunk_to_pyarrow(df: _PyArrowDataFrame) -> pa.Table: +def protocol_df_chunk_to_pyarrow(df: DataFrameObject) -> pa.Table: """ Convert interchange protocol chunk to ``pd.DataFrame``. Parameters ---------- - df : _PyArrowDataFrame + df : DataFrameObject Returns ------- pa.Table @@ -118,49 +125,49 @@ def protocol_df_chunk_to_pyarrow(df: _PyArrowDataFrame) -> pa.Table: pass -def primitive_column_to_ndarray(col: _PyArrowColumn) -> tuple[np.ndarray, Any]: +def primitive_column_to_array(col: ColumnObject) -> tuple[pa.Array, Any]: """ - Convert a column holding one of the primitive dtypes to a NumPy array. + Convert a column holding one of the primitive dtypes to a PyArrow array. A primitive type is one of: int, uint, float, bool. Parameters ---------- - col : Column + col : ColumnObject Returns ------- tuple - Tuple of np.ndarray holding the data and the memory owner object + Tuple of pa.Array holding the data and the memory owner object that keeps the memory alive. """ pass def categorical_column_to_dictionary( - col: _PyArrowColumn -) -> tuple[pa.ChunkedArray, Any]: + col: ColumnObject +) -> tuple[pa.Array, Any]: """ Convert a column holding categorical data to a pandas Series. Parameters ---------- - col : Column + col : ColumnObject Returns ------- tuple - Tuple of pa.ChunkedArray holding the data and the memory owner object + Tuple of pa.Array holding the data and the memory owner object that keeps the memory alive. """ pass -def string_column_to_ndarray(col: _PyArrowColumn) -> tuple[np.ndarray, Any]: +def string_column_to_array(col: ColumnObject) -> tuple[pa.Array, Any]: """ Convert a column holding string data to a NumPy array. 
Parameters ---------- - col : Column + col : ColumnObject Returns ------- tuple - Tuple of np.ndarray holding the data and the memory owner object + Tuple of pa.Array holding the data and the memory owner object that keeps the memory alive. """ pass @@ -171,33 +178,33 @@ def parse_datetime_format_str(format_str, data): pass -def datetime_column_to_ndarray(col: _PyArrowColumn) -> tuple[np.ndarray, Any]: +def datetime_column_to_array(col: ColumnObject) -> tuple[pa.Array, Any]: """ Convert a column holding DateTime data to a NumPy array. Parameters ---------- - col : Column + col : ColumnObject Returns ------- tuple - Tuple of np.ndarray holding the data and the memory owner object + Tuple of pa.Array holding the data and the memory owner object that keeps the memory alive. """ pass -def buffer_to_ndarray( - buffer: _PyArrowBuffer, +def buffer_to_array( + buffer: BufferObject, dtype: tuple[DtypeKind, int, str, str], offset: int = 0, length: int | None = None, -) -> np.ndarray: +) -> pa.Array: """ Build a NumPy array from the passed buffer. Parameters ---------- - buffer : Buffer - Buffer to build a NumPy array from. + buffer : BufferObject + Buffer to build a PyArrow array from. dtype : tuple Data type of the buffer conforming protocol dtypes format. offset : int, default: 0 @@ -207,7 +214,8 @@ def buffer_to_ndarray( from the buffer. Has no effect otherwise. Returns ------- - np.ndarray + pa.Array + Notes ----- The returned array doesn't own the memory. The caller of this function @@ -217,9 +225,9 @@ def buffer_to_ndarray( pass -def bitmask_to_bool_ndarray( +def bitmask_to_bool_array( bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0 -) -> np.ndarray: +) -> pa.Array: """ Convert bit-mask to a boolean NumPy array. Parameters @@ -232,35 +240,6 @@ def bitmask_to_bool_ndarray( Number of elements to offset from the start of the first byte. Returns ------- - np.ndarray[bool] - """ - pass - - -def set_nulls( - data: np.ndarray | pa.Array | pa.ChunkedArray, - col: _PyArrowColumn, - validity: tuple[_PyArrowBuffer, tuple[DtypeKind, int, str, str]] | None, - allow_modify_inplace: bool = True, -): - """ - Set null values for the data according to the column null kind. - Parameters - ---------- - data : np.ndarray, pa.Array or pa.ChunkedArray, - Data to set nulls in. - col : _PyArrowColumn - Column object that describes the `data`. - validity : tuple(_PyArrowBuffer, dtype) or None - The return value of ``col.buffers()``. We do not access the - ``col.buffers()`` here to not take the ownership of the memory - of buffer objects. - allow_modify_inplace : bool, default: True - Whether to modify the `data` inplace when zero-copy is possible - (True) or always modify a copy of the `data` (False). - Returns - ------- - np.ndarray, pa.Array or pa.ChunkedArray, - Data with the nulls being set. + pa.Array[bool] """ pass
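The conversion helpers in from_dataframe.py are left as stubs by the last patch. As a rough sketch of the direction buffer_to_array could take for primitive, fixed-width columns (assuming DtypeKind can be imported from pyarrow.interchange.dataframe_protocol, that the caller supplies the element count, and that the caller keeps the protocol buffer object alive), one possibility is:

# Sketch only, not part of the patches above; validity bitmaps and
# non-primitive kinds are deliberately left out.
import pyarrow as pa
from pyarrow.interchange.dataframe_protocol import DtypeKind  # assumed import path

_PROTOCOL_TO_ARROW = {
    (DtypeKind.INT, 32): pa.int32(),
    (DtypeKind.INT, 64): pa.int64(),
    (DtypeKind.UINT, 8): pa.uint8(),
    (DtypeKind.FLOAT, 32): pa.float32(),
    (DtypeKind.FLOAT, 64): pa.float64(),
}


def buffer_to_array_sketch(buffer, dtype, length, offset=0):
    """Wrap an interchange buffer as a pa.Array without copying the data."""
    kind, bit_width, _, _ = dtype
    arrow_type = _PROTOCOL_TO_ARROW[(kind, bit_width)]
    # The protocol buffer object owns the memory; passing it as ``base``
    # keeps it alive for as long as the resulting Arrow buffer is used.
    data = pa.foreign_buffer(buffer.ptr, buffer.bufsize, base=buffer)
    # No validity buffer is passed, so all values are treated as non-null.
    arr = pa.Array.from_buffers(arrow_type, length, [None, data], offset=offset)
    return arr, buffer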
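Once the stubs are filled in, the round trip that the tests build towards could be exercised end to end roughly as follows (illustrative only; the module path is the one introduced in this series):

import pyarrow as pa
from pyarrow.interchange.from_dataframe import from_dataframe

table = pa.table({"ints": [0, 1, 2], "floats": [0.5, 1.5, 2.5]})
# from_dataframe() calls table.__dataframe__() internally and rebuilds
# a pa.Table from the interchange-protocol columns and buffers.
result = from_dataframe(table)
assert result.equals(table)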