diff --git a/python/pyarrow/interchange/__init__.py b/python/pyarrow/interchange/__init__.py new file mode 100644 index 00000000000..7ebe59b499c --- /dev/null +++ b/python/pyarrow/interchange/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# flake8: noqa + +from .from_dataframe import from_dataframe diff --git a/python/pyarrow/interchange/buffer.py b/python/pyarrow/interchange/buffer.py new file mode 100644 index 00000000000..1f537798130 --- /dev/null +++ b/python/pyarrow/interchange/buffer.py @@ -0,0 +1,107 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations +import enum + +import pyarrow as pa + + +class DlpackDeviceType(enum.IntEnum): + """Integer enum for device type codes matching DLPack.""" + + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + +class _PyArrowBuffer: + """ + Data in the buffer is guaranteed to be contiguous in memory. + + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. + + This distinction is useful to support both data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. + """ + + def __init__(self, x: pa.Buffer, allow_copy: bool = True) -> None: + """ + Handle PyArrow Buffers. + """ + self._x = x + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + return self._x.size + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + return self._x.address + + def __dlpack__(self): + """ + Produce DLPack capsule (see array API standard). + + Raises: + - TypeError : if the buffer contains unsupported dtypes. 
+ - NotImplementedError : if DLPack support is not implemented + + Useful to have to connect to array libraries. Support optional because + it's not completely trivial to implement for a Python-only library. + """ + raise NotImplementedError("__dlpack__") + + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides. + Uses device type codes matching DLPack. + Note: must be implemented even if ``__dlpack__`` is not. + """ + if self._x.is_cpu: + return (DlpackDeviceType.CPU, None) + else: + raise NotImplementedError("__dlpack_device__") + + def __repr__(self) -> str: + return ( + "PyArrowBuffer(" + + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "device": self.__dlpack_device__()[0].name, + } + ) + + ")" + ) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py new file mode 100644 index 00000000000..a9b89586165 --- /dev/null +++ b/python/pyarrow/interchange/column.py @@ -0,0 +1,527 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import enum +from typing import ( + Any, + Dict, + Iterable, + Optional, + Tuple, +) + +import sys +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + +import pyarrow as pa +import pyarrow.compute as pc +from pyarrow.interchange.buffer import _PyArrowBuffer + + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype + + +_PYARROW_KINDS = { + pa.int8(): (DtypeKind.INT, "c"), + pa.int16(): (DtypeKind.INT, "s"), + pa.int32(): (DtypeKind.INT, "i"), + pa.int64(): (DtypeKind.INT, "l"), + pa.uint8(): (DtypeKind.UINT, "C"), + pa.uint16(): (DtypeKind.UINT, "S"), + pa.uint32(): (DtypeKind.UINT, "I"), + pa.uint64(): (DtypeKind.UINT, "L"), + pa.float16(): (DtypeKind.FLOAT, "e"), + pa.float32(): (DtypeKind.FLOAT, "f"), + pa.float64(): (DtypeKind.FLOAT, "g"), + pa.bool_(): (DtypeKind.BOOL, "b"), + pa.string(): (DtypeKind.STRING, "u"), + pa.large_string(): (DtypeKind.STRING, "U"), +} + + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. 
+ + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: Tuple[_PyArrowBuffer, Dtype] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + validity: Optional[Tuple[_PyArrowBuffer, Dtype]] + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: Optional[Tuple[_PyArrowBuffer, Dtype]] + + +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects + # exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. + categories: Optional[_PyArrowColumn] + + +class Endianness: + """Enum indicating the byte-order of a data-type.""" + + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + + +class NoBufferPresent(Exception): + """Exception to signal that there is no requested buffer.""" + + +class _PyArrowColumn: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. 
+ Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + + def __init__( + self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True + ) -> None: + """ + Handles PyArrow Arrays and ChunkedArrays. + """ + # Store the column as a private attribute + if isinstance(column, pa.ChunkedArray): + if column.num_chunks == 1: + column = column.chunk(0) + else: + if not allow_copy: + raise RuntimeError( + "Chunks will be combined and a copy is required which " + "is forbidden by allow_copy=False" + ) + column = column.combine_chunks() + + self._allow_copy = allow_copy + + if pa.types.is_boolean(column.type): + if not allow_copy: + raise RuntimeError( + "Boolean column will be casted to uint8 and a copy " + "is required which is forbidden by allow_copy=False" + ) + self._dtype = self._dtype_from_arrowdtype(column.type, 8) + self._col = pc.cast(column, pa.uint8()) + else: + self._col = column + dtype = self._col.type + try: + bit_width = dtype.bit_width + except ValueError: + # in case of a variable-length strings, considered as array + # of bytes (8 bits) + bit_width = 8 + self._dtype = self._dtype_from_arrowdtype(dtype, bit_width) + + def size(self) -> int: + """ + Size of the column, in elements. + + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. + """ + return len(self._col) + + @property + def offset(self) -> int: + """ + Offset of first element. + + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + return self._col.offset + + @property + def dtype(self) -> Tuple[DtypeKind, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for + bit masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the + future we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, + decimal, and nested (list, struct, map, union) dtypes. 
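As a concrete sketch of the tuple this property returns (values follow from the ``_PYARROW_KINDS`` mapping above; shown as plain assertions rather than captured output):

import pyarrow as pa
from pyarrow.interchange.column import _PyArrowColumn, DtypeKind

# int64 column: kind INT, 64 bits, Arrow C format string "l", native byte order
col = _PyArrowColumn(pa.array([1, 2, 3], type=pa.int64()))
assert col.dtype == (DtypeKind.INT, 64, "l", "=")

# string column: variable-length, so reported as 8-bit elements, format string "u"
col = _PyArrowColumn(pa.array(["a", "bc"]))
assert col.dtype == (DtypeKind.STRING, 8, "u", "=")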
+ """ + return self._dtype + + def _dtype_from_arrowdtype( + self, dtype: pa.DataType, bit_width: int + ) -> Tuple[DtypeKind, int, str, str]: + """ + See `self.dtype` for details. + """ + # Note: 'c' (complex) not handled yet (not in array spec v1). + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) + # not handled datetime and timedelta both map to datetime + # (is timedelta handled?) + + if pa.types.is_timestamp(dtype): + kind = DtypeKind.DATETIME + ts = dtype.unit[0] + tz = dtype.tz if dtype.tz else "" + f_string = "ts{ts}:{tz}".format(ts=ts, tz=tz) + return kind, bit_width, f_string, Endianness.NATIVE + elif pa.types.is_dictionary(dtype): + kind = DtypeKind.CATEGORICAL + f_string = "L" + return kind, bit_width, f_string, Endianness.NATIVE + else: + kind, f_string = _PYARROW_KINDS.get(dtype, (None, None)) + if kind is None: + raise ValueError( + f"Data type {dtype} not supported by interchange protocol") + + return kind, bit_width, f_string, Endianness.NATIVE + + @property + def describe_categorical(self) -> CategoricalDescription: + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate non-categorical Column encoding categorical + values. + + Raises TypeError if the dtype is not categorical + + Returns the dictionary with description on how to interpret the + data buffer: + - "is_ordered" : bool, whether the ordering of dictionary indices + is semantically meaningful. + - "is_dictionary" : bool, whether a mapping of + categorical values to other objects exists + - "categories" : Column representing the (implicit) mapping of + indices to category values (e.g. an array of + cat1, cat2, ...). None if not a dictionary-style + categorical. + + TBD: are there any other in-memory representations that are needed? + """ + arr = self._col + if not pa.types.is_dictionary(arr.type): + raise TypeError( + "describe_categorical only works on a column with " + "categorical dtype!" + ) + + return { + "is_ordered": self._col.type.ordered, + "is_dictionary": True, + "categories": _PyArrowColumn(arr.dictionary), + } + + @property + def describe_null(self) -> Tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. + None otherwise. + """ + # In case of no missing values, we need to set ColumnNullType to + # non nullable as in the current __dataframe__ protocol bit/byte masks + # can not be None + if self.null_count == 0: + return ColumnNullType.NON_NULLABLE, None + else: + return ColumnNullType.USE_BITMASK, 0 + + @property + def null_count(self) -> int: + """ + Number of null elements, if known. + + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. + """ + arrow_null_count = self._col.null_count + n = arrow_null_count if arrow_null_count != -1 else None + return n + + @property + def metadata(self) -> Dict[str, Any]: + """ + The metadata for the column. See `DataFrame.metadata` for more details. + """ + pass + + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + return 1 + + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable[_PyArrowColumn]: + """ + Return an iterator yielding the chunks. + + See `DataFrame.get_chunks` for details on ``n_chunks``. 
+ """ + if n_chunks and n_chunks > 1: + chunk_size = self.size() // n_chunks + if self.size() % n_chunks != 0: + chunk_size += 1 + + array = self._col + i = 0 + for start in range(0, chunk_size * n_chunks, chunk_size): + yield _PyArrowColumn( + array.slice(start, chunk_size), self._allow_copy + ) + i += 1 + else: + yield self + + def get_buffers(self) -> ColumnBuffers: + """ + Return a dictionary containing the underlying buffers. + + The returned dictionary has the following contents: + + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + buffers: ColumnBuffers = { + "data": self._get_data_buffer(), + "validity": None, + "offsets": None, + } + + try: + buffers["validity"] = self._get_validity_buffer() + except NoBufferPresent: + pass + + try: + buffers["offsets"] = self._get_offsets_buffer() + except NoBufferPresent: + pass + + return buffers + + def _get_data_buffer( + self, + ) -> Tuple[_PyArrowBuffer, Any]: # Any is for self.dtype tuple + """ + Return the buffer containing the data and the buffer's + associated dtype. + """ + array = self._col + dtype = self.dtype + + # In case of dictionary arrays, use indices + # to define a buffer, codes are transferred through + # describe_categorical() + if pa.types.is_dictionary(array.type): + array = array.indices + dtype = _PyArrowColumn(array).dtype + + n = len(array.buffers()) + if n == 2: + return _PyArrowBuffer(array.buffers()[1]), dtype + elif n == 3: + return _PyArrowBuffer(array.buffers()[2]), dtype + + def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]: + """ + Return the buffer containing the mask values indicating missing data + and the buffer's associated dtype. + Raises NoBufferPresent if null representation is not a bit or byte + mask. + """ + # Define the dtype of the returned buffer + dtype = (DtypeKind.BOOL, 1, "b", Endianness.NATIVE) + array = self._col + buff = array.buffers()[0] + if buff: + return _PyArrowBuffer(buff), dtype + else: + raise NoBufferPresent( + "There are no missing values so " + "does not have a separate mask") + + def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]: + """ + Return the buffer containing the offset values for variable-size binary + data (e.g., variable-length strings) and the buffer's associated dtype. + Raises NoBufferPresent if the data buffer does not have an associated + offsets buffer. 
+ """ + array = self._col + n = len(array.buffers()) + if n == 2: + raise NoBufferPresent( + "This column has a fixed-length dtype so " + "it does not have an offsets buffer" + ) + elif n == 3: + # Define the dtype of the returned buffer + dtype = self._col.type + if pa.types.is_large_string(dtype): + dtype = (DtypeKind.INT, 64, "l", Endianness.NATIVE) + else: + dtype = (DtypeKind.INT, 32, "i", Endianness.NATIVE) + return _PyArrowBuffer(array.buffers()[1]), dtype diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py new file mode 100644 index 00000000000..d0717e02e88 --- /dev/null +++ b/python/pyarrow/interchange/dataframe.py @@ -0,0 +1,202 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations +from typing import ( + Any, + Iterable, + Optional, + Sequence, +) + +import pyarrow as pa + +from pyarrow.interchange.column import _PyArrowColumn + + +class _PyArrowDataFrame: + """ + A data frame class, with only the methods required by the interchange + protocol defined. + + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. + Columns may be accessed by name or by position. + + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. + """ + + def __init__( + self, df: pa.Table, nan_as_null: bool = False, allow_copy: bool = True + ) -> None: + """ + Constructor - an instance of this (private) class is returned from + `pa.Table.__dataframe__`. + """ + self._df = df + # ``nan_as_null`` is a keyword intended for the consumer to tell the + # producer to overwrite null values in the data with ``NaN`` (or + # ``NaT``). + if nan_as_null is True: + raise RuntimeError( + "nan_as_null=True currently has no effect, " + "use the default nan_as_null=False" + ) + self._nan_as_null = nan_as_null + self._allow_copy = allow_copy + + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: + """ + Construct a new exchange object, potentially changing the parameters. + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN``. + It is intended for cases where the consumer does not support the bit + mask or byte mask that is the producer's native representation. + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this + protocol specifies contiguous buffers. 
+ """ + return _PyArrowDataFrame(self._df, nan_as_null, allow_copy) + + @property + def metadata(self) -> dict[str, Any]: + """ + The metadata for the data frame, as a dictionary with string keys. The + contents of `metadata` may be anything, they are meant for a library + to store information that it needs to, e.g., roundtrip losslessly or + for two implementations to share data that is not (yet) part of the + interchange protocol specification. For avoiding collisions with other + entries, please add name the keys with the name of the library + followed by a period and the desired name, e.g, ``pandas.indexcol``. + """ + # The metadata for the data frame, as a dictionary with string keys. + # Add schema metadata here (pandas metadata or custom metadata) + if self._df.schema.metadata: + schema_metadata = {"pyarrow." + k.decode('utf8'): v.decode('utf8') + for k, v in self._df.schema.metadata.items()} + return schema_metadata + else: + return {} + + def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. + """ + return self._df.num_columns + + def num_rows(self) -> int: + """ + Return the number of rows in the DataFrame, if available. + """ + return self._df.num_rows + + def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of. + """ + # pyarrow.Table can have columns with different number + # of chunks so we take the number of chunks that + # .to_batches() returns as it takes the min chunk size + # of all the columns (to_batches is a zero copy method) + batches = self._df.to_batches() + return len(batches) + + def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + """ + return self._df.column_names + + def get_column(self, i: int) -> _PyArrowColumn: + """ + Return the column at the indicated position. + """ + return _PyArrowColumn(self._df.column(i), + allow_copy=self._allow_copy) + + def get_column_by_name(self, name: str) -> _PyArrowColumn: + """ + Return the column whose name is the indicated name. + """ + return _PyArrowColumn(self._df.column(name), + allow_copy=self._allow_copy) + + def get_columns(self) -> Iterable[_PyArrowColumn]: + """ + Return an iterator yielding the columns. + """ + return [ + _PyArrowColumn(col, allow_copy=self._allow_copy) + for col in self._df.columns + ] + + def select_columns(self, indices: Sequence[int]) -> _PyArrowDataFrame: + """ + Create a new DataFrame by selecting a subset of columns by index. + """ + return _PyArrowDataFrame( + self._df.select(list(indices)), self._nan_as_null, self._allow_copy + ) + + def select_columns_by_name( + self, names: Sequence[str] + ) -> _PyArrowDataFrame: + """ + Create a new DataFrame by selecting a subset of columns by name. + """ + return _PyArrowDataFrame( + self._df.select(list(names)), self._nan_as_null, self._allow_copy + ) + + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable[_PyArrowDataFrame]: + """ + Return an iterator yielding the chunks. + + By default (None), yields the chunks that the data is stored as by the + producer. If given, ``n_chunks`` must be a multiple of + ``self.num_chunks()``, meaning the producer must subdivide each chunk + before yielding it. + + Note that the producer must ensure that all columns are chunked the + same way. 
+ """ + if n_chunks and n_chunks > 1: + chunk_size = self.num_rows() // n_chunks + if self.num_rows() % n_chunks != 0: + chunk_size += 1 + batches = self._df.to_batches(max_chunksize=chunk_size) + # In case when the size of the chunk is such that the resulting + # list is one less chunk then n_chunks -> append an empty chunk + if len(batches) == n_chunks - 1: + batches.append(pa.record_batch([[]], schema=self._df.schema)) + else: + batches = self._df.to_batches() + + iterator_tables = [_PyArrowDataFrame( + pa.Table.from_batches([batch]), self._nan_as_null, self._allow_copy + ) + for batch in batches + ] + return iterator_tables diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py new file mode 100644 index 00000000000..204530a3354 --- /dev/null +++ b/python/pyarrow/interchange/from_dataframe.py @@ -0,0 +1,567 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +from typing import ( + Any, +) + +from pyarrow.interchange.column import ( + DtypeKind, + ColumnBuffers, + ColumnNullType, +) + +import pyarrow as pa +import re + +import pyarrow.compute as pc +from pyarrow.interchange.column import Dtype + + +# A typing protocol could be added later to let Mypy validate code using +# `from_dataframe` better. +DataFrameObject = Any +ColumnObject = Any +BufferObject = Any + + +_PYARROW_DTYPES: dict[DtypeKind, dict[int, Any]] = { + DtypeKind.INT: {8: pa.int8(), + 16: pa.int16(), + 32: pa.int32(), + 64: pa.int64()}, + DtypeKind.UINT: {8: pa.uint8(), + 16: pa.uint16(), + 32: pa.uint32(), + 64: pa.uint64()}, + DtypeKind.FLOAT: {16: pa.float16(), + 32: pa.float32(), + 64: pa.float64()}, + DtypeKind.BOOL: {8: pa.uint8()}, + DtypeKind.STRING: {8: pa.string()}, +} + + +def from_dataframe(df: DataFrameObject, allow_copy=True) -> pa.Table: + """ + Build a ``pa.Table`` from any DataFrame supporting the interchange + protocol. + + Parameters + ---------- + df : DataFrameObject + Object supporting the interchange protocol, i.e. `__dataframe__` + method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Table + """ + if isinstance(df, pa.Table): + return df + + if not hasattr(df, "__dataframe__"): + raise ValueError("`df` does not support __dataframe__") + + return _from_dataframe(df.__dataframe__(allow_copy=allow_copy), + allow_copy=allow_copy) + + +def _from_dataframe(df: DataFrameObject, allow_copy=True): + """ + Build a ``pa.Table`` from the DataFrame interchange object. + + Parameters + ---------- + df : DataFrameObject + Object supporting the interchange protocol, i.e. `__dataframe__` + method. 
+ allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Table + """ + batches = [] + for chunk in df.get_chunks(): + batch = protocol_df_chunk_to_pyarrow(chunk, allow_copy) + batches.append(batch) + + table = pa.Table.from_batches(batches) + return table + + +def protocol_df_chunk_to_pyarrow( + df: DataFrameObject, + allow_copy: bool = True +) -> pa.RecordBatch: + """ + Convert interchange protocol chunk to ``pa.RecordBatch``. + + Parameters + ---------- + df : DataFrameObject + Object supporting the interchange protocol, i.e. `__dataframe__` + method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.RecordBatch + """ + # We need a dict of columns here, with each column being a pa.Array + columns: dict[str, pa.Array] = {} + for name in df.column_names(): + if not isinstance(name, str): + raise ValueError(f"Column {name} is not a string") + if name in columns: + raise ValueError(f"Column {name} is not unique") + col = df.get_column_by_name(name) + dtype = col.dtype[0] + if dtype in ( + DtypeKind.INT, + DtypeKind.UINT, + DtypeKind.FLOAT, + DtypeKind.STRING, + DtypeKind.DATETIME, + ): + columns[name] = column_to_array(col, allow_copy) + elif dtype == DtypeKind.BOOL: + columns[name] = bool_column_to_array(col, allow_copy) + elif dtype == DtypeKind.CATEGORICAL: + columns[name] = categorical_column_to_dictionary(col, allow_copy) + else: + raise NotImplementedError(f"Data type {dtype} not handled yet") + + return pa.RecordBatch.from_pydict(columns) + + +def column_to_array( + col: ColumnObject, + allow_copy: bool = True, +) -> pa.Array: + """ + Convert a column holding one of the primitive dtypes to a PyArrow array. + A primitive type is one of: int, uint, float, bool (1 bit). + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Array + """ + buffers = col.get_buffers() + data = buffers_to_array(buffers, col.size(), + col.describe_null, + col.offset, + allow_copy) + return data + + +def bool_column_to_array( + col: ColumnObject, + allow_copy: bool = True, +) -> pa.Array: + """ + Convert a column holding boolean dtype to a PyArrow array. + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Array + """ + if not allow_copy: + raise RuntimeError( + "Boolean column will be casted from uint8 and a copy " + "is required which is forbidden by allow_copy=False" + ) + + buffers = col.get_buffers() + data = buffers_to_array(buffers, col.size(), + col.describe_null, + col.offset) + data = pc.cast(data, pa.bool_()) + + return data + + +def categorical_column_to_dictionary( + col: ColumnObject, + allow_copy: bool = True, +) -> pa.DictionaryArray: + """ + Convert a column holding categorical data to a pa.DictionaryArray. + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). 
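A sketch of what this conversion produces for a dictionary-encoded pyarrow column, driven through the interchange wrapper in the same way as the tests in this patch:

import pyarrow as pa
from pyarrow.interchange.from_dataframe import _from_dataframe

table = pa.table({"weekday": pa.array(["Mon", "Tue", "Mon"]).dictionary_encode()})
result = _from_dataframe(table.__dataframe__())
assert pa.types.is_dictionary(result[0].type)
assert result[0].to_pylist() == ["Mon", "Tue", "Mon"]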
+ + Returns + ------- + pa.DictionaryArray + """ + if not allow_copy: + raise RuntimeError( + "Categorical column will be casted from uint8 and a copy " + "is required which is forbidden by allow_copy=False" + ) + + categorical = col.describe_categorical + + if not categorical["is_dictionary"]: + raise NotImplementedError( + "Non-dictionary categoricals not supported yet") + + cat_column = categorical["categories"] + dictionary = column_to_array(cat_column) + + buffers = col.get_buffers() + indices = buffers_to_array(buffers, col.size(), + col.describe_null, + col.offset) + + # Constructing a pa.DictionaryArray + dict_array = pa.DictionaryArray.from_arrays(indices, dictionary) + + return dict_array + + +def parse_datetime_format_str(format_str): + """Parse datetime `format_str` to interpret the `data`.""" + + # timestamp 'ts{unit}:tz' + timestamp_meta = re.match(r"ts([smun]):(.*)", format_str) + if timestamp_meta: + unit, tz = timestamp_meta.group(1), timestamp_meta.group(2) + if unit != "s": + # the format string describes only a first letter of the unit, so + # add one extra letter to convert the unit to numpy-style: + # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns' + unit += "s" + + return unit, tz + + raise NotImplementedError(f"DateTime kind is not supported: {format_str}") + + +def map_date_type(data_type): + """Map column date type to pyarrow date type. """ + kind, bit_width, f_string, _ = data_type + + if kind == DtypeKind.DATETIME: + unit, tz = parse_datetime_format_str(f_string) + return pa.timestamp(unit, tz=tz) + else: + pa_dtype = _PYARROW_DTYPES.get(kind, {}).get(bit_width, None) + + # Error if dtype is not supported + if pa_dtype: + return pa_dtype + else: + raise NotImplementedError( + f"Conversion for {data_type} is not yet supported.") + + +def buffers_to_array( + buffers: ColumnBuffers, + length: int, + describe_null: ColumnNullType, + offset: int = 0, + allow_copy: bool = True, +) -> pa.Array: + """ + Build a PyArrow array from the passed buffer. + + Parameters + ---------- + buffer : ColumnBuffers + Dictionary containing tuples of underlying buffers and + their associated dtype. + length : int + The number of values in the array. + describe_null: ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Array + + Notes + ----- + The returned array doesn't own the memory. The caller of this function + is responsible for keeping the memory owner object alive as long as + the returned PyArrow array is being used. 
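The reconstruction is essentially ``pa.Array.from_buffers`` over buffers re-wrapped with ``pa.foreign_buffer``. The core mechanism can be sketched on plain pyarrow objects (no ``foreign_buffer`` wrapping needed here because the buffers already belong to pyarrow):

import pyarrow as pa

src = pa.array([1, None, 3], type=pa.int64())
validity, data = src.buffers()
rebuilt = pa.Array.from_buffers(pa.int64(), len(src), [validity, data])
assert rebuilt.equals(src)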
+ """ + data_buff, data_type = buffers["data"] + try: + validity_buff, validity_dtype = buffers["validity"] + except TypeError: + validity_buff = None + try: + offset_buff, offset_dtype = buffers["offsets"] + except TypeError: + offset_buff = None + + # Construct a pyarrow Buffer + data_pa_buffer = pa.foreign_buffer(data_buff.ptr, data_buff.bufsize, + base=data_buff) + + # Construct a validity pyarrow Buffer, if applicable + if validity_buff: + validity_pa_buff = validity_buffer_from_mask(validity_buff, + validity_dtype, + describe_null, + length, + offset, + allow_copy) + else: + validity_pa_buff = validity_buffer_nan_sentinel(data_pa_buffer, + data_type, + describe_null, + length, + offset, + allow_copy) + + # Construct a pyarrow Array from buffers + data_dtype = map_date_type(data_type) + + if offset_buff: + _, offset_bit_width, _, _ = offset_dtype + # If an offset buffer exists, construct an offset pyarrow Buffer + # and add it to the construction of an array + offset_pa_buffer = pa.foreign_buffer(offset_buff.ptr, + offset_buff.bufsize, + base=offset_buff) + + if data_type[2] == 'U': + string_type = pa.large_string() + else: + if offset_bit_width == 64: + string_type = pa.large_string() + else: + string_type = pa.string() + array = pa.Array.from_buffers( + string_type, + length, + [validity_pa_buff, offset_pa_buffer, data_pa_buffer], + offset=offset, + ) + else: + array = pa.Array.from_buffers( + data_dtype, + length, + [validity_pa_buff, data_pa_buffer], + offset=offset, + ) + + return array + + +def validity_buffer_from_mask( + validity_buff: BufferObject, + validity_dtype: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> pa.Buffer: + """ + Build a PyArrow buffer from the passed mask buffer. + + Parameters + ---------- + validity_buff : BufferObject + Tuple of underlying validity buffer and associated dtype. + validity_dtype : Dtype + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + describe_null : ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + length : int + The number of values in the array. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). 
+ + Returns + ------- + pa.Buffer + """ + null_kind, sentinel_val = describe_null + validity_kind, _, _, _ = validity_dtype + assert validity_kind == DtypeKind.BOOL + + if null_kind == ColumnNullType.NON_NULLABLE: + # Sliced array can have a NON_NULLABLE ColumnNullType due + # to no missing values in that slice of an array though the bitmask + # exists and validity_buff must be set to None in this case + return None + + elif null_kind == ColumnNullType.USE_BYTEMASK or ( + null_kind == ColumnNullType.USE_BITMASK and sentinel_val == 1 + ): + buff = pa.foreign_buffer(validity_buff.ptr, + validity_buff.bufsize, + base=validity_buff) + + if null_kind == ColumnNullType.USE_BYTEMASK: + if not allow_copy: + raise RuntimeError( + "To create a bitmask a copy of the data is " + "required which is forbidden by allow_copy=False" + ) + mask = pa.Array.from_buffers(pa.int8(), length, + [None, buff], + offset=offset) + mask_bool = pc.cast(mask, pa.bool_()) + else: + mask_bool = pa.Array.from_buffers(pa.bool_(), length, + [None, buff], + offset=offset) + + if sentinel_val == 1: + mask_bool = pc.invert(mask_bool) + + return mask_bool.buffers()[1] + + elif null_kind == ColumnNullType.USE_BITMASK and sentinel_val == 0: + return pa.foreign_buffer(validity_buff.ptr, + validity_buff.bufsize, + base=validity_buff) + else: + raise NotImplementedError( + f"{describe_null} null representation is not yet supported.") + + +def validity_buffer_nan_sentinel( + data_pa_buffer: BufferObject, + data_type: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> pa.Buffer: + """ + Build a PyArrow buffer from NaN or sentinel values. + + Parameters + ---------- + data_pa_buffer : pa.Buffer + PyArrow buffer for the column data. + data_type : Dtype + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + describe_null : ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + length : int + The number of values in the array. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). 
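The USE_NAN branch below derives the validity bitmap by inverting ``pc.is_nan`` over the reconstructed data; in isolation, on a plain pyarrow float array, that step looks like this:

import pyarrow as pa
import pyarrow.compute as pc

data = pa.array([0.0, float("nan"), 2.0])
valid = pc.invert(pc.is_nan(data))   # True where the value is not NaN
assert valid.to_pylist() == [True, False, True]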
+ + Returns + ------- + pa.Buffer + """ + kind, bit_width, _, _ = data_type + data_dtype = map_date_type(data_type) + null_kind, sentinel_val = describe_null + + # Check for float NaN values + if null_kind == ColumnNullType.USE_NAN: + if not allow_copy: + raise RuntimeError( + "To create a bitmask a copy of the data is " + "required which is forbidden by allow_copy=False" + ) + + if kind == DtypeKind.FLOAT and bit_width == 16: + # 'pyarrow.compute.is_nan' kernel not yet implemented + # for float16 + raise NotImplementedError( + f"{data_type} with {null_kind} is not yet supported.") + else: + pyarrow_data = pa.Array.from_buffers( + data_dtype, + length, + [None, data_pa_buffer], + offset=offset, + ) + mask = pc.is_nan(pyarrow_data) + mask = pc.invert(mask) + return mask.buffers()[1] + + # Check for sentinel values + elif null_kind == ColumnNullType.USE_SENTINEL: + if not allow_copy: + raise RuntimeError( + "To create a bitmask a copy of the data is " + "required which is forbidden by allow_copy=False" + ) + + if kind == DtypeKind.DATETIME: + sentinel_dtype = pa.int64() + else: + sentinel_dtype = data_dtype + pyarrow_data = pa.Array.from_buffers(sentinel_dtype, + length, + [None, data_pa_buffer], + offset=offset) + sentinel_arr = pc.equal(pyarrow_data, sentinel_val) + mask_bool = pc.invert(sentinel_arr) + return mask_bool.buffers()[1] + + elif null_kind == ColumnNullType.NON_NULLABLE: + pass + else: + raise NotImplementedError( + f"{describe_null} null representation is not yet supported.") diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index c97ad027dd7..35492d4f646 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -2861,6 +2861,36 @@ cdef class Table(_PandasConvertible): return self.column(key) + # ---------------------------------------------------------------------- + def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): + """ + Return the dataframe interchange object implementing the interchange protocol. + Parameters + ---------- + nan_as_null : bool, default False + Whether to tell the DataFrame to overwrite null values in the data + with ``NaN`` (or ``NaT``). + allow_copy : bool, default True + Whether to allow memory copying when exporting. If set to False + it would cause non-zero-copy exports to fail. + Returns + ------- + DataFrame interchange object + The object which consuming library can use to ingress the dataframe. + Notes + ----- + Details on the interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + """ + + from pyarrow.interchange.dataframe import _PyArrowDataFrame + + return _PyArrowDataFrame(self, nan_as_null, allow_copy) + + # ---------------------------------------------------------------------- + def slice(self, offset=0, length=None): """ Compute zero-copy slice of this Table. diff --git a/python/pyarrow/tests/interchange/__init__.py b/python/pyarrow/tests/interchange/__init__.py new file mode 100644 index 00000000000..13a83393a91 --- /dev/null +++ b/python/pyarrow/tests/interchange/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/pyarrow/tests/interchange/test_conversion.py b/python/pyarrow/tests/interchange/test_conversion.py new file mode 100644 index 00000000000..0680d9c4ec1 --- /dev/null +++ b/python/pyarrow/tests/interchange/test_conversion.py @@ -0,0 +1,524 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from datetime import datetime as dt +import numpy as np +import pyarrow as pa +from pyarrow.vendored.version import Version +import pytest + +import pyarrow.interchange as pi +from pyarrow.interchange.column import ( + _PyArrowColumn, + ColumnNullType, + DtypeKind, +) +from pyarrow.interchange.from_dataframe import _from_dataframe + +try: + import pandas as pd + # import pandas.testing as tm +except ImportError: + pass + + +@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns']) +@pytest.mark.parametrize("tz", ['', 'America/New_York', '+07:30', '-04:30']) +def test_datetime(unit, tz): + dt_arr = [dt(2007, 7, 13), dt(2007, 7, 14), None] + table = pa.table({"A": pa.array(dt_arr, type=pa.timestamp(unit, tz=tz))}) + col = table.__dataframe__().get_column_by_name("A") + + assert col.size() == 3 + assert col.offset == 0 + assert col.null_count == 1 + assert col.dtype[0] == DtypeKind.DATETIME + assert col.describe_null == (ColumnNullType.USE_BITMASK, 0) + + +@pytest.mark.parametrize( + ["test_data", "kind"], + [ + (["foo", "bar"], 21), + ([1.5, 2.5, 3.5], 2), + ([1, 2, 3, 4], 0), + ], +) +def test_array_to_pyarrowcolumn(test_data, kind): + arr = pa.array(test_data) + arr_column = _PyArrowColumn(arr) + + assert arr_column._col == arr + assert arr_column.size() == len(test_data) + assert arr_column.dtype[0] == kind + assert arr_column.num_chunks() == 1 + assert arr_column.null_count == 0 + assert arr_column.get_buffers()["validity"] is None + assert len(list(arr_column.get_chunks())) == 1 + + for chunk in arr_column.get_chunks(): + assert chunk == arr_column + + +def test_offset_of_sliced_array(): + arr = pa.array([1, 2, 3, 4]) + arr_sliced = arr.slice(2, 2) + + table = pa.table([arr], names=["arr"]) + table_sliced = pa.table([arr_sliced], names=["arr_sliced"]) + + col = table_sliced.__dataframe__().get_column(0) + assert col.offset == 2 + + result = 
_from_dataframe(table_sliced.__dataframe__()) + assert table_sliced.equals(result) + assert not table.equals(result) + + # pandas hardcodes offset to 0: + # https://github.com/pandas-dev/pandas/blob/5c66e65d7b9fef47ccb585ce2fd0b3ea18dc82ea/pandas/core/interchange/from_dataframe.py#L247 + # so conversion to pandas can't be tested currently + + # df = pandas_from_dataframe(table) + # df_sliced = pandas_from_dataframe(table_sliced) + + # tm.assert_series_equal(df["arr"][2:4], df_sliced["arr_sliced"], + # check_index=False, check_names=False) + + +# Currently errors due to string conversion +# as col.size is called as a property not method in pandas +# see L255-L257 in pandas/core/interchange/from_dataframe.py +@pytest.mark.pandas +def test_categorical_roundtrip(): + pytest.skip("Bug in pandas implementation") + + if Version(pd.__version__) < Version("1.5.0"): + pytest.skip("__dataframe__ added to pandas in 1.5.0") + arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"] + table = pa.table( + {"weekday": pa.array(arr).dictionary_encode()} + ) + + pandas_df = table.to_pandas() + result = pi.from_dataframe(pandas_df) + + # Checking equality for the values + # As the dtype of the indices is changed from int32 in pa.Table + # to int64 in pandas interchange protocol implementation + assert result[0].chunk(0).dictionary == table[0].chunk(0).dictionary + + table_protocol = table.__dataframe__() + result_protocol = result.__dataframe__() + + assert table_protocol.num_columns() == result_protocol.num_columns() + assert table_protocol.num_rows() == result_protocol.num_rows() + assert table_protocol.num_chunks() == result_protocol.num_chunks() + assert table_protocol.column_names() == result_protocol.column_names() + + col_table = table_protocol.get_column(0) + col_result = result_protocol.get_column(0) + + assert col_result.dtype[0] == DtypeKind.CATEGORICAL + assert col_result.dtype[0] == col_table.dtype[0] + assert col_result.size == col_table.size + assert col_result.offset == col_table.offset + + desc_cat_table = col_result.describe_categorical + desc_cat_result = col_result.describe_categorical + + assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"] + assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"] + assert isinstance(desc_cat_result["categories"]._col, pa.Array) + + +@pytest.mark.pandas +@pytest.mark.parametrize( + "uint", [pa.uint8(), pa.uint16(), pa.uint32()] +) +@pytest.mark.parametrize( + "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()] +) +@pytest.mark.parametrize( + "float, np_float", [ + # (pa.float16(), np.float16), #not supported by pandas + (pa.float32(), np.float32), + (pa.float64(), np.float64) + ] +) +def test_pandas_roundtrip(uint, int, float, np_float): + if Version(pd.__version__) < Version("1.5.0"): + pytest.skip("__dataframe__ added to pandas in 1.5.0") + + arr = [1, 2, 3] + table = pa.table( + { + "a": pa.array(arr, type=uint), + "b": pa.array(arr, type=int), + "c": pa.array(np.array(arr, dtype=np_float), type=float), + } + ) + from pandas.api.interchange import ( + from_dataframe as pandas_from_dataframe + ) + pandas_df = pandas_from_dataframe(table) + result = pi.from_dataframe(pandas_df) + assert table.equals(result) + + table_protocol = table.__dataframe__() + result_protocol = result.__dataframe__() + + assert table_protocol.num_columns() == result_protocol.num_columns() + assert table_protocol.num_rows() == result_protocol.num_rows() + assert table_protocol.num_chunks() == result_protocol.num_chunks() + assert 
table_protocol.column_names() == result_protocol.column_names() + + +@pytest.mark.pandas +def test_roundtrip_pandas_string(): + # See https://github.com/pandas-dev/pandas/issues/50554 + if Version(pd.__version__) < Version("1.6"): + pytest.skip(" Column.size() called as a method in pandas 2.0.0") + + # large string is not supported by pandas implementation + table = pa.table({"a": pa.array(["a", "", "c"])}) + + from pandas.api.interchange import ( + from_dataframe as pandas_from_dataframe + ) + pandas_df = pandas_from_dataframe(table) + result = pi.from_dataframe(pandas_df) + + assert result[0].to_pylist() == table[0].to_pylist() + assert pa.types.is_string(table[0].type) + assert pa.types.is_large_string(result[0].type) + + table_protocol = table.__dataframe__() + result_protocol = result.__dataframe__() + + assert table_protocol.num_columns() == result_protocol.num_columns() + assert table_protocol.num_rows() == result_protocol.num_rows() + assert table_protocol.num_chunks() == result_protocol.num_chunks() + assert table_protocol.column_names() == result_protocol.column_names() + + +@pytest.mark.pandas +def test_roundtrip_pandas_boolean(): + if Version(pd.__version__) < Version("1.5.0"): + pytest.skip("__dataframe__ added to pandas in 1.5.0") + + table = pa.table({"a": [True, False, True]}) + + from pandas.api.interchange import ( + from_dataframe as pandas_from_dataframe + ) + pandas_df = pandas_from_dataframe(table) + result = pi.from_dataframe(pandas_df) + + assert table.equals(result) + + table_protocol = table.__dataframe__() + result_protocol = result.__dataframe__() + + assert table_protocol.num_columns() == result_protocol.num_columns() + assert table_protocol.num_rows() == result_protocol.num_rows() + assert table_protocol.num_chunks() == result_protocol.num_chunks() + assert table_protocol.column_names() == result_protocol.column_names() + + +@pytest.mark.pandas +@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns']) +def test_roundtrip_pandas_datetime(unit): + if Version(pd.__version__) < Version("1.5.0"): + pytest.skip("__dataframe__ added to pandas in 1.5.0") + from datetime import datetime as dt + + # timezones not included as they are not yet supported in + # the pandas implementation + dt_arr = [dt(2007, 7, 13), dt(2007, 7, 14), dt(2007, 7, 15)] + table = pa.table({"a": pa.array(dt_arr, type=pa.timestamp(unit))}) + + if Version(pd.__version__) < Version("1.6"): + # pandas < 2.0 always creates datetime64 in "ns" + # resolution + expected = pa.table({"a": pa.array(dt_arr, type=pa.timestamp('ns'))}) + else: + expected = table + + from pandas.api.interchange import ( + from_dataframe as pandas_from_dataframe + ) + pandas_df = pandas_from_dataframe(table) + result = pi.from_dataframe(pandas_df) + + assert expected.equals(result) + + expected_protocol = expected.__dataframe__() + result_protocol = result.__dataframe__() + + assert expected_protocol.num_columns() == result_protocol.num_columns() + assert expected_protocol.num_rows() == result_protocol.num_rows() + assert expected_protocol.num_chunks() == result_protocol.num_chunks() + assert expected_protocol.column_names() == result_protocol.column_names() + + +@pytest.mark.large_memory +@pytest.mark.pandas +def test_pandas_assertion_error_large_string(): + # Test AssertionError as pandas does not support "U" type strings + if Version(pd.__version__) < Version("1.5.0"): + pytest.skip("__dataframe__ added to pandas in 1.5.0") + + data = np.array([b'x'*1024]*(3*1024**2), dtype='object') # 3GB bytes data + arr = pa.array(data, 
type=pa.large_string()) + table = pa.table([arr], names=["large_string"]) + + from pandas.api.interchange import ( + from_dataframe as pandas_from_dataframe + ) + + with pytest.raises(AssertionError): + pandas_from_dataframe(table) + + +@pytest.mark.pandas +@pytest.mark.parametrize( + "np_float", [np.float32, np.float64] +) +def test_pandas_to_pyarrow_with_missing(np_float): + if Version(pd.__version__) < Version("1.5.0"): + pytest.skip("__dataframe__ added to pandas in 1.5.0") + + np_array = np.array([0, np.nan, 2], dtype=np_float) + datetime_array = [None, dt(2007, 7, 14), dt(2007, 7, 15)] + df = pd.DataFrame({ + "a": np_array, # float, ColumnNullType.USE_NAN + "dt": datetime_array # ColumnNullType.USE_SENTINEL + }) + expected = pa.table({ + "a": pa.array(np_array, from_pandas=True), + "dt": pa.array(datetime_array, type=pa.timestamp("ns")) + }) + result = pi.from_dataframe(df) + + assert result.equals(expected) + + +@pytest.mark.pandas +def test_pandas_to_pyarrow_float16_with_missing(): + if Version(pd.__version__) < Version("1.5.0"): + pytest.skip("__dataframe__ added to pandas in 1.5.0") + + # np.float16 errors if ps.is_nan is used + # pyarrow.lib.ArrowNotImplementedError: Function 'is_nan' has no kernel + # matching input types (halffloat) + np_array = np.array([0, np.nan, 2], dtype=np.float16) + df = pd.DataFrame({"a": np_array}) + + with pytest.raises(NotImplementedError): + pi.from_dataframe(df) + + +@pytest.mark.pandas +def test_pandas_to_pyarrow_string_with_missing(): + if Version(pd.__version__) < Version("1.5.0"): + pytest.skip("__dataframe__ added to pandas in 1.5.0") + + # pandas is using int64 offsets for string dtype so the constructed + # pyarrow string column will always be a large_string data type + arr = { + "Y": ["a", "b", None], # bool, ColumnNullType.USE_BYTEMASK, + } + df = pd.DataFrame(arr) + expected = pa.table(arr) + result = pi.from_dataframe(df) + + assert result[0].to_pylist() == expected[0].to_pylist() + assert pa.types.is_string(expected[0].type) + assert pa.types.is_large_string(result[0].type) + + +@pytest.mark.pandas +def test_pandas_to_pyarrow_categorical_with_missing(): + if Version(pd.__version__) < Version("1.5.0"): + pytest.skip("__dataframe__ added to pandas in 1.5.0") + + arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", None] + df = pd.DataFrame( + {"weekday": arr} + ) + df = df.astype("category") + result = pi.from_dataframe(df) + + expected_dictionary = ["Fri", "Mon", "Sat", "Thu", "Tue", "Wed"] + expected_indices = pa.array([1, 4, 1, 5, 1, 3, 0, 2, None], type=pa.int8()) + + assert result[0].to_pylist() == arr + assert result[0].chunk(0).dictionary.to_pylist() == expected_dictionary + assert result[0].chunk(0).indices.equals(expected_indices) + + +@pytest.mark.parametrize( + "uint", [pa.uint8(), pa.uint16(), pa.uint32()] +) +@pytest.mark.parametrize( + "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()] +) +@pytest.mark.parametrize( + "float, np_float", [ + (pa.float16(), np.float16), + (pa.float32(), np.float32), + (pa.float64(), np.float64) + ] +) +@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns']) +@pytest.mark.parametrize("tz", ['America/New_York', '+07:30', '-04:30']) +@pytest.mark.parametrize("offset, length", [(0, 3), (0, 2), (1, 2), (2, 1)]) +def test_pyarrow_roundtrip(uint, int, float, np_float, + unit, tz, offset, length): + + from datetime import datetime as dt + arr = [1, 2, None] + dt_arr = [dt(2007, 7, 13), None, dt(2007, 7, 15)] + + table = pa.table( + { + "a": pa.array(arr, type=uint), + "b": 
+            "b": pa.array(arr, type=int),
+            "c": pa.array(np.array(arr, dtype=np_float),
+                          type=float, from_pandas=True),
+            "d": [True, False, True],
+            "e": [True, False, None],
+            "f": ["a", None, "c"],
+            "g": pa.array(dt_arr, type=pa.timestamp(unit, tz=tz))
+        }
+    )
+    table = table.slice(offset, length)
+    result = _from_dataframe(table.__dataframe__())
+
+    assert table.equals(result)
+
+    table_protocol = table.__dataframe__()
+    result_protocol = result.__dataframe__()
+
+    assert table_protocol.num_columns() == result_protocol.num_columns()
+    assert table_protocol.num_rows() == result_protocol.num_rows()
+    assert table_protocol.num_chunks() == result_protocol.num_chunks()
+    assert table_protocol.column_names() == result_protocol.column_names()
+
+
+@pytest.mark.parametrize("offset, length", [(0, 10), (0, 2), (7, 3), (2, 1)])
+def test_pyarrow_roundtrip_categorical(offset, length):
+    arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", None, "Sun"]
+    table = pa.table(
+        {"weekday": pa.array(arr).dictionary_encode()}
+    )
+    table = table.slice(offset, length)
+    result = _from_dataframe(table.__dataframe__())
+
+    assert table.equals(result)
+
+    table_protocol = table.__dataframe__()
+    result_protocol = result.__dataframe__()
+
+    assert table_protocol.num_columns() == result_protocol.num_columns()
+    assert table_protocol.num_rows() == result_protocol.num_rows()
+    assert table_protocol.num_chunks() == result_protocol.num_chunks()
+    assert table_protocol.column_names() == result_protocol.column_names()
+
+    col_table = table_protocol.get_column(0)
+    col_result = result_protocol.get_column(0)
+
+    assert col_result.dtype[0] == DtypeKind.CATEGORICAL
+    assert col_result.dtype[0] == col_table.dtype[0]
+    assert col_result.size() == col_table.size()
+    assert col_result.offset == col_table.offset
+
+    desc_cat_table = col_table.describe_categorical
+    desc_cat_result = col_result.describe_categorical
+
+    assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"]
+    assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"]
+    assert isinstance(desc_cat_result["categories"]._col, pa.Array)
+
+
+@pytest.mark.large_memory
+def test_pyarrow_roundtrip_large_string():
+
+    data = np.array([b'x'*1024]*(3*1024**2), dtype='object')  # 3GB bytes data
+    arr = pa.array(data, type=pa.large_string())
+    table = pa.table([arr], names=["large_string"])
+
+    result = _from_dataframe(table.__dataframe__())
+    col = result.__dataframe__().get_column(0)
+
+    assert col.size() == 3*1024**2
+    assert pa.types.is_large_string(table[0].type)
+    assert pa.types.is_large_string(result[0].type)
+
+    assert table.equals(result)
+
+
+def test_nan_as_null():
+    table = pa.table({"a": [1, 2, 3, 4]})
+    with pytest.raises(RuntimeError):
+        table.__dataframe__(nan_as_null=True)
+
+
+@pytest.mark.pandas
+def test_allow_copy_false():
+    if Version(pd.__version__) < Version("1.5.0"):
+        pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+    # Test that an error is raised when a copy is needed
+    # to create a bitmask
+
+    df = pd.DataFrame({"a": [0, 1.0, 2.0]})
+    with pytest.raises(RuntimeError):
+        pi.from_dataframe(df, allow_copy=False)
+
+    df = pd.DataFrame({
+        "dt": [None, dt(2007, 7, 14), dt(2007, 7, 15)]
+    })
+    with pytest.raises(RuntimeError):
+        pi.from_dataframe(df, allow_copy=False)
+
+
+@pytest.mark.pandas
+def test_allow_copy_false_bool_categorical():
+    if Version(pd.__version__) < Version("1.5.0"):
+        pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+    # Test that an error is raised for boolean
+    # and categorical dtype (copy is
always made) + + df = pd.DataFrame({"a": [None, False, True]}) + with pytest.raises(RuntimeError): + pi.from_dataframe(df, allow_copy=False) + + df = pd.DataFrame({"a": [True, False, True]}) + with pytest.raises(RuntimeError): + pi.from_dataframe(df, allow_copy=False) + + df = pd.DataFrame({"weekday": ["a", "b", None]}) + df = df.astype("category") + with pytest.raises(RuntimeError): + pi.from_dataframe(df, allow_copy=False) + + df = pd.DataFrame({"weekday": ["a", "b", "c"]}) + df = df.astype("category") + with pytest.raises(RuntimeError): + pi.from_dataframe(df, allow_copy=False) diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py new file mode 100644 index 00000000000..42ec8053599 --- /dev/null +++ b/python/pyarrow/tests/interchange/test_interchange_spec.py @@ -0,0 +1,243 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import ctypes +import hypothesis as h +import hypothesis.strategies as st + +import numpy as np +import pyarrow as pa +import pyarrow.tests.strategies as past +import pytest + + +all_types = st.deferred( + lambda: ( + past.signed_integer_types | + past.unsigned_integer_types | + past.floating_types | + past.bool_type | + past.string_type | + past.large_string_type + ) +) + + +# datetime is tested in test_extra.py +# dictionary is tested in test_categorical() +@h.given(past.arrays(all_types, size=3)) +def test_dtypes(arr): + table = pa.table([arr], names=["a"]) + df = table.__dataframe__() + + null_count = df.get_column(0).null_count + assert null_count == arr.null_count + assert isinstance(null_count, int) + assert df.get_column(0).size() == 3 + assert df.get_column(0).offset == 0 + + +@pytest.mark.parametrize( + "uint, uint_bw", + [ + (pa.uint8(), 8), + (pa.uint16(), 16), + (pa.uint32(), 32) + ] +) +@pytest.mark.parametrize( + "int, int_bw", [ + (pa.int8(), 8), + (pa.int16(), 16), + (pa.int32(), 32), + (pa.int64(), 64) + ] +) +@pytest.mark.parametrize( + "float, float_bw, np_float", [ + (pa.float16(), 16, np.float16), + (pa.float32(), 32, np.float32), + (pa.float64(), 64, np.float64) + ] +) +@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns']) +@pytest.mark.parametrize("tz", ['', 'America/New_York', '+07:30', '-04:30']) +def test_mixed_dtypes(uint, uint_bw, int, int_bw, + float, float_bw, np_float, unit, tz): + from datetime import datetime as dt + arr = [1, 2, 3] + dt_arr = [dt(2007, 7, 13), dt(2007, 7, 14), dt(2007, 7, 15)] + table = pa.table( + { + "a": pa.array(arr, type=uint), + "b": pa.array(arr, type=int), + "c": pa.array(np.array(arr, dtype=np_float), type=float), + "d": [True, False, True], + "e": ["a", "", "c"], + "f": pa.array(dt_arr, type=pa.timestamp(unit, tz=tz)) + } + ) + df = table.__dataframe__() + # 0 = DtypeKind.INT, 1 = 
DtypeKind.UINT, 2 = DtypeKind.FLOAT, + # 20 = DtypeKind.BOOL, 21 = DtypeKind.STRING, 22 = DtypeKind.DATETIME + # see DtypeKind class in column.py + columns = {"a": 1, "b": 0, "c": 2, "d": 20, "e": 21, "f": 22} + + for column, kind in columns.items(): + col = df.get_column_by_name(column) + + assert col.null_count == 0 + assert col.size() == 3 + assert col.offset == 0 + assert col.dtype[0] == kind + + assert df.get_column_by_name("a").dtype[1] == uint_bw + assert df.get_column_by_name("b").dtype[1] == int_bw + assert df.get_column_by_name("c").dtype[1] == float_bw + + +def test_na_float(): + table = pa.table({"a": [1.0, None, 2.0]}) + df = table.__dataframe__() + col = df.get_column_by_name("a") + assert col.null_count == 1 + assert isinstance(col.null_count, int) + + +def test_noncategorical(): + table = pa.table({"a": [1, 2, 3]}) + df = table.__dataframe__() + col = df.get_column_by_name("a") + with pytest.raises(TypeError, match=".*categorical.*"): + col.describe_categorical + + +def test_categorical(): + import pyarrow as pa + arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", None] + table = pa.table( + {"weekday": pa.array(arr).dictionary_encode()} + ) + + col = table.__dataframe__().get_column_by_name("weekday") + categorical = col.describe_categorical + assert isinstance(categorical["is_ordered"], bool) + assert isinstance(categorical["is_dictionary"], bool) + + +def test_dataframe(): + n = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + a = pa.chunked_array([["Flamingo", "Parrot", "Cow"], + ["Horse", "Brittle stars", "Centipede"]]) + table = pa.table([n, a], names=['n_legs', 'animals']) + df = table.__dataframe__() + + assert df.num_columns() == 2 + assert df.num_rows() == 6 + assert df.num_chunks() == 2 + assert list(df.column_names()) == ['n_legs', 'animals'] + assert list(df.select_columns((1,)).column_names()) == list( + df.select_columns_by_name(("animals",)).column_names() + ) + + +@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) +def test_df_get_chunks(size, n_chunks): + table = pa.table({"x": list(range(size))}) + df = table.__dataframe__() + chunks = list(df.get_chunks(n_chunks)) + assert len(chunks) == n_chunks + assert sum(chunk.num_rows() for chunk in chunks) == size + + +@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) +def test_column_get_chunks(size, n_chunks): + table = pa.table({"x": list(range(size))}) + df = table.__dataframe__() + chunks = list(df.get_column(0).get_chunks(n_chunks)) + assert len(chunks) == n_chunks + assert sum(chunk.size() for chunk in chunks) == size + + +@pytest.mark.pandas +@pytest.mark.parametrize( + "uint", [pa.uint8(), pa.uint16(), pa.uint32()] +) +@pytest.mark.parametrize( + "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()] +) +@pytest.mark.parametrize( + "float, np_float", [ + (pa.float16(), np.float16), + (pa.float32(), np.float32), + (pa.float64(), np.float64) + ] +) +def test_get_columns(uint, int, float, np_float): + arr = [[1, 2, 3], [4, 5]] + arr_float = np.array([1, 2, 3, 4, 5], dtype=np_float) + table = pa.table( + { + "a": pa.chunked_array(arr, type=uint), + "b": pa.chunked_array(arr, type=int), + "c": pa.array(arr_float, type=float) + } + ) + df = table.__dataframe__() + for col in df.get_columns(): + assert col.size() == 5 + assert col.num_chunks() == 1 + + # 0 = DtypeKind.INT, 1 = DtypeKind.UINT, 2 = DtypeKind.FLOAT, + # see DtypeKind class in column.py + assert df.get_column(0).dtype[0] == 1 # UINT + assert df.get_column(1).dtype[0] == 0 # INT + assert 
df.get_column(2).dtype[0] == 2 # FLOAT + + +@pytest.mark.parametrize( + "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()] +) +def test_buffer(int): + arr = [0, 1, -1] + table = pa.table({"a": pa.array(arr, type=int)}) + df = table.__dataframe__() + col = df.get_column(0) + buf = col.get_buffers() + + dataBuf, dataDtype = buf["data"] + + assert dataBuf.bufsize > 0 + assert dataBuf.ptr != 0 + device, _ = dataBuf.__dlpack_device__() + + # 0 = DtypeKind.INT + # see DtypeKind class in column.py + assert dataDtype[0] == 0 + + if device == 1: # CPU-only as we're going to directly read memory here + bitwidth = dataDtype[1] + ctype = { + 8: ctypes.c_int8, + 16: ctypes.c_int16, + 32: ctypes.c_int32, + 64: ctypes.c_int64, + }[bitwidth] + + for idx, truth in enumerate(arr): + val = ctype.from_address(dataBuf.ptr + idx * (bitwidth // 8)).value + assert val == truth, f"Buffer at index {idx} mismatch"
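
The tests above exercise the protocol end to end; for orientation, the following minimal sketch (not part of the patch, variable names illustrative) shows how the same objects are consumed directly. It relies only on the API surface the tests already use: pa.table, Table.__dataframe__, pyarrow.interchange.from_dataframe, and DtypeKind from pyarrow.interchange.column.

    import pyarrow as pa
    import pyarrow.interchange as pi
    from pyarrow.interchange.column import DtypeKind

    # Build a small table and view it through the interchange protocol.
    table = pa.table({"n_legs": [2, 4, 5],
                      "animals": ["Flamingo", "Horse", None]})
    df = table.__dataframe__()

    assert df.num_columns() == 2
    assert df.num_rows() == 3

    # Column-level metadata mirrors what the spec tests assert above:
    # dtype is a (kind, bit width, format string, endianness) tuple.
    col = df.get_column_by_name("animals")
    assert col.dtype[0] == DtypeKind.STRING
    assert col.null_count == 1
    assert col.size() == 3

    # Any object exposing __dataframe__ (a pandas DataFrame, another
    # pyarrow Table, ...) can be turned back into a pyarrow Table.
    roundtripped = pi.from_dataframe(table)
    assert roundtripped.equals(table)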