diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 2ef42051d9a..91770a52199 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1651,16 +1651,30 @@ cdef class Array(_PandasConvertible): array = array.copy() return array - def to_pylist(self): + def to_pylist(self, *, maps_as_pydicts=None): """ Convert to a list of native Python objects. + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + Returns ------- lst : list """ self._assert_cpu() - return [x.as_py() for x in self] + return [x.as_py(maps_as_pydicts=maps_as_pydicts) for x in self] def tolist(self): """ @@ -2286,12 +2300,18 @@ cdef class MonthDayNanoIntervalArray(Array): Concrete class for Arrow arrays of interval[MonthDayNano] type. """ - def to_pylist(self): + def to_pylist(self, *, maps_as_pydicts=None): """ Convert to a list of native Python objects. pyarrow.MonthDayNano is used as the native representation. + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. + Returns ------- lst : list diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index e877b0965d1..04442c1f5d2 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -16,6 +16,7 @@ # under the License. import collections +import warnings from uuid import UUID @@ -148,7 +149,24 @@ cdef class Scalar(_Weakrefable): def __reduce__(self): return scalar, (self.as_py(), self.type) - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): + """ + Return this value as a Python representation. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + """ raise NotImplementedError() @@ -169,9 +187,15 @@ cdef class NullScalar(Scalar): def __init__(self): pass - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python None. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ return None @@ -184,9 +208,15 @@ cdef class BooleanScalar(Scalar): Concrete class for boolean scalars. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python bool. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef CBooleanScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -197,9 +227,15 @@ cdef class UInt8Scalar(Scalar): Concrete class for uint8 scalars. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python int. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef CUInt8Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -210,9 +246,15 @@ cdef class Int8Scalar(Scalar): Concrete class for int8 scalars. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python int. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef CInt8Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -223,9 +265,15 @@ cdef class UInt16Scalar(Scalar): Concrete class for uint16 scalars. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python int. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef CUInt16Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -236,9 +284,15 @@ cdef class Int16Scalar(Scalar): Concrete class for int16 scalars. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python int. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef CInt16Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -249,9 +303,15 @@ cdef class UInt32Scalar(Scalar): Concrete class for uint32 scalars. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python int. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef CUInt32Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -262,9 +322,15 @@ cdef class Int32Scalar(Scalar): Concrete class for int32 scalars. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python int. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef CInt32Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -275,9 +341,15 @@ cdef class UInt64Scalar(Scalar): Concrete class for uint64 scalars. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python int. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef CUInt64Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -288,9 +360,15 @@ cdef class Int64Scalar(Scalar): Concrete class for int64 scalars. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python int. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef CInt64Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -301,9 +379,15 @@ cdef class HalfFloatScalar(Scalar): Concrete class for float scalars. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python float. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef CHalfFloatScalar* sp = self.wrapped.get() return PyHalf_FromHalf(sp.value) if sp.is_valid else None @@ -314,9 +398,15 @@ cdef class FloatScalar(Scalar): Concrete class for float scalars. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python float. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef CFloatScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -327,9 +417,15 @@ cdef class DoubleScalar(Scalar): Concrete class for double scalars. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python float. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef CDoubleScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -340,9 +436,15 @@ cdef class Decimal32Scalar(Scalar): Concrete class for decimal32 scalars. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python Decimal. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef: CDecimal32Scalar* sp = self.wrapped.get() @@ -360,9 +462,15 @@ cdef class Decimal64Scalar(Scalar): Concrete class for decimal64 scalars. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python Decimal. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef: CDecimal64Scalar* sp = self.wrapped.get() @@ -380,9 +488,15 @@ cdef class Decimal128Scalar(Scalar): Concrete class for decimal128 scalars. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python Decimal. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef: CDecimal128Scalar* sp = self.wrapped.get() @@ -400,9 +514,15 @@ cdef class Decimal256Scalar(Scalar): Concrete class for decimal256 scalars. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python Decimal. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef: CDecimal256Scalar* sp = self.wrapped.get() @@ -425,9 +545,15 @@ cdef class Date32Scalar(Scalar): cdef CDate32Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python datetime.datetime instance. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef CDate32Scalar* sp = self.wrapped.get() @@ -450,9 +576,15 @@ cdef class Date64Scalar(Scalar): cdef CDate64Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python datetime.datetime instance. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef CDate64Scalar* sp = self.wrapped.get() @@ -504,9 +636,15 @@ cdef class Time32Scalar(Scalar): cdef CTime32Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python datetime.timedelta instance. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef: CTime32Scalar* sp = self.wrapped.get() @@ -528,9 +666,15 @@ cdef class Time64Scalar(Scalar): cdef CTime64Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python datetime.timedelta instance. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef: CTime64Scalar* sp = self.wrapped.get() @@ -552,11 +696,17 @@ cdef class TimestampScalar(Scalar): cdef CTimestampScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Pandas Timestamp instance (if units are nanoseconds and pandas is available), otherwise as a Python datetime.datetime instance. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef: CTimestampScalar* sp = self.wrapped.get() @@ -600,11 +750,17 @@ cdef class DurationScalar(Scalar): cdef CDurationScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Pandas Timedelta instance (if units are nanoseconds and pandas is available), otherwise as a Python datetime.timedelta instance. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef: CDurationScalar* sp = self.wrapped.get() @@ -647,9 +803,15 @@ cdef class MonthDayNanoIntervalScalar(Scalar): """ return self.as_py() - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a pyarrow.MonthDayNano. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ cdef: PyObject* val @@ -672,9 +834,15 @@ cdef class BinaryScalar(Scalar): cdef CBaseBinaryScalar* sp = self.wrapped.get() return pyarrow_wrap_buffer(sp.value) if sp.is_valid else None - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python bytes. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ buffer = self.as_buffer() return None if buffer is None else buffer.to_pybytes() @@ -693,9 +861,15 @@ cdef class StringScalar(BinaryScalar): Concrete class for string-like (utf8) scalars. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python string. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ buffer = self.as_buffer() return None if buffer is None else str(buffer, 'utf8') @@ -744,12 +918,26 @@ cdef class ListScalar(Scalar): """ return iter(self.values) - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python list. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. """ arr = self.values - return None if arr is None else arr.to_pylist() + return None if arr is None else arr.to_pylist(maps_as_pydicts=maps_as_pydicts) cdef class FixedSizeListScalar(ListScalar): @@ -824,13 +1012,27 @@ cdef class StructScalar(Scalar, collections.abc.Mapping): else: raise KeyError(key) from exc - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python dict. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. """ if self.is_valid: try: - return {k: self[k].as_py() for k in self.keys()} + return {k: self[k].as_py(maps_as_pydicts=maps_as_pydicts) for k in self.keys()} except KeyError: raise ValueError( "Converting to Python dictionary is not supported when " @@ -880,12 +1082,47 @@ cdef class MapScalar(ListScalar): for k, v in zip(arr.field(self.type.key_field.name), arr.field(self.type.item_field.name)): yield (k.as_py(), v.as_py()) - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ - Return this value as a Python list. + Return this value as a Python list or dict, depending on 'maps_as_pydicts'. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. """ - cdef CStructScalar* sp = self.wrapped.get() - return list(self) if sp.is_valid else None + if maps_as_pydicts not in (None, "lossy", "strict"): + raise ValueError( + "Invalid value for 'maps_as_pydicts': " + + "valid values are 'lossy', 'strict' or `None` (default). " + + f"Received {maps_as_pydicts!r}." + ) + if not self.is_valid: + return None + if not maps_as_pydicts: + return list(self) + result_dict = {} + for key, value in self: + if key in result_dict: + if maps_as_pydicts == "strict": + raise KeyError( + "Converting to Python dictionary is not supported in strict mode " + f"when duplicate keys are present (duplicate key was '{key}')." + ) + else: + warnings.warn( + f"Encountered key '{key}' which was already encountered.") + result_dict[key] = value + return result_dict cdef class DictionaryScalar(Scalar): @@ -958,11 +1195,25 @@ cdef class DictionaryScalar(Scalar): cdef CDictionaryScalar* sp = self.wrapped.get() return pyarrow_wrap_array(sp.value.dictionary) - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this encoded value as a Python object. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. """ - return self.value.as_py() if self.is_valid else None + return self.value.as_py(maps_as_pydicts=maps_as_pydicts) if self.is_valid else None cdef class RunEndEncodedScalar(Scalar): @@ -977,11 +1228,25 @@ cdef class RunEndEncodedScalar(Scalar): cdef CRunEndEncodedScalar* sp = self.wrapped.get() return Scalar.wrap(sp.value) - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return underlying value as a Python object. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. """ - return self.value.as_py() + return self.value.as_py(maps_as_pydicts=maps_as_pydicts) cdef class UnionScalar(Scalar): @@ -1003,12 +1268,26 @@ cdef class UnionScalar(Scalar): dp = self.wrapped.get() return Scalar.wrap(dp.value) if dp.is_valid else None - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return underlying value as a Python object. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. """ value = self.value - return None if value is None else value.as_py() + return None if value is None else value.as_py(maps_as_pydicts=maps_as_pydicts) @property def type_code(self): @@ -1032,11 +1311,25 @@ cdef class ExtensionScalar(Scalar): cdef CExtensionScalar* sp = self.wrapped.get() return Scalar.wrap(sp.value) if sp.is_valid else None - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this scalar as a Python object. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. """ - return None if self.value is None else self.value.as_py() + return None if self.value is None else self.value.as_py(maps_as_pydicts=maps_as_pydicts) @staticmethod def from_storage(BaseExtensionType typ, value): @@ -1093,7 +1386,16 @@ class UuidScalar(ExtensionScalar): Concrete class for Uuid extension scalar. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): + """ + Return this scalar as a Python UUID. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. + """ return None if self.value is None else UUID(bytes=self.value.as_py()) @@ -1150,9 +1452,15 @@ cdef class Bool8Scalar(ExtensionScalar): Concrete class for bool8 extension scalar. """ - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): """ Return this scalar as a Python object. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + This parameter is ignored for non-nested Scalars. """ py_val = super().as_py() return None if py_val is None else py_val != 0 diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index af241e4be07..5a6cd390489 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1349,10 +1349,24 @@ cdef class ChunkedArray(_PandasConvertible): for i in range(self.num_chunks): yield self.chunk(i) - def to_pylist(self): + def to_pylist(self, *, maps_as_pydicts=None): """ Convert to a list of native Python objects. + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + Examples -------- >>> import pyarrow as pa @@ -1363,7 +1377,7 @@ cdef class ChunkedArray(_PandasConvertible): self._assert_cpu() result = [] for i in range(self.num_chunks): - result += self.chunk(i).to_pylist() + result += self.chunk(i).to_pylist(maps_as_pydicts=maps_as_pydicts) return result def __arrow_c_stream__(self, requested_schema=None): @@ -2255,10 +2269,24 @@ cdef class _Tabular(_PandasConvertible): else: return _pc().filter(self, mask, null_selection_behavior) - def to_pydict(self): + def to_pydict(self, *, maps_as_pydicts=None): """ Convert the Table or RecordBatch to a dict or OrderedDict. + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + Returns ------- dict @@ -2277,14 +2305,28 @@ cdef class _Tabular(_PandasConvertible): entries = [] for i in range(self.num_columns): name = self.field(i).name - column = self[i].to_pylist() + column = self[i].to_pylist(maps_as_pydicts=maps_as_pydicts) entries.append((name, column)) return ordered_dict(entries) - def to_pylist(self): + def to_pylist(self, *, maps_as_pydicts=None): """ Convert the Table or RecordBatch to a list of rows / dictionaries. + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + Returns ------- list @@ -2300,7 +2342,7 @@ cdef class _Tabular(_PandasConvertible): >>> table.to_pylist() [{'n_legs': 2, 'animals': 'Flamingo'}, {'n_legs': 4, 'animals': 'Horse'}, ... """ - pydict = self.to_pydict() + pydict = self.to_pydict(maps_as_pydicts=maps_as_pydicts) names = self.schema.names pylist = [{column: pydict[column][row] for column in names} for row in range(self.num_rows)] diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 634d9ce2d8d..185b5bb424b 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -99,7 +99,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): class ExampleUuidScalarType(pa.ExtensionScalar): - def as_py(self): + def as_py(self, *, maps_as_pydicts=None): return None if self.value is None else UUID(bytes=self.value.as_py()) diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 3f4a53c473e..29db36eddc7 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -786,6 +786,22 @@ def test_map(pickle_module): restored = pickle_module.loads(pickle_module.dumps(s)) assert restored.equals(s) + assert s.as_py(maps_as_pydicts="strict") == {'a': 1, 'b': 2} + + +def test_map_duplicate_fields(): + ty = pa.map_(pa.string(), pa.int8()) + v = [('a', 1), ('a', 2)] + s = pa.scalar(v, type=ty) + + assert s.as_py(maps_as_pydicts=None) == v + + with pytest.raises(KeyError): + assert s.as_py(maps_as_pydicts="strict") + + with pytest.warns(match="Encountered key 'a' which was already encountered"): + assert s.as_py(maps_as_pydicts="lossy") == {'a': 2} + def test_dictionary(pickle_module): indices = pa.array([2, None, 1, 2, 0, None]) @@ -898,3 +914,15 @@ def test_map_scalar_as_py_with_custom_field_name(): pa.field("custom_value", pa.string()), ), ).as_py() == [("foo", "bar")] + + +def test_nested_map_types_with_maps_as_pydicts(): + ty = pa.struct([ + pa.field('x', pa.map_(pa.string(), pa.int8())), + pa.field('y', pa.list_(pa.map_(pa.string(), pa.int8()))), + ]) + + v = {'x': {'a': 1}, 'y': [{'b': 2}, {'c': 3}]} + s = pa.scalar(v, type=ty) + + assert s.as_py(maps_as_pydicts="strict") == v diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 4c058ccecda..180ae7b4c1a 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -1888,6 +1888,26 @@ def test_table_unify_dictionaries(): assert table.schema.metadata == {b"key1": b"value1"} +def test_table_maps_as_pydicts(): + arrays = [ + pa.array( + [{'x': 1, 'y': 2}, {'z': 3}], + type=pa.map_(pa.string(), pa.int32()) + ) + ] + table = pa.Table.from_arrays(arrays, names=['a']) + + table_dict = table.to_pydict(maps_as_pydicts="strict") + assert 'a' in table_dict + column_list = table_dict['a'] + assert len(column_list) == 2 + assert column_list == [{'x': 1, 'y': 2}, {'z': 3}] + + table_list = table.to_pylist(maps_as_pydicts="strict") + assert len(table_list) == 2 + assert table_list == [{'a': {'x': 1, 'y': 2}}, {'a': {'z': 3}}] + + def test_concat_tables(): data = [ list(range(5)),