From 1b7c569db8f2843a07fb22f74a99256197fb62ab Mon Sep 17 00:00:00 2001 From: Jonas Dedden Date: Sun, 9 Feb 2025 20:40:44 +0100 Subject: [PATCH 1/7] GH-39010: [Python] Introduce `maps_as_pydicts` parameter Fix ExampleUuidScalarType Add tests for `maps_as_pydicts` Add test for duplicate map keys Formatting fixes Add docstring for 'maps_as_pydicts' Formatting fixes Call from_arrays from Table Fix last hopefully issues Correct MapScalar method "as_py" when there are multiple keys present --- python/pyarrow/array.pxi | 16 +- python/pyarrow/scalar.pxi | 291 ++++++++++++++++---- python/pyarrow/table.pxi | 27 +- python/pyarrow/tests/test_extension_type.py | 2 +- python/pyarrow/tests/test_scalars.py | 25 ++ python/pyarrow/tests/test_table.py | 20 ++ 6 files changed, 324 insertions(+), 57 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 2ef42051d9a..87dbf16a9c8 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1651,16 +1651,21 @@ cdef class Array(_PandasConvertible): array = array.copy() return array - def to_pylist(self): + def to_pylist(self, maps_as_pydicts=False): """ Convert to a list of native Python objects. + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + Returns ------- lst : list """ self._assert_cpu() - return [x.as_py() for x in self] + return [x.as_py(maps_as_pydicts=maps_as_pydicts) for x in self] def tolist(self): """ @@ -2286,12 +2291,17 @@ cdef class MonthDayNanoIntervalArray(Array): Concrete class for Arrow arrays of interval[MonthDayNano] type. """ - def to_pylist(self): + def to_pylist(self, maps_as_pydicts=False): """ Convert to a list of native Python objects. pyarrow.MonthDayNano is used as the native representation. + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + Returns ------- lst : list diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index e877b0965d1..d3450e390b1 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -148,7 +148,15 @@ cdef class Scalar(_Weakrefable): def __reduce__(self): return scalar, (self.as_py(), self.type) - def as_py(self): + def as_py(self, maps_as_pydicts=False): + """ + Return this value as a Python representation. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + """ raise NotImplementedError() @@ -169,9 +177,14 @@ cdef class NullScalar(Scalar): def __init__(self): pass - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python None. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ return None @@ -184,9 +197,14 @@ cdef class BooleanScalar(Scalar): Concrete class for boolean scalars. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python bool. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef CBooleanScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -197,9 +215,14 @@ cdef class UInt8Scalar(Scalar): Concrete class for uint8 scalars. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python int. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef CUInt8Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -210,9 +233,14 @@ cdef class Int8Scalar(Scalar): Concrete class for int8 scalars. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python int. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef CInt8Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -223,9 +251,14 @@ cdef class UInt16Scalar(Scalar): Concrete class for uint16 scalars. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python int. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef CUInt16Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -236,9 +269,14 @@ cdef class Int16Scalar(Scalar): Concrete class for int16 scalars. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python int. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef CInt16Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -249,9 +287,14 @@ cdef class UInt32Scalar(Scalar): Concrete class for uint32 scalars. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python int. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef CUInt32Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -262,9 +305,14 @@ cdef class Int32Scalar(Scalar): Concrete class for int32 scalars. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python int. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef CInt32Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -275,9 +323,14 @@ cdef class UInt64Scalar(Scalar): Concrete class for uint64 scalars. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python int. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef CUInt64Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -288,9 +341,14 @@ cdef class Int64Scalar(Scalar): Concrete class for int64 scalars. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python int. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef CInt64Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -301,9 +359,14 @@ cdef class HalfFloatScalar(Scalar): Concrete class for float scalars. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python float. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef CHalfFloatScalar* sp = self.wrapped.get() return PyHalf_FromHalf(sp.value) if sp.is_valid else None @@ -314,9 +377,14 @@ cdef class FloatScalar(Scalar): Concrete class for float scalars. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python float. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef CFloatScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -327,9 +395,14 @@ cdef class DoubleScalar(Scalar): Concrete class for double scalars. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python float. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef CDoubleScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -340,9 +413,14 @@ cdef class Decimal32Scalar(Scalar): Concrete class for decimal32 scalars. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python Decimal. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef: CDecimal32Scalar* sp = self.wrapped.get() @@ -360,9 +438,14 @@ cdef class Decimal64Scalar(Scalar): Concrete class for decimal64 scalars. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python Decimal. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef: CDecimal64Scalar* sp = self.wrapped.get() @@ -380,9 +463,14 @@ cdef class Decimal128Scalar(Scalar): Concrete class for decimal128 scalars. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python Decimal. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef: CDecimal128Scalar* sp = self.wrapped.get() @@ -400,9 +488,14 @@ cdef class Decimal256Scalar(Scalar): Concrete class for decimal256 scalars. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python Decimal. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef: CDecimal256Scalar* sp = self.wrapped.get() @@ -425,9 +518,14 @@ cdef class Date32Scalar(Scalar): cdef CDate32Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python datetime.datetime instance. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef CDate32Scalar* sp = self.wrapped.get() @@ -450,9 +548,14 @@ cdef class Date64Scalar(Scalar): cdef CDate64Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python datetime.datetime instance. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef CDate64Scalar* sp = self.wrapped.get() @@ -504,9 +607,14 @@ cdef class Time32Scalar(Scalar): cdef CTime32Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python datetime.timedelta instance. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef: CTime32Scalar* sp = self.wrapped.get() @@ -528,9 +636,14 @@ cdef class Time64Scalar(Scalar): cdef CTime64Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python datetime.timedelta instance. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef: CTime64Scalar* sp = self.wrapped.get() @@ -552,11 +665,16 @@ cdef class TimestampScalar(Scalar): cdef CTimestampScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Pandas Timestamp instance (if units are nanoseconds and pandas is available), otherwise as a Python datetime.datetime instance. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef: CTimestampScalar* sp = self.wrapped.get() @@ -600,11 +718,16 @@ cdef class DurationScalar(Scalar): cdef CDurationScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Pandas Timedelta instance (if units are nanoseconds and pandas is available), otherwise as a Python datetime.timedelta instance. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef: CDurationScalar* sp = self.wrapped.get() @@ -647,9 +770,14 @@ cdef class MonthDayNanoIntervalScalar(Scalar): """ return self.as_py() - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a pyarrow.MonthDayNano. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ cdef: PyObject* val @@ -672,9 +800,14 @@ cdef class BinaryScalar(Scalar): cdef CBaseBinaryScalar* sp = self.wrapped.get() return pyarrow_wrap_buffer(sp.value) if sp.is_valid else None - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python bytes. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ buffer = self.as_buffer() return None if buffer is None else buffer.to_pybytes() @@ -693,9 +826,14 @@ cdef class StringScalar(BinaryScalar): Concrete class for string-like (utf8) scalars. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python string. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ buffer = self.as_buffer() return None if buffer is None else str(buffer, 'utf8') @@ -744,12 +882,17 @@ cdef class ListScalar(Scalar): """ return iter(self.values) - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python list. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ arr = self.values - return None if arr is None else arr.to_pylist() + return None if arr is None else arr.to_pylist(maps_as_pydicts=maps_as_pydicts) cdef class FixedSizeListScalar(ListScalar): @@ -824,13 +967,18 @@ cdef class StructScalar(Scalar, collections.abc.Mapping): else: raise KeyError(key) from exc - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this value as a Python dict. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ if self.is_valid: try: - return {k: self[k].as_py() for k in self.keys()} + return {k: self[k].as_py(maps_as_pydicts=maps_as_pydicts) for k in self.keys()} except KeyError: raise ValueError( "Converting to Python dictionary is not supported when " @@ -880,12 +1028,28 @@ cdef class MapScalar(ListScalar): for k, v in zip(arr.field(self.type.key_field.name), arr.field(self.type.item_field.name)): yield (k.as_py(), v.as_py()) - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ - Return this value as a Python list. + Return this value as a Python list or dict, depending on 'maps_as_pydicts'. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ - cdef CStructScalar* sp = self.wrapped.get() - return list(self) if sp.is_valid else None + if not self.is_valid: + return None + if not maps_as_pydicts: + return list(self) + result_dict = {} + for key, value in self: + if key in result_dict: + raise ValueError( + "Converting to Python dictionary is not supported when " + "duplicate field names are present" + ) + result_dict[key] = value + return result_dict cdef class DictionaryScalar(Scalar): @@ -958,11 +1122,16 @@ cdef class DictionaryScalar(Scalar): cdef CDictionaryScalar* sp = self.wrapped.get() return pyarrow_wrap_array(sp.value.dictionary) - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this encoded value as a Python object. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ - return self.value.as_py() if self.is_valid else None + return self.value.as_py(maps_as_pydicts=maps_as_pydicts) if self.is_valid else None cdef class RunEndEncodedScalar(Scalar): @@ -977,11 +1146,16 @@ cdef class RunEndEncodedScalar(Scalar): cdef CRunEndEncodedScalar* sp = self.wrapped.get() return Scalar.wrap(sp.value) - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return underlying value as a Python object. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ - return self.value.as_py() + return self.value.as_py(maps_as_pydicts=maps_as_pydicts) cdef class UnionScalar(Scalar): @@ -1003,12 +1177,17 @@ cdef class UnionScalar(Scalar): dp = self.wrapped.get() return Scalar.wrap(dp.value) if dp.is_valid else None - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return underlying value as a Python object. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ value = self.value - return None if value is None else value.as_py() + return None if value is None else value.as_py(maps_as_pydicts=maps_as_pydicts) @property def type_code(self): @@ -1032,11 +1211,16 @@ cdef class ExtensionScalar(Scalar): cdef CExtensionScalar* sp = self.wrapped.get() return Scalar.wrap(sp.value) if sp.is_valid else None - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this scalar as a Python object. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ - return None if self.value is None else self.value.as_py() + return None if self.value is None else self.value.as_py(maps_as_pydicts=maps_as_pydicts) @staticmethod def from_storage(BaseExtensionType typ, value): @@ -1093,8 +1277,16 @@ class UuidScalar(ExtensionScalar): Concrete class for Uuid extension scalar. """ - def as_py(self): - return None if self.value is None else UUID(bytes=self.value.as_py()) + def as_py(self, maps_as_pydicts=False): + """ + Return this scalar as a Python UUID. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + """ + return None if self.value is None else UUID(bytes=self.value.as_py(maps_as_pydicts=maps_as_pydicts)) cdef class FixedShapeTensorScalar(ExtensionScalar): @@ -1150,11 +1342,16 @@ cdef class Bool8Scalar(ExtensionScalar): Concrete class for bool8 extension scalar. """ - def as_py(self): + def as_py(self, maps_as_pydicts=False): """ Return this scalar as a Python object. + + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples """ - py_val = super().as_py() + py_val = super().as_py(maps_as_pydicts=maps_as_pydicts) return None if py_val is None else py_val != 0 cdef dict _scalar_classes = { diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index af241e4be07..7a940d4dec6 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1349,10 +1349,15 @@ cdef class ChunkedArray(_PandasConvertible): for i in range(self.num_chunks): yield self.chunk(i) - def to_pylist(self): + def to_pylist(self, maps_as_pydicts=False): """ Convert to a list of native Python objects. + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + Examples -------- >>> import pyarrow as pa @@ -1363,7 +1368,7 @@ cdef class ChunkedArray(_PandasConvertible): self._assert_cpu() result = [] for i in range(self.num_chunks): - result += self.chunk(i).to_pylist() + result += self.chunk(i).to_pylist(maps_as_pydicts=maps_as_pydicts) return result def __arrow_c_stream__(self, requested_schema=None): @@ -2255,10 +2260,15 @@ cdef class _Tabular(_PandasConvertible): else: return _pc().filter(self, mask, null_selection_behavior) - def to_pydict(self): + def to_pydict(self, maps_as_pydicts=False): """ Convert the Table or RecordBatch to a dict or OrderedDict. + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + Returns ------- dict @@ -2277,14 +2287,19 @@ cdef class _Tabular(_PandasConvertible): entries = [] for i in range(self.num_columns): name = self.field(i).name - column = self[i].to_pylist() + column = self[i].to_pylist(maps_as_pydicts=maps_as_pydicts) entries.append((name, column)) return ordered_dict(entries) - def to_pylist(self): + def to_pylist(self, maps_as_pydicts=False): """ Convert the Table or RecordBatch to a list of rows / dictionaries. + Parameters + ---------- + maps_as_pydicts : bool, default False + Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + Returns ------- list @@ -2300,7 +2315,7 @@ cdef class _Tabular(_PandasConvertible): >>> table.to_pylist() [{'n_legs': 2, 'animals': 'Flamingo'}, {'n_legs': 4, 'animals': 'Horse'}, ... """ - pydict = self.to_pydict() + pydict = self.to_pydict(maps_as_pydicts=maps_as_pydicts) names = self.schema.names pylist = [{column: pydict[column][row] for column in names} for row in range(self.num_rows)] diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 634d9ce2d8d..20bf1f3c858 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -99,7 +99,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): class ExampleUuidScalarType(pa.ExtensionScalar): - def as_py(self): + def as_py(self, maps_as_pydicts=False): return None if self.value is None else UUID(bytes=self.value.as_py()) diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 3f4a53c473e..4e4cfb231b3 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -786,6 +786,19 @@ def test_map(pickle_module): restored = pickle_module.loads(pickle_module.dumps(s)) assert restored.equals(s) + assert s.as_py(maps_as_pydicts=True) == {'a': 1, 'b': 2} + + +def test_map_duplicate_fields(): + ty = pa.map_(pa.string(), pa.int8()) + v = [('a', 1), ('a', 2)] + s = pa.scalar(v, type=ty) + + assert s.as_py() == v + + with pytest.raises(ValueError): + assert s.as_py(maps_as_pydicts=True) + def test_dictionary(pickle_module): indices = pa.array([2, None, 1, 2, 0, None]) @@ -898,3 +911,15 @@ def test_map_scalar_as_py_with_custom_field_name(): pa.field("custom_value", pa.string()), ), ).as_py() == [("foo", "bar")] + + +def test_nested_map_types_with_maps_as_pydicts(): + ty = pa.struct([ + pa.field('x', pa.map_(pa.string(), pa.int8())), + pa.field('y', pa.list_(pa.map_(pa.string(), pa.int8()))), + ]) + + v = {'x': {'a': 1}, 'y': [{'b': 2}, {'c': 3}]} + s = pa.scalar(v, type=ty) + + assert s.as_py(maps_as_pydicts=True) == v diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 4c058ccecda..f709d979e29 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -1888,6 +1888,26 @@ def test_table_unify_dictionaries(): assert table.schema.metadata == {b"key1": b"value1"} +def test_table_maps_as_pydicts(): + arrays = [ + pa.array( + [{'x': 1, 'y': 2}, {'z': 3}], + type=pa.map_(pa.string(), pa.int32()) + ) + ] + table = pa.Table.from_arrays(arrays, names=['a']) + + table_dict = table.to_pydict(maps_as_pydicts=True) + assert 'a' in table_dict + column_list = table_dict['a'] + assert len(column_list) == 2 + assert column_list == [{'x': 1, 'y': 2}, {'z': 3}] + + table_list = table.to_pylist(maps_as_pydicts=True) + assert len(table_list) == 2 + assert table_list == [{'a': {'x': 1, 'y': 2}}, {'a': {'z': 3}}] + + def test_concat_tables(): data = [ list(range(5)), From a3492b63ec3f3a3b3f9cfbac96b2e27f4dbcbad7 Mon Sep 17 00:00:00 2001 From: Jonas Dedden Date: Mon, 10 Feb 2025 19:59:40 +0100 Subject: [PATCH 2/7] Made `maps_as_pydicts` compatible to the pandas API --- python/pyarrow/array.pxi | 34 +- python/pyarrow/scalar.pxi | 630 ++++++++++++++++---- python/pyarrow/table.pxi | 51 +- python/pyarrow/tests/test_extension_type.py | 2 +- python/pyarrow/tests/test_scalars.py | 10 +- python/pyarrow/tests/test_table.py | 4 +- 6 files changed, 597 insertions(+), 134 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 87dbf16a9c8..ef461b64065 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1651,14 +1651,25 @@ cdef class Array(_PandasConvertible): array = array.copy() return array - def to_pylist(self, maps_as_pydicts=False): + def to_pylist(self, maps_as_pydicts=None): """ Convert to a list of native Python objects. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. Returns ------- @@ -2291,7 +2302,7 @@ cdef class MonthDayNanoIntervalArray(Array): Concrete class for Arrow arrays of interval[MonthDayNano] type. """ - def to_pylist(self, maps_as_pydicts=False): + def to_pylist(self, maps_as_pydicts=None): """ Convert to a list of native Python objects. @@ -2299,8 +2310,19 @@ cdef class MonthDayNanoIntervalArray(Array): Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. Returns ------- diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index d3450e390b1..be023873efe 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -16,6 +16,7 @@ # under the License. import collections +import warnings from uuid import UUID @@ -148,14 +149,25 @@ cdef class Scalar(_Weakrefable): def __reduce__(self): return scalar, (self.as_py(), self.type) - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python representation. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ raise NotImplementedError() @@ -177,14 +189,25 @@ cdef class NullScalar(Scalar): def __init__(self): pass - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python None. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ return None @@ -197,14 +220,25 @@ cdef class BooleanScalar(Scalar): Concrete class for boolean scalars. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python bool. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef CBooleanScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -215,14 +249,25 @@ cdef class UInt8Scalar(Scalar): Concrete class for uint8 scalars. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python int. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef CUInt8Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -233,14 +278,25 @@ cdef class Int8Scalar(Scalar): Concrete class for int8 scalars. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python int. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef CInt8Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -251,14 +307,25 @@ cdef class UInt16Scalar(Scalar): Concrete class for uint16 scalars. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python int. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef CUInt16Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -269,14 +336,25 @@ cdef class Int16Scalar(Scalar): Concrete class for int16 scalars. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python int. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef CInt16Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -287,14 +365,25 @@ cdef class UInt32Scalar(Scalar): Concrete class for uint32 scalars. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python int. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef CUInt32Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -305,14 +394,25 @@ cdef class Int32Scalar(Scalar): Concrete class for int32 scalars. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python int. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef CInt32Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -323,14 +423,25 @@ cdef class UInt64Scalar(Scalar): Concrete class for uint64 scalars. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python int. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef CUInt64Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -341,14 +452,25 @@ cdef class Int64Scalar(Scalar): Concrete class for int64 scalars. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python int. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef CInt64Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -359,14 +481,25 @@ cdef class HalfFloatScalar(Scalar): Concrete class for float scalars. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python float. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef CHalfFloatScalar* sp = self.wrapped.get() return PyHalf_FromHalf(sp.value) if sp.is_valid else None @@ -377,14 +510,25 @@ cdef class FloatScalar(Scalar): Concrete class for float scalars. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python float. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef CFloatScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -395,14 +539,25 @@ cdef class DoubleScalar(Scalar): Concrete class for double scalars. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python float. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef CDoubleScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -413,14 +568,25 @@ cdef class Decimal32Scalar(Scalar): Concrete class for decimal32 scalars. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python Decimal. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef: CDecimal32Scalar* sp = self.wrapped.get() @@ -438,14 +604,25 @@ cdef class Decimal64Scalar(Scalar): Concrete class for decimal64 scalars. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python Decimal. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef: CDecimal64Scalar* sp = self.wrapped.get() @@ -463,14 +640,25 @@ cdef class Decimal128Scalar(Scalar): Concrete class for decimal128 scalars. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python Decimal. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef: CDecimal128Scalar* sp = self.wrapped.get() @@ -488,14 +676,25 @@ cdef class Decimal256Scalar(Scalar): Concrete class for decimal256 scalars. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python Decimal. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef: CDecimal256Scalar* sp = self.wrapped.get() @@ -518,14 +717,25 @@ cdef class Date32Scalar(Scalar): cdef CDate32Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python datetime.datetime instance. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef CDate32Scalar* sp = self.wrapped.get() @@ -548,14 +758,25 @@ cdef class Date64Scalar(Scalar): cdef CDate64Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python datetime.datetime instance. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef CDate64Scalar* sp = self.wrapped.get() @@ -607,14 +828,25 @@ cdef class Time32Scalar(Scalar): cdef CTime32Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python datetime.timedelta instance. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef: CTime32Scalar* sp = self.wrapped.get() @@ -636,14 +868,25 @@ cdef class Time64Scalar(Scalar): cdef CTime64Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python datetime.timedelta instance. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef: CTime64Scalar* sp = self.wrapped.get() @@ -665,7 +908,7 @@ cdef class TimestampScalar(Scalar): cdef CTimestampScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Pandas Timestamp instance (if units are nanoseconds and pandas is available), otherwise as a Python @@ -673,8 +916,19 @@ cdef class TimestampScalar(Scalar): Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef: CTimestampScalar* sp = self.wrapped.get() @@ -718,7 +972,7 @@ cdef class DurationScalar(Scalar): cdef CDurationScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Pandas Timedelta instance (if units are nanoseconds and pandas is available), otherwise as a Python @@ -726,8 +980,19 @@ cdef class DurationScalar(Scalar): Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef: CDurationScalar* sp = self.wrapped.get() @@ -770,14 +1035,25 @@ cdef class MonthDayNanoIntervalScalar(Scalar): """ return self.as_py() - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a pyarrow.MonthDayNano. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ cdef: PyObject* val @@ -800,14 +1076,25 @@ cdef class BinaryScalar(Scalar): cdef CBaseBinaryScalar* sp = self.wrapped.get() return pyarrow_wrap_buffer(sp.value) if sp.is_valid else None - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python bytes. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ buffer = self.as_buffer() return None if buffer is None else buffer.to_pybytes() @@ -826,14 +1113,25 @@ cdef class StringScalar(BinaryScalar): Concrete class for string-like (utf8) scalars. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python string. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ buffer = self.as_buffer() return None if buffer is None else str(buffer, 'utf8') @@ -882,14 +1180,25 @@ cdef class ListScalar(Scalar): """ return iter(self.values) - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python list. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ arr = self.values return None if arr is None else arr.to_pylist(maps_as_pydicts=maps_as_pydicts) @@ -967,14 +1276,25 @@ cdef class StructScalar(Scalar, collections.abc.Mapping): else: raise KeyError(key) from exc - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python dict. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ if self.is_valid: try: @@ -1028,15 +1348,32 @@ cdef class MapScalar(ListScalar): for k, v in zip(arr.field(self.type.key_field.name), arr.field(self.type.item_field.name)): yield (k.as_py(), v.as_py()) - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this value as a Python list or dict, depending on 'maps_as_pydicts'. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ + if maps_as_pydicts not in (None, "lossy", "strict"): + raise ValueError( + "Invalid value for 'maps_as_pydicts': " + + "valid values are 'lossy', 'strict' or `None` (default). " + + f"Received '{maps_as_pydicts}'." + ) if not self.is_valid: return None if not maps_as_pydicts: @@ -1044,10 +1381,13 @@ cdef class MapScalar(ListScalar): result_dict = {} for key, value in self: if key in result_dict: - raise ValueError( - "Converting to Python dictionary is not supported when " - "duplicate field names are present" - ) + if maps_as_pydicts == "strict": + raise ValueError( + "Converting to Python dictionary is not supported when " + "duplicate keys are present." + ) + else: + warnings.warn(f"Encountered key '{key}' which was already encountered.") result_dict[key] = value return result_dict @@ -1122,14 +1462,25 @@ cdef class DictionaryScalar(Scalar): cdef CDictionaryScalar* sp = self.wrapped.get() return pyarrow_wrap_array(sp.value.dictionary) - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this encoded value as a Python object. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ return self.value.as_py(maps_as_pydicts=maps_as_pydicts) if self.is_valid else None @@ -1146,14 +1497,25 @@ cdef class RunEndEncodedScalar(Scalar): cdef CRunEndEncodedScalar* sp = self.wrapped.get() return Scalar.wrap(sp.value) - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return underlying value as a Python object. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ return self.value.as_py(maps_as_pydicts=maps_as_pydicts) @@ -1177,14 +1539,25 @@ cdef class UnionScalar(Scalar): dp = self.wrapped.get() return Scalar.wrap(dp.value) if dp.is_valid else None - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return underlying value as a Python object. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ value = self.value return None if value is None else value.as_py(maps_as_pydicts=maps_as_pydicts) @@ -1211,14 +1584,25 @@ cdef class ExtensionScalar(Scalar): cdef CExtensionScalar* sp = self.wrapped.get() return Scalar.wrap(sp.value) if sp.is_valid else None - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this scalar as a Python object. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ return None if self.value is None else self.value.as_py(maps_as_pydicts=maps_as_pydicts) @@ -1277,14 +1661,25 @@ class UuidScalar(ExtensionScalar): Concrete class for Uuid extension scalar. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this scalar as a Python UUID. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ return None if self.value is None else UUID(bytes=self.value.as_py(maps_as_pydicts=maps_as_pydicts)) @@ -1342,14 +1737,25 @@ cdef class Bool8Scalar(ExtensionScalar): Concrete class for bool8 extension scalar. """ - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): """ Return this scalar as a Python object. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. """ py_val = super().as_py(maps_as_pydicts=maps_as_pydicts) return None if py_val is None else py_val != 0 diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 7a940d4dec6..2dc5ec3523c 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1349,14 +1349,25 @@ cdef class ChunkedArray(_PandasConvertible): for i in range(self.num_chunks): yield self.chunk(i) - def to_pylist(self, maps_as_pydicts=False): + def to_pylist(self, maps_as_pydicts=None): """ Convert to a list of native Python objects. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. Examples -------- @@ -2260,14 +2271,25 @@ cdef class _Tabular(_PandasConvertible): else: return _pc().filter(self, mask, null_selection_behavior) - def to_pydict(self, maps_as_pydicts=False): + def to_pydict(self, maps_as_pydicts=None): """ Convert the Table or RecordBatch to a dict or OrderedDict. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. Returns ------- @@ -2291,14 +2313,25 @@ cdef class _Tabular(_PandasConvertible): entries.append((name, column)) return ordered_dict(entries) - def to_pylist(self, maps_as_pydicts=False): + def to_pylist(self, maps_as_pydicts=None): """ Convert the Table or RecordBatch to a list of rows / dictionaries. Parameters ---------- - maps_as_pydicts : bool, default False - Whether to treat elements of type Map as python dictionaries or as a list of (key, value) tuples + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. Returns ------- diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 20bf1f3c858..ed1be42fdc7 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -99,7 +99,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): class ExampleUuidScalarType(pa.ExtensionScalar): - def as_py(self, maps_as_pydicts=False): + def as_py(self, maps_as_pydicts=None): return None if self.value is None else UUID(bytes=self.value.as_py()) diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 4e4cfb231b3..66a6137bea0 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -786,7 +786,7 @@ def test_map(pickle_module): restored = pickle_module.loads(pickle_module.dumps(s)) assert restored.equals(s) - assert s.as_py(maps_as_pydicts=True) == {'a': 1, 'b': 2} + assert s.as_py(maps_as_pydicts="strict") == {'a': 1, 'b': 2} def test_map_duplicate_fields(): @@ -794,10 +794,12 @@ def test_map_duplicate_fields(): v = [('a', 1), ('a', 2)] s = pa.scalar(v, type=ty) - assert s.as_py() == v + assert s.as_py(maps_as_pydicts=None) == v with pytest.raises(ValueError): - assert s.as_py(maps_as_pydicts=True) + assert s.as_py(maps_as_pydicts="strict") + + assert s.as_py(maps_as_pydicts="lossy") == [{'a': 2}] def test_dictionary(pickle_module): @@ -922,4 +924,4 @@ def test_nested_map_types_with_maps_as_pydicts(): v = {'x': {'a': 1}, 'y': [{'b': 2}, {'c': 3}]} s = pa.scalar(v, type=ty) - assert s.as_py(maps_as_pydicts=True) == v + assert s.as_py(maps_as_pydicts="strict") == v diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index f709d979e29..180ae7b4c1a 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -1897,13 +1897,13 @@ def test_table_maps_as_pydicts(): ] table = pa.Table.from_arrays(arrays, names=['a']) - table_dict = table.to_pydict(maps_as_pydicts=True) + table_dict = table.to_pydict(maps_as_pydicts="strict") assert 'a' in table_dict column_list = table_dict['a'] assert len(column_list) == 2 assert column_list == [{'x': 1, 'y': 2}, {'z': 3}] - table_list = table.to_pylist(maps_as_pydicts=True) + table_list = table.to_pylist(maps_as_pydicts="strict") assert len(table_list) == 2 assert table_list == [{'a': {'x': 1, 'y': 2}}, {'a': {'z': 3}}] From 59264a55a462117b7f1ccb7a7b6e4e84bb89e24f Mon Sep 17 00:00:00 2001 From: Jonas Dedden Date: Mon, 10 Feb 2025 20:07:15 +0100 Subject: [PATCH 3/7] Make `maps_as_pydicts` a kwarg-only parameter --- python/pyarrow/array.pxi | 4 +- python/pyarrow/scalar.pxi | 72 ++++++++++----------- python/pyarrow/table.pxi | 6 +- python/pyarrow/tests/test_extension_type.py | 2 +- 4 files changed, 42 insertions(+), 42 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ef461b64065..8aa169f0cab 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1651,7 +1651,7 @@ cdef class Array(_PandasConvertible): array = array.copy() return array - def to_pylist(self, maps_as_pydicts=None): + def to_pylist(self, *, maps_as_pydicts=None): """ Convert to a list of native Python objects. @@ -2302,7 +2302,7 @@ cdef class MonthDayNanoIntervalArray(Array): Concrete class for Arrow arrays of interval[MonthDayNano] type. """ - def to_pylist(self, maps_as_pydicts=None): + def to_pylist(self, *, maps_as_pydicts=None): """ Convert to a list of native Python objects. diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index be023873efe..9f4eca8d229 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -149,7 +149,7 @@ cdef class Scalar(_Weakrefable): def __reduce__(self): return scalar, (self.as_py(), self.type) - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python representation. @@ -189,7 +189,7 @@ cdef class NullScalar(Scalar): def __init__(self): pass - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python None. @@ -220,7 +220,7 @@ cdef class BooleanScalar(Scalar): Concrete class for boolean scalars. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python bool. @@ -249,7 +249,7 @@ cdef class UInt8Scalar(Scalar): Concrete class for uint8 scalars. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python int. @@ -278,7 +278,7 @@ cdef class Int8Scalar(Scalar): Concrete class for int8 scalars. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python int. @@ -307,7 +307,7 @@ cdef class UInt16Scalar(Scalar): Concrete class for uint16 scalars. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python int. @@ -336,7 +336,7 @@ cdef class Int16Scalar(Scalar): Concrete class for int16 scalars. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python int. @@ -365,7 +365,7 @@ cdef class UInt32Scalar(Scalar): Concrete class for uint32 scalars. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python int. @@ -394,7 +394,7 @@ cdef class Int32Scalar(Scalar): Concrete class for int32 scalars. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python int. @@ -423,7 +423,7 @@ cdef class UInt64Scalar(Scalar): Concrete class for uint64 scalars. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python int. @@ -452,7 +452,7 @@ cdef class Int64Scalar(Scalar): Concrete class for int64 scalars. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python int. @@ -481,7 +481,7 @@ cdef class HalfFloatScalar(Scalar): Concrete class for float scalars. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python float. @@ -510,7 +510,7 @@ cdef class FloatScalar(Scalar): Concrete class for float scalars. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python float. @@ -539,7 +539,7 @@ cdef class DoubleScalar(Scalar): Concrete class for double scalars. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python float. @@ -568,7 +568,7 @@ cdef class Decimal32Scalar(Scalar): Concrete class for decimal32 scalars. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python Decimal. @@ -604,7 +604,7 @@ cdef class Decimal64Scalar(Scalar): Concrete class for decimal64 scalars. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python Decimal. @@ -640,7 +640,7 @@ cdef class Decimal128Scalar(Scalar): Concrete class for decimal128 scalars. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python Decimal. @@ -676,7 +676,7 @@ cdef class Decimal256Scalar(Scalar): Concrete class for decimal256 scalars. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python Decimal. @@ -717,7 +717,7 @@ cdef class Date32Scalar(Scalar): cdef CDate32Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python datetime.datetime instance. @@ -758,7 +758,7 @@ cdef class Date64Scalar(Scalar): cdef CDate64Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python datetime.datetime instance. @@ -828,7 +828,7 @@ cdef class Time32Scalar(Scalar): cdef CTime32Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python datetime.timedelta instance. @@ -868,7 +868,7 @@ cdef class Time64Scalar(Scalar): cdef CTime64Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python datetime.timedelta instance. @@ -908,7 +908,7 @@ cdef class TimestampScalar(Scalar): cdef CTimestampScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Pandas Timestamp instance (if units are nanoseconds and pandas is available), otherwise as a Python @@ -972,7 +972,7 @@ cdef class DurationScalar(Scalar): cdef CDurationScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Pandas Timedelta instance (if units are nanoseconds and pandas is available), otherwise as a Python @@ -1035,7 +1035,7 @@ cdef class MonthDayNanoIntervalScalar(Scalar): """ return self.as_py() - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a pyarrow.MonthDayNano. @@ -1076,7 +1076,7 @@ cdef class BinaryScalar(Scalar): cdef CBaseBinaryScalar* sp = self.wrapped.get() return pyarrow_wrap_buffer(sp.value) if sp.is_valid else None - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python bytes. @@ -1113,7 +1113,7 @@ cdef class StringScalar(BinaryScalar): Concrete class for string-like (utf8) scalars. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python string. @@ -1180,7 +1180,7 @@ cdef class ListScalar(Scalar): """ return iter(self.values) - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python list. @@ -1276,7 +1276,7 @@ cdef class StructScalar(Scalar, collections.abc.Mapping): else: raise KeyError(key) from exc - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python dict. @@ -1348,7 +1348,7 @@ cdef class MapScalar(ListScalar): for k, v in zip(arr.field(self.type.key_field.name), arr.field(self.type.item_field.name)): yield (k.as_py(), v.as_py()) - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this value as a Python list or dict, depending on 'maps_as_pydicts'. @@ -1462,7 +1462,7 @@ cdef class DictionaryScalar(Scalar): cdef CDictionaryScalar* sp = self.wrapped.get() return pyarrow_wrap_array(sp.value.dictionary) - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this encoded value as a Python object. @@ -1497,7 +1497,7 @@ cdef class RunEndEncodedScalar(Scalar): cdef CRunEndEncodedScalar* sp = self.wrapped.get() return Scalar.wrap(sp.value) - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return underlying value as a Python object. @@ -1539,7 +1539,7 @@ cdef class UnionScalar(Scalar): dp = self.wrapped.get() return Scalar.wrap(dp.value) if dp.is_valid else None - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return underlying value as a Python object. @@ -1584,7 +1584,7 @@ cdef class ExtensionScalar(Scalar): cdef CExtensionScalar* sp = self.wrapped.get() return Scalar.wrap(sp.value) if sp.is_valid else None - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this scalar as a Python object. @@ -1661,7 +1661,7 @@ class UuidScalar(ExtensionScalar): Concrete class for Uuid extension scalar. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this scalar as a Python UUID. @@ -1737,7 +1737,7 @@ cdef class Bool8Scalar(ExtensionScalar): Concrete class for bool8 extension scalar. """ - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): """ Return this scalar as a Python object. diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 2dc5ec3523c..a5c30f8786f 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1349,7 +1349,7 @@ cdef class ChunkedArray(_PandasConvertible): for i in range(self.num_chunks): yield self.chunk(i) - def to_pylist(self, maps_as_pydicts=None): + def to_pylist(self, *, maps_as_pydicts=None): """ Convert to a list of native Python objects. @@ -2271,7 +2271,7 @@ cdef class _Tabular(_PandasConvertible): else: return _pc().filter(self, mask, null_selection_behavior) - def to_pydict(self, maps_as_pydicts=None): + def to_pydict(self, *, maps_as_pydicts=None): """ Convert the Table or RecordBatch to a dict or OrderedDict. @@ -2313,7 +2313,7 @@ cdef class _Tabular(_PandasConvertible): entries.append((name, column)) return ordered_dict(entries) - def to_pylist(self, maps_as_pydicts=None): + def to_pylist(self, *, maps_as_pydicts=None): """ Convert the Table or RecordBatch to a list of rows / dictionaries. diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index ed1be42fdc7..185b5bb424b 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -99,7 +99,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): class ExampleUuidScalarType(pa.ExtensionScalar): - def as_py(self, maps_as_pydicts=None): + def as_py(self, *, maps_as_pydicts=None): return None if self.value is None else UUID(bytes=self.value.as_py()) From e861daa784571ff94aa4c50fb8bab23e61ecfa8c Mon Sep 17 00:00:00 2001 From: Jonas Dedden Date: Mon, 10 Feb 2025 20:08:23 +0100 Subject: [PATCH 4/7] Correct test typo --- python/pyarrow/tests/test_scalars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 66a6137bea0..f1686299883 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -799,7 +799,7 @@ def test_map_duplicate_fields(): with pytest.raises(ValueError): assert s.as_py(maps_as_pydicts="strict") - assert s.as_py(maps_as_pydicts="lossy") == [{'a': 2}] + assert s.as_py(maps_as_pydicts="lossy") == {'a': 2} def test_dictionary(pickle_module): From 548793bc0bc83bbac5a77d939eef62d764811a37 Mon Sep 17 00:00:00 2001 From: Jonas Dedden Date: Mon, 10 Feb 2025 20:10:59 +0100 Subject: [PATCH 5/7] Formatting fix --- python/pyarrow/scalar.pxi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 9f4eca8d229..1a9532bf5e3 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1387,7 +1387,8 @@ cdef class MapScalar(ListScalar): "duplicate keys are present." ) else: - warnings.warn(f"Encountered key '{key}' which was already encountered.") + warnings.warn( + f"Encountered key '{key}' which was already encountered.") result_dict[key] = value return result_dict From 7ebf2d9f0c564092704f4e57f21aafab052341bc Mon Sep 17 00:00:00 2001 From: Jonas Dedden Date: Wed, 19 Feb 2025 17:59:00 +0100 Subject: [PATCH 6/7] Implement PR remarks --- python/pyarrow/array.pxi | 20 +- python/pyarrow/scalar.pxi | 412 ++++----------------------- python/pyarrow/table.pxi | 24 +- python/pyarrow/tests/test_scalars.py | 5 +- 4 files changed, 74 insertions(+), 387 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 8aa169f0cab..91770a52199 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1664,12 +1664,10 @@ cdef class Array(_PandasConvertible): Arrow Map, as in [(key1, value1), (key2, value2), ...]. If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. Returns ------- @@ -2312,17 +2310,7 @@ cdef class MonthDayNanoIntervalArray(Array): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. Returns ------- diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 1a9532bf5e3..326bf6d1bef 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -162,12 +162,10 @@ cdef class Scalar(_Weakrefable): Arrow Map, as in [(key1, value1), (key2, value2), ...]. If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. """ raise NotImplementedError() @@ -197,17 +195,7 @@ cdef class NullScalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ return None @@ -228,17 +216,7 @@ cdef class BooleanScalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef CBooleanScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -257,17 +235,7 @@ cdef class UInt8Scalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef CUInt8Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -286,17 +254,7 @@ cdef class Int8Scalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef CInt8Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -315,17 +273,7 @@ cdef class UInt16Scalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef CUInt16Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -344,17 +292,7 @@ cdef class Int16Scalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef CInt16Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -373,17 +311,7 @@ cdef class UInt32Scalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef CUInt32Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -402,17 +330,7 @@ cdef class Int32Scalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef CInt32Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -431,17 +349,7 @@ cdef class UInt64Scalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef CUInt64Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -460,17 +368,7 @@ cdef class Int64Scalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef CInt64Scalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -489,17 +387,7 @@ cdef class HalfFloatScalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef CHalfFloatScalar* sp = self.wrapped.get() return PyHalf_FromHalf(sp.value) if sp.is_valid else None @@ -518,17 +406,7 @@ cdef class FloatScalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef CFloatScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -547,17 +425,7 @@ cdef class DoubleScalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef CDoubleScalar* sp = self.wrapped.get() return sp.value if sp.is_valid else None @@ -576,17 +444,7 @@ cdef class Decimal32Scalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef: CDecimal32Scalar* sp = self.wrapped.get() @@ -612,17 +470,7 @@ cdef class Decimal64Scalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef: CDecimal64Scalar* sp = self.wrapped.get() @@ -648,17 +496,7 @@ cdef class Decimal128Scalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef: CDecimal128Scalar* sp = self.wrapped.get() @@ -684,17 +522,7 @@ cdef class Decimal256Scalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef: CDecimal256Scalar* sp = self.wrapped.get() @@ -725,17 +553,7 @@ cdef class Date32Scalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef CDate32Scalar* sp = self.wrapped.get() @@ -766,17 +584,7 @@ cdef class Date64Scalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef CDate64Scalar* sp = self.wrapped.get() @@ -836,17 +644,7 @@ cdef class Time32Scalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef: CTime32Scalar* sp = self.wrapped.get() @@ -876,17 +674,7 @@ cdef class Time64Scalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef: CTime64Scalar* sp = self.wrapped.get() @@ -918,17 +706,7 @@ cdef class TimestampScalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef: CTimestampScalar* sp = self.wrapped.get() @@ -982,17 +760,7 @@ cdef class DurationScalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef: CDurationScalar* sp = self.wrapped.get() @@ -1043,17 +811,7 @@ cdef class MonthDayNanoIntervalScalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ cdef: PyObject* val @@ -1084,17 +842,7 @@ cdef class BinaryScalar(Scalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ buffer = self.as_buffer() return None if buffer is None else buffer.to_pybytes() @@ -1121,17 +869,7 @@ cdef class StringScalar(BinaryScalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ buffer = self.as_buffer() return None if buffer is None else str(buffer, 'utf8') @@ -1193,12 +931,10 @@ cdef class ListScalar(Scalar): Arrow Map, as in [(key1, value1), (key2, value2), ...]. If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. """ arr = self.values return None if arr is None else arr.to_pylist(maps_as_pydicts=maps_as_pydicts) @@ -1289,12 +1025,10 @@ cdef class StructScalar(Scalar, collections.abc.Mapping): Arrow Map, as in [(key1, value1), (key2, value2), ...]. If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. """ if self.is_valid: try: @@ -1361,18 +1095,16 @@ cdef class MapScalar(ListScalar): Arrow Map, as in [(key1, value1), (key2, value2), ...]. If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. """ if maps_as_pydicts not in (None, "lossy", "strict"): raise ValueError( "Invalid value for 'maps_as_pydicts': " + "valid values are 'lossy', 'strict' or `None` (default). " - + f"Received '{maps_as_pydicts}'." + + f"Received {maps_as_pydict!r}." ) if not self.is_valid: return None @@ -1382,9 +1114,9 @@ cdef class MapScalar(ListScalar): for key, value in self: if key in result_dict: if maps_as_pydicts == "strict": - raise ValueError( - "Converting to Python dictionary is not supported when " - "duplicate keys are present." + raise KeyError( + "Converting to Python dictionary is not supported in strict mode " + f"when duplicate keys are present (duplicate key was '{key}')." ) else: warnings.warn( @@ -1476,12 +1208,10 @@ cdef class DictionaryScalar(Scalar): Arrow Map, as in [(key1, value1), (key2, value2), ...]. If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. """ return self.value.as_py(maps_as_pydicts=maps_as_pydicts) if self.is_valid else None @@ -1511,12 +1241,10 @@ cdef class RunEndEncodedScalar(Scalar): Arrow Map, as in [(key1, value1), (key2, value2), ...]. If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. """ return self.value.as_py(maps_as_pydicts=maps_as_pydicts) @@ -1553,12 +1281,10 @@ cdef class UnionScalar(Scalar): Arrow Map, as in [(key1, value1), (key2, value2), ...]. If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. """ value = self.value return None if value is None else value.as_py(maps_as_pydicts=maps_as_pydicts) @@ -1598,12 +1324,10 @@ cdef class ExtensionScalar(Scalar): Arrow Map, as in [(key1, value1), (key2, value2), ...]. If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. """ return None if self.value is None else self.value.as_py(maps_as_pydicts=maps_as_pydicts) @@ -1670,19 +1394,9 @@ class UuidScalar(ExtensionScalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ - return None if self.value is None else UUID(bytes=self.value.as_py(maps_as_pydicts=maps_as_pydicts)) + return None if self.value is None else UUID(bytes=self.value.as_py()) cdef class FixedShapeTensorScalar(ExtensionScalar): @@ -1746,19 +1460,9 @@ cdef class Bool8Scalar(ExtensionScalar): ---------- maps_as_pydicts : str, optional, default `None` Valid values are `None`, 'lossy', or 'strict'. - The default behavior (`None`), is to convert Arrow Map arrays to - Python association lists (list-of-tuples) in the same order as the - Arrow Map, as in [(key1, value1), (key2, value2), ...]. - - If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + This parameter is ignored for non-nested Scalars. """ - py_val = super().as_py(maps_as_pydicts=maps_as_pydicts) + py_val = super().as_py() return None if py_val is None else py_val != 0 cdef dict _scalar_classes = { diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index a5c30f8786f..5a6cd390489 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1362,12 +1362,10 @@ cdef class ChunkedArray(_PandasConvertible): Arrow Map, as in [(key1, value1), (key2, value2), ...]. If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. Examples -------- @@ -2284,12 +2282,10 @@ cdef class _Tabular(_PandasConvertible): Arrow Map, as in [(key1, value1), (key2, value2), ...]. If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. Returns ------- @@ -2326,12 +2322,10 @@ cdef class _Tabular(_PandasConvertible): Arrow Map, as in [(key1, value1), (key2, value2), ...]. If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. - This can change the ordering of (key, value) pairs, and will - deduplicate multiple keys, resulting in a possible loss of data. - If 'lossy', this key deduplication results in a warning printed - when detected. If 'strict', this instead results in an exception - being raised when detected. + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. Returns ------- diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index f1686299883..29db36eddc7 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -796,10 +796,11 @@ def test_map_duplicate_fields(): assert s.as_py(maps_as_pydicts=None) == v - with pytest.raises(ValueError): + with pytest.raises(KeyError): assert s.as_py(maps_as_pydicts="strict") - assert s.as_py(maps_as_pydicts="lossy") == {'a': 2} + with pytest.warns(match="Encountered key 'a' which was already encountered"): + assert s.as_py(maps_as_pydicts="lossy") == {'a': 2} def test_dictionary(pickle_module): From c86b4673c4f33904c66b25c9cb45353e6d8f2518 Mon Sep 17 00:00:00 2001 From: Jonas Dedden Date: Thu, 20 Feb 2025 13:21:36 +0100 Subject: [PATCH 7/7] Fix typo --- python/pyarrow/scalar.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 326bf6d1bef..04442c1f5d2 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1104,7 +1104,7 @@ cdef class MapScalar(ListScalar): raise ValueError( "Invalid value for 'maps_as_pydicts': " + "valid values are 'lossy', 'strict' or `None` (default). " - + f"Received {maps_as_pydict!r}." + + f"Received {maps_as_pydicts!r}." ) if not self.is_valid: return None