From 6029453ce33b55271fd22dba377a1ebc3ea72823 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 14 Aug 2022 17:49:57 +0200 Subject: [PATCH 01/12] ENH: Support mask in unique --- pandas/_libs/hashtable_class_helper.pxi.in | 57 +++++++++++++++++++--- pandas/core/algorithms.py | 16 ++++-- pandas/core/arrays/masked.py | 11 +++++ 3 files changed, 74 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 8a2b9c2f77627..82ec1c35f2bbb 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -521,7 +521,7 @@ cdef class {{name}}HashTable(HashTable): def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None, bint ignore_na=False, - object mask=None, bint return_inverse=False): + object mask=None, bint return_inverse=False, bint use_result_mask=False): """ Calculate unique values and labels (no sorting!) @@ -551,6 +551,9 @@ cdef class {{name}}HashTable(HashTable): return_inverse : bool, default False Whether the mapping of the original array values to their location in the vector of uniques should be returned. + use_result_mask: bool, default False + Whether to create a result mask for the unique values. Not supported + with return_inverse=True.
Returns ------- @@ -566,7 +569,9 @@ cdef class {{name}}HashTable(HashTable): {{c_type}} val, na_value2 khiter_t k {{name}}VectorData *ud - bint use_na_value, use_mask + UInt8Vector result_mask + UInt8VectorData *rmd + bint use_na_value, use_mask, seen_na = False uint8_t[:] mask_values if return_inverse: @@ -574,6 +579,9 @@ cdef class {{name}}HashTable(HashTable): ud = uniques.data use_na_value = na_value is not None use_mask = mask is not None + use_result_mask = use_result_mask and use_mask + result_mask = UInt8Vector() + rmd = result_mask.data if use_mask: mask_values = mask.view("uint8") @@ -605,6 +613,24 @@ cdef class {{name}}HashTable(HashTable): # and replace the corresponding label with na_sentinel labels[i] = na_sentinel continue + elif not ignore_na and use_result_mask and not seen_na: + if mask_values[i]: + seen_na = True + if needs_resize(ud): + with gil: + if uniques.external_view_exists: + raise ValueError("external reference to " + "uniques held, but " + "Vector.resize() needed") + uniques.resize() + if uniques.external_view_exists: + raise ValueError("external reference to " + "result_mask held, but " + "Vector.resize() needed") + result_mask.resize() + append_data_{{dtype}}(ud, val) + append_data_uint8(rmd, 1) + continue k = kh_get_{{dtype}}(self.table, val) @@ -619,7 +645,16 @@ cdef class {{name}}HashTable(HashTable): "uniques held, but " "Vector.resize() needed") uniques.resize() + if use_result_mask: + if uniques.external_view_exists: + raise ValueError("external reference to " + "result_mask held, but " + "Vector.resize() needed") + result_mask.resize() append_data_{{dtype}}(ud, val) + if use_result_mask: + append_data_uint8(rmd, 0) + if return_inverse: self.table.vals[k] = count labels[i] = count @@ -632,9 +667,11 @@ cdef class {{name}}HashTable(HashTable): if return_inverse: return uniques.to_array(), labels.base # .base -> underlying ndarray + if use_result_mask: + return uniques.to_array(), result_mask.to_array() return uniques.to_array() - 
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): + def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None): """ Calculate unique values and labels (no sorting!) @@ -652,10 +689,14 @@ cdef class {{name}}HashTable(HashTable): Unique values of input, not sorted labels : ndarray[intp_t] (if return_inverse) The labels from values to uniques + mask : ndarray[bool], optional + If not None, the mask is used as indicator for missing values + (True = missing, False = valid) instead of `na_value` or """ uniques = {{name}}Vector() + use_result_mask = True if mask is not None else False return self._unique(values, uniques, ignore_na=False, - return_inverse=return_inverse) + return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask) def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, object na_value=None, object mask=None): @@ -1013,7 +1054,7 @@ cdef class StringHashTable(HashTable): return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() - def unique(self, ndarray[object] values, bint return_inverse=False): + def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None): """ Calculate unique values and labels (no sorting!) @@ -1024,6 +1065,8 @@ cdef class StringHashTable(HashTable): return_inverse : bool, default False Whether the mapping of the original array values to their location in the vector of uniques should be returned. + mask : ndarray[bool], optional + Not yet implemented for StringHashTable Returns ------- @@ -1266,7 +1309,7 @@ cdef class PyObjectHashTable(HashTable): return uniques.to_array(), labels.base # .base -> underlying ndarray return uniques.to_array() - def unique(self, ndarray[object] values, bint return_inverse=False): + def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None): """ Calculate unique values and labels (no sorting!) 
@@ -1277,6 +1320,8 @@ cdef class PyObjectHashTable(HashTable): return_inverse : bool, default False Whether the mapping of the original array values to their location in the vector of uniques should be returned. + mask : ndarray[bool], optional + Not yet implemented for PyObjectHashTable Returns ------- diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index a4736c2a141a5..f53f9944967f2 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -310,7 +310,7 @@ def _check_object_for_strings(values: np.ndarray) -> str: # --------------- # -def unique(values): +def unique(values, mask=None): """ Return unique values based on a hash table. @@ -322,6 +322,8 @@ def unique(values): Parameters ---------- values : 1d array-like + mask: 1d boolean array + Mask of the values. Returns ------- @@ -414,9 +416,15 @@ def unique(values): htable, values = _get_hashtable_algo(values) table = htable(len(values)) - uniques = table.unique(values) - uniques = _reconstruct_data(uniques, original.dtype, original) - return uniques + if mask is None: + uniques = table.unique(values) + uniques = _reconstruct_data(uniques, original.dtype, original) + return uniques + + else: + uniques, mask = table.unique(values, mask=mask) + uniques = _reconstruct_data(uniques, original.dtype, original) + return uniques, mask.astype("bool") unique1d = unique diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 128c7e44f5075..7b0d92bfc74a0 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -851,6 +851,17 @@ def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT: mask = mask.copy() return type(self)(data, mask, copy=False) + def unique(self: BaseMaskedArrayT) -> BaseMaskedArrayT: + """ + Compute the BaseMaskedArray of unique values. 
+ + Returns + ------- + uniques : BaseMaskedArray + """ + uniques, mask = algos.unique(self._data, self._mask) + return type(self)(uniques, mask, copy=False) + @doc(ExtensionArray.searchsorted) def searchsorted( self, From 539c1ba457cc06a75a0fe68b851bc1e4fe70cf7e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 16 Aug 2022 12:32:20 +0200 Subject: [PATCH 02/12] Fix bug and add test --- pandas/_libs/hashtable_class_helper.pxi.in | 5 ++++- pandas/tests/test_algos.py | 7 +++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 82ec1c35f2bbb..85eb709c8fb6e 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -613,8 +613,11 @@ cdef class {{name}}HashTable(HashTable): # and replace the corresponding label with na_sentinel labels[i] = na_sentinel continue - elif not ignore_na and use_result_mask and not seen_na: + elif not ignore_na and use_result_mask: if mask_values[i]: + if seen_na: + continue + seen_na = True if needs_resize(ud): with gil: diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index def63c552e059..e602340f5fa43 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -834,6 +834,13 @@ def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixtur assert a[0] is unique_nulls_fixture assert a[1] is unique_nulls_fixture2 + def test_unique_masked(self, any_numeric_ea_dtype): + # GH# + ser = Series([1, pd.NA, 2] * 3, dtype=any_numeric_ea_dtype) + result = pd.unique(ser) + expected = pd.array([1, pd.NA, 2], dtype=any_numeric_ea_dtype) + tm.assert_extension_array_equal(result, expected) + class TestIsin: def test_invalid(self): From b87ddbeb73845006f392069a68bb9512df57ce17 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 16 Aug 2022 12:35:46 +0200 Subject: [PATCH 03/12] Add not implemented errors --- 
pandas/_libs/hashtable_class_helper.pxi.in | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 85eb709c8fb6e..6730b38ca442d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -579,7 +579,12 @@ cdef class {{name}}HashTable(HashTable): ud = uniques.data use_na_value = na_value is not None use_mask = mask is not None - use_result_mask = use_result_mask and use_mask + if not use_mask and use_result_mask: + raise NotImplementedError # pragma: no cover + + if use_result_mask and return_inverse: + raise NotImplementedError # pragma: no cover + result_mask = UInt8Vector() rmd = result_mask.data @@ -626,7 +631,7 @@ cdef class {{name}}HashTable(HashTable): "uniques held, but " "Vector.resize() needed") uniques.resize() - if uniques.external_view_exists: + if result_mask.external_view_exists: raise ValueError("external reference to " "result_mask held, but " "Vector.resize() needed") @@ -649,7 +654,7 @@ cdef class {{name}}HashTable(HashTable): "Vector.resize() needed") uniques.resize() if use_result_mask: - if uniques.external_view_exists: + if result_mask.external_view_exists: raise ValueError("external reference to " "result_mask held, but " "Vector.resize() needed") From 38bf9974e3545aeab10676cd0f29dcd53fa2ae77 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 16 Aug 2022 12:42:10 +0200 Subject: [PATCH 04/12] Add asvs --- asv_bench/benchmarks/hash_functions.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/asv_bench/benchmarks/hash_functions.py b/asv_bench/benchmarks/hash_functions.py index d9a291dc27125..da752b902b4fd 100644 --- a/asv_bench/benchmarks/hash_functions.py +++ b/asv_bench/benchmarks/hash_functions.py @@ -39,6 +39,21 @@ def time_unique(self, exponent): pd.unique(self.a2) +class Unique: + params = ["Int64", "Float64"] + param_names = ["dtype"] + + def 
setup(self, dtype): + self.ser = pd.Series(([1, pd.NA, 2] + list(range(100_000))) * 3, dtype=dtype) + self.ser_unique = pd.Series(list(range(300_000)) + [pd.NA], dtype=dtype) + + def time_unique_with_duplicates(self, exponent): + pd.unique(self.ser) + + def time_unique(self, exponent): + pd.unique(self.ser_unique) + + class NumericSeriesIndexing: params = [ From 676d5c66113240356e44bcf30f7a3c93cd86879d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 16 Aug 2022 12:55:41 +0200 Subject: [PATCH 05/12] Add gh ref --- pandas/tests/test_algos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index e602340f5fa43..fd617dcb4565e 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -835,7 +835,7 @@ def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixtur assert a[1] is unique_nulls_fixture2 def test_unique_masked(self, any_numeric_ea_dtype): - # GH# + # GH#48019 ser = Series([1, pd.NA, 2] * 3, dtype=any_numeric_ea_dtype) result = pd.unique(ser) expected = pd.array([1, pd.NA, 2], dtype=any_numeric_ea_dtype) From 08ad510698f434cc92ae898c52a9bd53835e7ce7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 16 Aug 2022 13:55:20 +0200 Subject: [PATCH 06/12] Fix docstring --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f53f9944967f2..3c44f87c63803 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -322,7 +322,7 @@ def unique(values, mask=None): Parameters ---------- values : 1d array-like - mask: 1d boolean array + mask : 1d boolean array Mask of the values. 
Returns From e829db7a4d367df81f6778049cac9c14b9e1e7ea Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 16 Aug 2022 17:32:24 +0200 Subject: [PATCH 07/12] Fix mask annotation --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 3c44f87c63803..133229163e8d6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -322,7 +322,7 @@ def unique(values, mask=None): Parameters ---------- values : 1d array-like - mask : 1d boolean array + mask : 1d bool array Mask of the values. Returns From dce9ef804a6c6154cc0cd3c93e0fb27c86f6bea3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 16 Aug 2022 22:10:56 +0200 Subject: [PATCH 08/12] Refactor unique --- pandas/core/algorithms.py | 7 ++++++- pandas/core/arrays/masked.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 133229163e8d6..daa5ebd7fc28e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -310,7 +310,7 @@ def _check_object_for_strings(values: np.ndarray) -> str: # --------------- # -def unique(values, mask=None): +def unique(values): """ Return unique values based on a hash table. 
@@ -406,6 +406,11 @@ def unique(values, mask=None): >>> pd.unique([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]) array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object) """ + return _unique(values) + + +def _unique(values, mask: npt.NDArray[np.bool_] | None = None): + """See algorithms.unique for docs""" values = _ensure_arraylike(values) if is_extension_array_dtype(values.dtype): diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 7b0d92bfc74a0..c558f70caa3e8 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -859,7 +859,7 @@ def unique(self: BaseMaskedArrayT) -> BaseMaskedArrayT: ------- uniques : BaseMaskedArray """ - uniques, mask = algos.unique(self._data, self._mask) + uniques, mask = algos._unique(self._data, self._mask) return type(self)(uniques, mask, copy=False) @doc(ExtensionArray.searchsorted) From 59d50af9e88c59dda543d9d42f8a2de9edc95a5f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 16 Aug 2022 23:13:52 +0200 Subject: [PATCH 09/12] Rename _unique --- pandas/core/algorithms.py | 8 +++----- pandas/core/arrays/masked.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index daa5ebd7fc28e..f55d1d489377d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -322,8 +322,6 @@ def unique(values): Parameters ---------- values : 1d array-like - mask : 1d bool array - Mask of the values. Returns ------- @@ -406,11 +404,11 @@ def unique(values): >>> pd.unique([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]) array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object) """ - return _unique(values) + return unique_with_mask(values) -def _unique(values, mask: npt.NDArray[np.bool_] | None = None): - """See algorithms.unique for docs""" +def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None): + """See algorithms.unique for docs.
Takes a mask for masked arrays.""" values = _ensure_arraylike(values) if is_extension_array_dtype(values.dtype): diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index c558f70caa3e8..15946ab9ce80d 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -859,7 +859,7 @@ def unique(self: BaseMaskedArrayT) -> BaseMaskedArrayT: ------- uniques : BaseMaskedArray """ - uniques, mask = algos._unique(self._data, self._mask) + uniques, mask = algos.unique_with_mask(self._data, self._mask) return type(self)(uniques, mask, copy=False) @doc(ExtensionArray.searchsorted) From f6d3137e6fffe2813337ff38684ff0e3543bd7ee Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 17 Aug 2022 00:08:11 +0200 Subject: [PATCH 10/12] Fix typing issue --- pandas/core/algorithms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f55d1d489377d..1a5cda357296e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -427,6 +427,7 @@ def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None): else: uniques, mask = table.unique(values, mask=mask) uniques = _reconstruct_data(uniques, original.dtype, original) + assert mask is not None # for mypy return uniques, mask.astype("bool") From 362f824637aa00c4378b73146357b07c6870db0f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 17 Aug 2022 09:35:19 +0200 Subject: [PATCH 11/12] Fix docstring --- pandas/_libs/hashtable_class_helper.pxi.in | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 6730b38ca442d..e645c1cd76fed 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -690,6 +690,9 @@ cdef class {{name}}HashTable(HashTable): return_inverse : bool, default False Whether the mapping of the original array values to their location in the 
vector of uniques should be returned. + mask : ndarray[bool], optional + If not None, the mask is used as indicator for missing values + (True = missing, False = valid) instead of `na_value`. Returns ------- @@ -697,9 +700,8 @@ cdef class {{name}}HashTable(HashTable): Unique values of input, not sorted labels : ndarray[intp_t] (if return_inverse) The labels from values to uniques - mask : ndarray[bool], optional - If not None, the mask is used as indicator for missing values - (True = missing, False = valid) instead of `na_value` or + result_mask : ndarray[bool], if mask is given as input + The mask for the result values. """ uniques = {{name}}Vector() use_result_mask = True if mask is not None else False From 101018c98ac0ba5efd2ae68d2c6056c3dee3ad46 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 17 Aug 2022 09:54:16 +0200 Subject: [PATCH 12/12] Fix _unique docstring too --- pandas/_libs/hashtable_class_helper.pxi.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index e645c1cd76fed..3c9a1f86ad2a1 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -561,6 +561,8 @@ cdef class {{name}}HashTable(HashTable): Unique values of input, not sorted labels : ndarray[intp_t] (if return_inverse=True) The labels from values to uniques + result_mask : ndarray[bool], if use_result_mask is true + The mask for the result values. """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values)