From 06126cf8c936a24d4e692fbf313127b9ad801649 Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 8 Jan 2026 21:58:35 +1100 Subject: [PATCH 1/6] Added utility to identify duplicates. --- src/biocutils/duplicated.py | 59 +++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 src/biocutils/duplicated.py diff --git a/src/biocutils/duplicated.py b/src/biocutils/duplicated.py new file mode 100644 index 0000000..fbc8f70 --- /dev/null +++ b/src/biocutils/duplicated.py @@ -0,0 +1,59 @@ +import numpy + + +@singledispatch +def duplicated(x: Any, incomparables: set = set(), from_last: bool = False) -> numpy.ndarray: + available = set() + output = numpy.ndarray(len(x), dtype=numpy.bool_) + + def process(i, y): + if y in incomparables: + output[i] = False + elif y in available: + output[i] = True + else: + available.add(y) + output[i] = False + + if not from_last: + for i, y in enumerate(x): + process(i, y) + else: + for i in range(len(x) - 1, -1, -1): + process(i, x[i]) + + return output + + +@duplicated.register +def _duplicated_Factor(x: Factor, incomparables: set = set(), from_last: bool = False) -> numpy.ndarray: + present = [] + for lev in x.get_levels(): + if lev in incomparables: + present.append(None) + else: + present.append(False) + + def process(i, y): + tmp = present[i] + if tmp is None: + output[i] = False + elif tmp: + output[i] = True + else: + present[i] = True + output[i] = False + + if not from_last: + for i, y in enumerate(x): + process(i, y) + else: + for i in range(len(x) - 1, -1, -1): + process(i, x[i]) + + return output + + +def unique(x: Any) -> Any: + return subset(x, numpy.where(duplicated(x))[0]) + From e31002cdd900646b90c26d833f7a118f9fc55a17 Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 8 Jan 2026 23:55:09 +1100 Subject: [PATCH 2/6] Added more stuff. --- src/biocutils/duplicated.py | 53 ++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/src/biocutils/duplicated.py b/src/biocutils/duplicated.py index fbc8f70..6056a16 100644 --- a/src/biocutils/duplicated.py +++ b/src/biocutils/duplicated.py @@ -1,8 +1,33 @@ +from typing import Any, Union, Sequence + import numpy +from .Factor import Factor + @singledispatch -def duplicated(x: Any, incomparables: set = set(), from_last: bool = False) -> numpy.ndarray: +def duplicated(x: Any, incomparables: Union[set, Sequence] = set(), from_last: bool = False) -> numpy.ndarray: + """ + Find duplicated elements of ``x``. + + Args: + x: + Object to be searched for duplicates. + This is usually a sequence that can be iterated over. + + incomparables: + Values of ``x`` that cannot be compared. + Any value of ``x`` in ``incomparables`` will never be a duplicate. + Any object that has an ``__in__`` method can be used here. + + from_last: + Whether to report the last occurrence as a non-duplicate. + + Returns: + NumPy array of length equal to that of ``x``, + containing truthy values for only the first occurrence of each value of ``x``. + If ``from_last = True``, truthy values are only reported for the last occurrence of each value of ``x``. + """ available = set() output = numpy.ndarray(len(x), dtype=numpy.bool_) @@ -26,7 +51,7 @@ def process(i, y): @duplicated.register -def _duplicated_Factor(x: Factor, incomparables: set = set(), from_last: bool = False) -> numpy.ndarray: +def _duplicated_Factor(x: Factor, incomparables: Union[set, Sequence] = set(), from_last: bool = False) -> numpy.ndarray: present = [] for lev in x.get_levels(): if lev in incomparables: @@ -54,6 +79,26 @@ def process(i, y): return output -def unique(x: Any) -> Any: - return subset(x, numpy.where(duplicated(x))[0]) +def unique(x: Any, incomparables: Union[set, Sequence] = set(), from_last: bool = False) -> Any: + """ + Get all unique values of ``x``. + + Args: + x: + Object in which to find unique entries. + This is usually a sequence that can be iterated over. + + incomparables: + Values of ``x`` that cannot be compared. + Any value of ``x`` in ``incomparables`` will never be a duplicate. + Any object that has an ``__in__`` method can be used here. + from_last: + Whether to retain the last occurrence of each value in ``x``. + By default, the first occurrence is retained. + + Returns: + An object containing unique values of ``x``. + This is usually of the same class as ``x``. + """ + return subset(x, numpy.where(duplicated(x))[0]) From 45fc695e7bfc2a76955bf66ba144fd4a7cea7dde Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 9 Jan 2026 00:22:40 +1100 Subject: [PATCH 3/6] Added examples and stuff. --- src/biocutils/__init__.py | 2 ++ src/biocutils/duplicated.py | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/biocutils/__init__.py b/src/biocutils/__init__.py index 81ce3ae..51c42a1 100644 --- a/src/biocutils/__init__.py +++ b/src/biocutils/__init__.py @@ -63,3 +63,5 @@ from .biocobject import BiocObject from .table import table + +from .duplicated import duplicated, unique diff --git a/src/biocutils/duplicated.py b/src/biocutils/duplicated.py index 6056a16..6b2f6be 100644 --- a/src/biocutils/duplicated.py +++ b/src/biocutils/duplicated.py @@ -1,8 +1,10 @@ from typing import Any, Union, Sequence +from functools import singledispatch import numpy from .Factor import Factor +from .subset import subset @singledispatch @@ -27,7 +29,15 @@ def duplicated(x: Any, incomparables: Union[set, Sequence] = set(), from_last: b NumPy array of length equal to that of ``x``, containing truthy values for only the first occurrence of each value of ``x``. If ``from_last = True``, truthy values are only reported for the last occurrence of each value of ``x``. + + Examples: + >>> import biocutils + >>> biocutils.duplicated([1,2,1,2,3,2]) + >>> biocutils.duplicated([1,2,1,2,3,2], from_last=True) + >>> biocutils.duplicated([1,2,None,None,3,2]) + >>> biocutils.duplicated([1,2,None,None,3,2], incomparables=set([None])) """ + available = set() output = numpy.ndarray(len(x), dtype=numpy.bool_) @@ -100,5 +110,11 @@ def unique(x: Any, incomparables: Union[set, Sequence] = set(), from_last: bool Returns: An object containing unique values of ``x``. This is usually of the same class as ``x``. + + Examples: + >>> import biocutils + >>> biocutils.unique([1,2,1,2,3,2]) + >>> biocutils.unique([1,2,None,None,3,2]) + >>> biocutils.unique([1,2,None,None,3,2], incomparables=set([None])) """ - return subset(x, numpy.where(duplicated(x))[0]) + return subset(x, numpy.where(numpy.logical_not(duplicated(x, incomparables=incomparables, from_last=from_last)))[0]) From ad0818f6a6e811d667b988f34354ed1f4d60b93d Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 9 Jan 2026 00:56:31 +1100 Subject: [PATCH 4/6] Added tests, fixed the bugs. --- src/biocutils/duplicated.py | 18 +++++++++++++----- tests/test_duplicated.py | 22 ++++++++++++++++++++++ 2 files changed, 35 insertions(+), 5 deletions(-) create mode 100644 tests/test_duplicated.py diff --git a/src/biocutils/duplicated.py b/src/biocutils/duplicated.py index 6b2f6be..9d859b1 100644 --- a/src/biocutils/duplicated.py +++ b/src/biocutils/duplicated.py @@ -68,23 +68,31 @@ def _duplicated_Factor(x: Factor, incomparables: Union[set, Sequence] = set(), f present.append(None) else: present.append(False) - + + # Handling codes of -1, i.e., None. + if None in incomparables: + present.append(None) + else: + present.append(False) + + output = numpy.ndarray(len(x), dtype=numpy.bool_) def process(i, y): - tmp = present[i] + tmp = present[y] if tmp is None: output[i] = False elif tmp: output[i] = True else: - present[i] = True + present[y] = True output[i] = False if not from_last: - for i, y in enumerate(x): + for i, y in enumerate(x.get_codes()): process(i, y) else: + codes = x.get_codes() for i in range(len(x) - 1, -1, -1): - process(i, x[i]) + process(i, codes[i]) return output diff --git a/tests/test_duplicated.py b/tests/test_duplicated.py new file mode 100644 index 0000000..b32b9b3 --- /dev/null +++ b/tests/test_duplicated.py @@ -0,0 +1,22 @@ +import biocutils + + +def test_duplicated_basic(): + assert list(biocutils.duplicated([1,2,1,2,3,2])) == [False, False, True, True, False, True] + assert list(biocutils.duplicated([1,2,1,2,3,2], from_last=True)) == [True, True, False, True, False, False] + assert list(biocutils.duplicated([1,2,None,None,3,2,3])) == [False, False, False, True, False, True, True] + assert list(biocutils.duplicated([1,2,None,None,3,2,3], incomparables=set([None]))) == [False, False, False, False, False, True, True] + + +def test_duplicated_Factor(): + assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,1,2,3,2]))) == [False, False, True, True, False, True] + assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,1,2,3,2]), from_last=True)) == [True, True, False, True, False, False] + assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,None,None,3,2,3]))) == [False, False, False, True, False, True, True] + assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,None,None,3,2,3]), incomparables=set([None]))) == [False, False, False, False, False, True, True] + + +def test_unique(): + assert biocutils.unique([1,2,1,2,3,2]) == [1,2,3] + assert biocutils.unique([1,2,1,2,3,2], from_last=True) == [1,3,2] + assert biocutils.unique([1,2,None,None,3,2]) == [1,2,None,3] + assert biocutils.unique([1,2,None,None,3,2], incomparables=set([None])) == [1,2,None,None,3] From a1ceefab0213c5567c722b70b3792119f75438ba Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 8 Jan 2026 13:57:35 +0000 Subject: [PATCH 5/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/biocutils/duplicated.py | 99 ++++++++++++++++++++++++++++++++----- tests/test_duplicated.py | 8 +-- 2 files changed, 90 insertions(+), 17 deletions(-) diff --git a/src/biocutils/duplicated.py b/src/biocutils/duplicated.py index 9d859b1..812e987 100644 --- a/src/biocutils/duplicated.py +++ b/src/biocutils/duplicated.py @@ -15,11 +15,11 @@ def duplicated(x: Any, incomparables: Union[set, Sequence] = set(), from_last: b Args: x: Object to be searched for duplicates. - This is usually a sequence that can be iterated over. + This is usually a sequence that can be iterated over. incomparables: Values of ``x`` that cannot be compared. - Any value of ``x`` in ``incomparables`` will never be a duplicate. + Any value of ``x`` in ``incomparables`` will never be a duplicate. Any object that has an ``__in__`` method can be used here. from_last: @@ -32,10 +32,50 @@ def duplicated(x: Any, incomparables: Union[set, Sequence] = set(), from_last: b Examples: >>> import biocutils - >>> biocutils.duplicated([1,2,1,2,3,2]) - >>> biocutils.duplicated([1,2,1,2,3,2], from_last=True) - >>> biocutils.duplicated([1,2,None,None,3,2]) - >>> biocutils.duplicated([1,2,None,None,3,2], incomparables=set([None])) + >>> biocutils.duplicated( + ... [ + ... 1, + ... 2, + ... 1, + ... 2, + ... 3, + ... 2, + ... ] + ... ) + >>> biocutils.duplicated( + ... [ + ... 1, + ... 2, + ... 1, + ... 2, + ... 3, + ... 2, + ... ], + ... from_last=True, + ... ) + >>> biocutils.duplicated( + ... [ + ... 1, + ... 2, + ... None, + ... None, + ... 3, + ... 2, + ... ] + ... ) + >>> biocutils.duplicated( + ... [ + ... 1, + ... 2, + ... None, + ... None, + ... 3, + ... 2, + ... ], + ... incomparables=set( + ... [None] + ... ), + ... ) """ available = set() @@ -61,7 +101,9 @@ def process(i, y): @duplicated.register -def _duplicated_Factor(x: Factor, incomparables: Union[set, Sequence] = set(), from_last: bool = False) -> numpy.ndarray: +def _duplicated_Factor( + x: Factor, incomparables: Union[set, Sequence] = set(), from_last: bool = False +) -> numpy.ndarray: present = [] for lev in x.get_levels(): if lev in incomparables: @@ -76,6 +118,7 @@ def _duplicated_Factor(x: Factor, incomparables: Union[set, Sequence] = set(), f present.append(False) output = numpy.ndarray(len(x), dtype=numpy.bool_) + def process(i, y): tmp = present[y] if tmp is None: @@ -104,15 +147,15 @@ def unique(x: Any, incomparables: Union[set, Sequence] = set(), from_last: bool Args: x: Object in which to find unique entries. - This is usually a sequence that can be iterated over. + This is usually a sequence that can be iterated over. incomparables: Values of ``x`` that cannot be compared. - Any value of ``x`` in ``incomparables`` will never be a duplicate. + Any value of ``x`` in ``incomparables`` will never be a duplicate. Any object that has an ``__in__`` method can be used here. from_last: - Whether to retain the last occurrence of each value in ``x``. + Whether to retain the last occurrence of each value in ``x``. By default, the first occurrence is retained. Returns: @@ -121,8 +164,38 @@ def unique(x: Any, incomparables: Union[set, Sequence] = set(), from_last: bool Examples: >>> import biocutils - >>> biocutils.unique([1,2,1,2,3,2]) - >>> biocutils.unique([1,2,None,None,3,2]) - >>> biocutils.unique([1,2,None,None,3,2], incomparables=set([None])) + >>> biocutils.unique( + ... [ + ... 1, + ... 2, + ... 1, + ... 2, + ... 3, + ... 2, + ... ] + ... ) + >>> biocutils.unique( + ... [ + ... 1, + ... 2, + ... None, + ... None, + ... 3, + ... 2, + ... ] + ... ) + >>> biocutils.unique( + ... [ + ... 1, + ... 2, + ... None, + ... None, + ... 3, + ... 2, + ... ], + ... incomparables=set( + ... [None] + ... ), + ... ) """ return subset(x, numpy.where(numpy.logical_not(duplicated(x, incomparables=incomparables, from_last=from_last)))[0]) diff --git a/tests/test_duplicated.py b/tests/test_duplicated.py index b32b9b3..3e3a5c3 100644 --- a/tests/test_duplicated.py +++ b/tests/test_duplicated.py @@ -2,15 +2,15 @@ def test_duplicated_basic(): - assert list(biocutils.duplicated([1,2,1,2,3,2])) == [False, False, True, True, False, True] - assert list(biocutils.duplicated([1,2,1,2,3,2], from_last=True)) == [True, True, False, True, False, False] + assert list(biocutils.duplicated([1,2,1,2,3,2])) == [False, False, True, True, False, True] + assert list(biocutils.duplicated([1,2,1,2,3,2], from_last=True)) == [True, True, False, True, False, False] assert list(biocutils.duplicated([1,2,None,None,3,2,3])) == [False, False, False, True, False, True, True] assert list(biocutils.duplicated([1,2,None,None,3,2,3], incomparables=set([None]))) == [False, False, False, False, False, True, True] def test_duplicated_Factor(): - assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,1,2,3,2]))) == [False, False, True, True, False, True] - assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,1,2,3,2]), from_last=True)) == [True, True, False, True, False, False] + assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,1,2,3,2]))) == [False, False, True, True, False, True] + assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,1,2,3,2]), from_last=True)) == [True, True, False, True, False, False] assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,None,None,3,2,3]))) == [False, False, False, True, False, True, True] assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,None,None,3,2,3]), incomparables=set([None]))) == [False, False, False, False, False, True, True] From 217e03f1ba7f5204e325243c848a108d8ec3b21f Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 9 Jan 2026 01:00:32 +1100 Subject: [PATCH 6/6] Add coverage when incomparable Factor level is a string. --- tests/test_duplicated.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_duplicated.py b/tests/test_duplicated.py index 3e3a5c3..3edeb6c 100644 --- a/tests/test_duplicated.py +++ b/tests/test_duplicated.py @@ -13,6 +13,7 @@ def test_duplicated_Factor(): assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,1,2,3,2]), from_last=True)) == [True, True, False, True, False, False] assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,None,None,3,2,3]))) == [False, False, False, True, False, True, True] assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,None,None,3,2,3]), incomparables=set([None]))) == [False, False, False, False, False, True, True] + assert list(biocutils.duplicated(biocutils.Factor.from_sequence([1,2,None,None,3,2,3]), incomparables=set(["2"]))) == [False, False, False, True, False, False, True] def test_unique():