From b50f71123abd5492702c32195de325ceced27f8c Mon Sep 17 00:00:00 2001 From: LTLA Date: Wed, 7 Jan 2026 14:58:58 +1100 Subject: [PATCH 1/6] Implemented R's split() utility to split a sequence by a grouping factor. --- src/biocutils/__init__.py | 1 + src/biocutils/split.py | 70 +++++++++++++++++++++++++++++++++++++++ tests/test_split.py | 26 +++++++++++++++ 3 files changed, 97 insertions(+) create mode 100644 src/biocutils/split.py create mode 100644 tests/test_split.py diff --git a/src/biocutils/__init__.py b/src/biocutils/__init__.py index 81ce3ae..8c7ddd4 100644 --- a/src/biocutils/__init__.py +++ b/src/biocutils/__init__.py @@ -63,3 +63,4 @@ from .biocobject import BiocObject from .table import table +from .split import split diff --git a/src/biocutils/split.py b/src/biocutils/split.py new file mode 100644 index 0000000..22108bf --- /dev/null +++ b/src/biocutils/split.py @@ -0,0 +1,70 @@ +from typing import Any, Sequence, Union + +from .NamedList import NamedList +from .Factor import Factor +from .match import match +from .subset import subset +from .get_height import get_height + + +def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) -> Union[dict, NamedList]: + """ + Split a sequence ``x`` into groups defined by a categorical factor ``f``. + + Args: + x: + Values to be divided into groups. + Any object that supports :py:func:`~biocutils.subset.subset` can be used here. + + f: + A sequence of categorical variables defining the groupings. + This should have length equal to the "height" of ``x`` (see :py:func:`~biocutils.get_height.get_height`). + + The order of groups is defined by sorting all unique variables in ``f``. + If a :py:class:`~biocutils.Factor.Factor` is provided, the order of groups is defined by the existing levels. + + drop: + Whether to drop unused levels, if ``f`` is a ``Factor``. + + as_NamedList: + Whether to return the results as a :py:class:`~biocutils.NamedList.NamedList`. 
+ This automatically converts all groups into strings. + + Returns: + A dictionary where each key is a unique group and each value contains that group's entries from ``x``. + If ``as_NamedList = True``, this is a ``NamedList`` instead. + + Examples: + >>> import numpy + >>> x = numpy.random.rand(10) + >>> f = numpy.random.choice(["A", "B", "C"], 10) + >>> import biocutils + >>> biocutils.split(x, f) + >>> biocutils.split(x, f, as_NamedList=True) + >>> biocutils.split(x, biocutils.Factor.from_sequence(f, ["X", "A", "Y", "B", "Z", "C"]), drop=False) + """ + + if isinstance(f, Factor): + if drop: + f = f.drop_unused_levels() + levels = f.get_levels() + indices = f.get_codes() + else: + levels = sorted(list(set(f))) + indices = match(f, levels) + + if get_height(x) != get_height(f): + raise ValueError("heights of 'x' and 'f' should be the same") + + collected = [] + for l in levels: + collected.append([]) + for i, j in enumerate(indices): + collected[j].append(i) + for i, c in enumerate(collected): + collected[i] = subset(x, c) + + if as_NamedList: + return NamedList(collected, levels) + else: + return dict(zip(levels, collected)) diff --git a/tests/test_split.py b/tests/test_split.py new file mode 100644 index 0000000..46fe7c3 --- /dev/null +++ b/tests/test_split.py @@ -0,0 +1,26 @@ +import numpy +import biocutils + + +def test_split_basic(): + x = numpy.random.rand(10) + f = ["B", "A"] * 5 + frag = biocutils.split(x, f) + assert list(frag.keys()) == ["A", "B"] + assert (frag["A"] == x[1:10:2]).all() + assert (frag["B"] == x[0:10:2]).all() + + frag = biocutils.split(x, f, as_NamedList=True) + assert frag.get_names().as_list() == ["A", "B"] + + +def test_split_Factor(): + x = numpy.random.rand(10) + f = biocutils.Factor.from_sequence(["B", "D"] * 5, levels=["E", "D", "C", "B", "A"]) + frag = biocutils.split(x, f, drop=True) + assert list(frag.keys()) == ["D", "B"] + assert (frag["B"] == x[0:10:2]).all() + assert (frag["D"] == x[1:10:2]).all() + + frag = 
biocutils.split(x, f, drop=False) + assert list(frag.keys()) == ["E", "D", "C", "B", "A"] From 969fcf73413a457b9dd1e3899725ab2400579851 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 7 Jan 2026 04:01:00 +0000 Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/biocutils/split.py | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/src/biocutils/split.py b/src/biocutils/split.py index 22108bf..3d57956 100644 --- a/src/biocutils/split.py +++ b/src/biocutils/split.py @@ -25,7 +25,7 @@ def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) - drop: Whether to drop unused levels, if ``f`` is a ``Factor``. - + as_NamedList: Whether to return the results as a :py:class:`~biocutils.NamedList.NamedList`. This automatically converts all groups into strings. @@ -36,12 +36,37 @@ def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) - Examples: >>> import numpy - >>> x = numpy.random.rand(10) - >>> f = numpy.random.choice(["A", "B", "C"], 10) + >>> x = numpy.random.rand( + ... 10 + ... ) + >>> f = numpy.random.choice( + ... ["A", "B", "C"], + ... 10, + ... ) >>> import biocutils - >>> biocutils.split(x, f) - >>> biocutils.split(x, f, as_NamedList=True) - >>> biocutils.split(x, biocutils.Factor.from_sequence(f, ["X", "A", "Y", "B", "Z", "C"]), drop=False) + >>> biocutils.split( + ... x, f + ... ) + >>> biocutils.split( + ... x, + ... f, + ... as_NamedList=True, + ... ) + >>> biocutils.split( + ... x, + ... biocutils.Factor.from_sequence( + ... f, + ... [ + ... "X", + ... "A", + ... "Y", + ... "B", + ... "Z", + ... "C", + ... ], + ... ), + ... drop=False, + ... 
) """ if isinstance(f, Factor): @@ -54,7 +79,7 @@ def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) - indices = match(f, levels) if get_height(x) != get_height(f): - raise ValueError("heights of 'x' and 'f' should be the same") + raise ValueError("heights of 'x' and 'f' should be the same") collected = [] for l in levels: From aab849e17c9c84cf50963b7f625e3df38915c491 Mon Sep 17 00:00:00 2001 From: LTLA Date: Thu, 8 Jan 2026 00:12:06 +1100 Subject: [PATCH 3/6] Keep ruff happy. --- src/biocutils/split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/biocutils/split.py b/src/biocutils/split.py index 3d57956..5945370 100644 --- a/src/biocutils/split.py +++ b/src/biocutils/split.py @@ -82,7 +82,7 @@ def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) - raise ValueError("heights of 'x' and 'f' should be the same") collected = [] - for l in levels: + for lev in levels: collected.append([]) for i, j in enumerate(indices): collected[j].append(i) From f85dfd9bf5eebbe28acfc3a2d6b843bb2560c038 Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 9 Jan 2026 00:15:07 +1100 Subject: [PATCH 4/6] Ignore None and masked values. 
--- src/biocutils/split.py | 47 +++++++++++++++++++++++++++++++++++++----- tests/test_split.py | 36 ++++++++++++++++++++++++++++++-- 2 files changed, 76 insertions(+), 7 deletions(-) diff --git a/src/biocutils/split.py b/src/biocutils/split.py index 5945370..44ea321 100644 --- a/src/biocutils/split.py +++ b/src/biocutils/split.py @@ -1,4 +1,7 @@ from typing import Any, Sequence, Union +from functools import singledispatch + +import numpy from .NamedList import NamedList from .Factor import Factor @@ -7,7 +10,14 @@ from .get_height import get_height -def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) -> Union[dict, NamedList]: +@singledispatch +def split( + x: Any, + f: Sequence, + skip: Union[set, Sequence] = [None, numpy.ma.masked], + drop: bool = False, + as_NamedList: bool = False +) -> Union[dict, NamedList]: """ Split a sequence ``x`` into groups defined by a categorical factor ``f``. @@ -23,6 +33,10 @@ def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) - The order of groups is defined by sorting all unique variables in ``f``. If a :py:class:`~biocutils.Factor.Factor` is provided, the order of groups is defined by the existing levels. + skip: + Values of ``f`` to be skipped. + The corresponding entries of ``x`` are also omitted from the output. + drop: Whether to drop unused levels, if ``f`` is a ``Factor``. 
@@ -72,10 +86,32 @@ def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) - if isinstance(f, Factor): if drop: f = f.drop_unused_levels() - levels = f.get_levels() - indices = f.get_codes() + if len(skip) > 0: + levels = [] + reindex = [] + for lev in f.get_levels(): + ix = -1 + if lev not in skip: + levels.append(lev) + ix = len(reindex) + reindex.append(ix) + indices = [] + for code in f.get_codes(): + if code >= 0: + code = reindex[code] + indices.append(code) + else: + levels = f.get_levels() + indices = f.get_codes() else: - levels = sorted(list(set(f))) + if len(skip) > 0: + levels = set() + for y in f: + if y not in skip: + levels.add(y) + else: + levels = set(f) + levels = sorted(list(levels)) indices = match(f, levels) if get_height(x) != get_height(f): @@ -85,7 +121,8 @@ def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) - for lev in levels: collected.append([]) for i, j in enumerate(indices): - collected[j].append(i) + if j >= 0: + collected[j].append(i) for i, c in enumerate(collected): collected[i] = subset(x, c) diff --git a/tests/test_split.py b/tests/test_split.py index 46fe7c3..c663155 100644 --- a/tests/test_split.py +++ b/tests/test_split.py @@ -5,22 +5,54 @@ def test_split_basic(): x = numpy.random.rand(10) f = ["B", "A"] * 5 + frag = biocutils.split(x, f) assert list(frag.keys()) == ["A", "B"] assert (frag["A"] == x[1:10:2]).all() assert (frag["B"] == x[0:10:2]).all() - frag = biocutils.split(x, f, as_NamedList=True) - assert frag.get_names().as_list() == ["A", "B"] + frag2 = biocutils.split(x, f, skip=[]) + assert list(frag.keys()) == list(frag2.keys()) + assert (frag["A"] == frag2["A"]).all() + assert (frag["B"] == frag2["B"]).all() + + nfrag = biocutils.split(x, f, as_NamedList=True) + assert nfrag.get_names().as_list() == ["A", "B"] + + +def test_split_basic_none(): + x = numpy.random.rand(15) + f = ["A", "B", None] * 5 + + frag = biocutils.split(x, f) + assert list(frag.keys()) == ["A", 
"B"] + assert (frag["A"] == x[0:15:3]).all() + assert (frag["B"] == x[1:15:3]).all() def test_split_Factor(): x = numpy.random.rand(10) f = biocutils.Factor.from_sequence(["B", "D"] * 5, levels=["E", "D", "C", "B", "A"]) + frag = biocutils.split(x, f, drop=True) assert list(frag.keys()) == ["D", "B"] assert (frag["B"] == x[0:10:2]).all() assert (frag["D"] == x[1:10:2]).all() + frag2 = biocutils.split(x, f, skip=[], drop=True) + assert list(frag.keys()) == list(frag2.keys()) + assert (frag["B"] == frag2["B"]).all() + assert (frag["D"] == frag2["D"]).all() + frag = biocutils.split(x, f, drop=False) assert list(frag.keys()) == ["E", "D", "C", "B", "A"] + + +def test_split_Factor_none(): + x = numpy.random.rand(15) + f = biocutils.Factor.from_sequence(["A", "B", None] * 5) + + frag = biocutils.split(x, f) + assert list(frag.keys()) == ["A", "B"] + assert (frag["A"] == x[0:15:3]).all() + assert (frag["B"] == x[1:15:3]).all() From ccbbfd382caff3dab379e363fc963254d08bbbec Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 8 Jan 2026 13:15:50 +0000 Subject: [PATCH 5/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/biocutils/split.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/biocutils/split.py b/src/biocutils/split.py index 44ea321..b60a96c 100644 --- a/src/biocutils/split.py +++ b/src/biocutils/split.py @@ -16,7 +16,7 @@ def split( f: Sequence, skip: Union[set, Sequence] = [None, numpy.ma.masked], drop: bool = False, - as_NamedList: bool = False + as_NamedList: bool = False, ) -> Union[dict, NamedList]: """ Split a sequence ``x`` into groups defined by a categorical factor ``f``. 
@@ -95,7 +95,7 @@ def split( levels.append(lev) ix = len(reindex) reindex.append(ix) - indices = [] + indices = [] for code in f.get_codes(): if code >= 0: code = reindex[code] From 25cb182b9dd0d4dd1d3770631efcea3620237671 Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 9 Jan 2026 00:30:07 +1100 Subject: [PATCH 6/6] Bugfix for skippable levels of a Factor. --- src/biocutils/split.py | 2 +- tests/test_split.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/biocutils/split.py b/src/biocutils/split.py index b60a96c..2ae56c7 100644 --- a/src/biocutils/split.py +++ b/src/biocutils/split.py @@ -92,8 +92,8 @@ def split( for lev in f.get_levels(): ix = -1 if lev not in skip: + ix = len(levels) levels.append(lev) - ix = len(reindex) reindex.append(ix) indices = [] for code in f.get_codes(): diff --git a/tests/test_split.py b/tests/test_split.py index c663155..cd05f1e 100644 --- a/tests/test_split.py +++ b/tests/test_split.py @@ -56,3 +56,7 @@ def test_split_Factor_none(): assert list(frag.keys()) == ["A", "B"] assert (frag["A"] == x[0:15:3]).all() assert (frag["B"] == x[1:15:3]).all() + + frag = biocutils.split(x, f, skip=set([None, "A"])) + assert list(frag.keys()) == ["B"] + assert (frag["B"] == x[1:15:3]).all()