From b50f71123abd5492702c32195de325ceced27f8c Mon Sep 17 00:00:00 2001
From: LTLA <infinite.monkeys.with.keyboards@gmail.com>
Date: Wed, 7 Jan 2026 14:58:58 +1100
Subject: [PATCH 1/6] Implemented R's split() utility to split a sequence by a
 grouping factor.

---
 src/biocutils/__init__.py |  1 +
 src/biocutils/split.py    | 70 +++++++++++++++++++++++++++++++++++++++
 tests/test_split.py       | 26 +++++++++++++++
 3 files changed, 97 insertions(+)
 create mode 100644 src/biocutils/split.py
 create mode 100644 tests/test_split.py

diff --git a/src/biocutils/__init__.py b/src/biocutils/__init__.py
index 81ce3ae..8c7ddd4 100644
--- a/src/biocutils/__init__.py
+++ b/src/biocutils/__init__.py
@@ -63,3 +63,4 @@
 
 from .biocobject import BiocObject
 from .table import table
+from .split import split
diff --git a/src/biocutils/split.py b/src/biocutils/split.py
new file mode 100644
index 0000000..22108bf
--- /dev/null
+++ b/src/biocutils/split.py
@@ -0,0 +1,70 @@
+from typing import Any, Sequence, Union
+
+from .NamedList import NamedList
+from .Factor import Factor
+from .match import match
+from .subset import subset
+from .get_height import get_height
+
+
+def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) -> Union[dict, NamedList]:
+    """
+    Split a sequence ``x`` into groups defined by a categorical factor ``f``.
+
+    Args:
+        x:
+            Values to be divided into groups.
+            Any object that supports :py:func:`~biocutils.subset.subset` can be used here.
+
+        f:
+            A sequence of categorical variables defining the groupings.
+            This should have length equal to the "height" of ``x`` (see :py:func:`~biocutils.get_height.get_height`).
+
+            The order of groups is defined by sorting all unique variables in ``f``.
+            If a :py:class:`~biocutils.Factor.Factor` is provided, the order of groups is defined by the existing levels.
+
+        drop:
+            Whether to drop unused levels, if ``f`` is a ``Factor``.
+        
+        as_NamedList:
+            Whether to return the results as a :py:class:`~biocutils.NamedList.NamedList`.
+            This automatically converts all groups into strings.
+
+    Returns:
+        A dictionary where each key is a unique group and each value contains that group's entries from ``x``.
+        If ``as_NamedList = True``, this is a ``NamedList`` instead.
+
+    Examples:
+        >>> import numpy
+        >>> x = numpy.random.rand(10)
+        >>> f = numpy.random.choice(["A", "B", "C"], 10)
+        >>> import biocutils
+        >>> biocutils.split(x, f)
+        >>> biocutils.split(x, f, as_NamedList=True)
+        >>> biocutils.split(x, biocutils.Factor.from_sequence(f, ["X", "A", "Y", "B", "Z", "C"]), drop=False)
+    """
+
+    if isinstance(f, Factor):
+        if drop:
+            f = f.drop_unused_levels()
+        levels = f.get_levels()
+        indices = f.get_codes()
+    else:
+        levels = sorted(list(set(f)))
+        indices = match(f, levels)
+
+    if get_height(x) != get_height(f):
+        raise ValueError("heights of 'x' and 'f' should be the same") 
+
+    collected = []
+    for l in levels:
+        collected.append([])
+    for i, j in enumerate(indices):
+        collected[j].append(i)
+    for i, c in enumerate(collected):
+        collected[i] = subset(x, c)
+
+    if as_NamedList:
+        return NamedList(collected, levels)
+    else:
+        return dict(zip(levels, collected))
diff --git a/tests/test_split.py b/tests/test_split.py
new file mode 100644
index 0000000..46fe7c3
--- /dev/null
+++ b/tests/test_split.py
@@ -0,0 +1,26 @@
+import numpy
+import biocutils
+
+
+def test_split_basic():
+    x = numpy.random.rand(10)
+    f = ["B", "A"] * 5
+    frag = biocutils.split(x, f)
+    assert list(frag.keys()) == ["A", "B"]
+    assert (frag["A"] == x[1:10:2]).all()
+    assert (frag["B"] == x[0:10:2]).all()
+
+    frag = biocutils.split(x, f, as_NamedList=True)
+    assert frag.get_names().as_list() == ["A", "B"]
+
+
+def test_split_Factor():
+    x = numpy.random.rand(10)
+    f = biocutils.Factor.from_sequence(["B", "D"] * 5, levels=["E", "D", "C", "B", "A"])
+    frag = biocutils.split(x, f, drop=True)
+    assert list(frag.keys()) == ["D", "B"]
+    assert (frag["B"] == x[0:10:2]).all()
+    assert (frag["D"] == x[1:10:2]).all()
+
+    frag = biocutils.split(x, f, drop=False)
+    assert list(frag.keys()) == ["E", "D", "C", "B", "A"]

From 969fcf73413a457b9dd1e3899725ab2400579851 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 7 Jan 2026 04:01:00 +0000
Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/biocutils/split.py | 39 ++++++++++++++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 7 deletions(-)

diff --git a/src/biocutils/split.py b/src/biocutils/split.py
index 22108bf..3d57956 100644
--- a/src/biocutils/split.py
+++ b/src/biocutils/split.py
@@ -25,7 +25,7 @@ def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) -
 
         drop:
             Whether to drop unused levels, if ``f`` is a ``Factor``.
-        
+
         as_NamedList:
             Whether to return the results as a :py:class:`~biocutils.NamedList.NamedList`.
             This automatically converts all groups into strings.
@@ -36,12 +36,37 @@ def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) -
 
     Examples:
         >>> import numpy
-        >>> x = numpy.random.rand(10)
-        >>> f = numpy.random.choice(["A", "B", "C"], 10)
+        >>> x = numpy.random.rand(
+        ...     10
+        ... )
+        >>> f = numpy.random.choice(
+        ...     ["A", "B", "C"],
+        ...     10,
+        ... )
         >>> import biocutils
-        >>> biocutils.split(x, f)
-        >>> biocutils.split(x, f, as_NamedList=True)
-        >>> biocutils.split(x, biocutils.Factor.from_sequence(f, ["X", "A", "Y", "B", "Z", "C"]), drop=False)
+        >>> biocutils.split(
+        ...     x, f
+        ... )
+        >>> biocutils.split(
+        ...     x,
+        ...     f,
+        ...     as_NamedList=True,
+        ... )
+        >>> biocutils.split(
+        ...     x,
+        ...     biocutils.Factor.from_sequence(
+        ...         f,
+        ...         [
+        ...             "X",
+        ...             "A",
+        ...             "Y",
+        ...             "B",
+        ...             "Z",
+        ...             "C",
+        ...         ],
+        ...     ),
+        ...     drop=False,
+        ... )
     """
 
     if isinstance(f, Factor):
@@ -54,7 +79,7 @@ def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) -
         indices = match(f, levels)
 
     if get_height(x) != get_height(f):
-        raise ValueError("heights of 'x' and 'f' should be the same") 
+        raise ValueError("heights of 'x' and 'f' should be the same")
 
     collected = []
     for l in levels:

From aab849e17c9c84cf50963b7f625e3df38915c491 Mon Sep 17 00:00:00 2001
From: LTLA <infinite.monkeys.with.keyboards@gmail.com>
Date: Thu, 8 Jan 2026 00:12:06 +1100
Subject: [PATCH 3/6] Keep ruff happy.

---
 src/biocutils/split.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/biocutils/split.py b/src/biocutils/split.py
index 3d57956..5945370 100644
--- a/src/biocutils/split.py
+++ b/src/biocutils/split.py
@@ -82,7 +82,7 @@ def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) -
         raise ValueError("heights of 'x' and 'f' should be the same")
 
     collected = []
-    for l in levels:
+    for lev in levels:
         collected.append([])
     for i, j in enumerate(indices):
         collected[j].append(i)

From f85dfd9bf5eebbe28acfc3a2d6b843bb2560c038 Mon Sep 17 00:00:00 2001
From: LTLA <infinite.monkeys.with.keyboards@gmail.com>
Date: Fri, 9 Jan 2026 00:15:07 +1100
Subject: [PATCH 4/6] Ignore None and masked values.

---
 src/biocutils/split.py | 47 +++++++++++++++++++++++++++++++++++++-----
 tests/test_split.py    | 36 ++++++++++++++++++++++++++++++--
 2 files changed, 76 insertions(+), 7 deletions(-)

diff --git a/src/biocutils/split.py b/src/biocutils/split.py
index 5945370..44ea321 100644
--- a/src/biocutils/split.py
+++ b/src/biocutils/split.py
@@ -1,4 +1,7 @@
 from typing import Any, Sequence, Union
+from functools import singledispatch
+
+import numpy
 
 from .NamedList import NamedList
 from .Factor import Factor
@@ -7,7 +10,14 @@
 from .get_height import get_height
 
 
-def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) -> Union[dict, NamedList]:
+@singledispatch
+def split(
+    x: Any,
+    f: Sequence,
+    skip: Union[set, Sequence] = [None, numpy.ma.masked],
+    drop: bool = False,
+    as_NamedList: bool = False
+) -> Union[dict, NamedList]:
     """
     Split a sequence ``x`` into groups defined by a categorical factor ``f``.
 
@@ -23,6 +33,10 @@ def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) -
             The order of groups is defined by sorting all unique variables in ``f``.
             If a :py:class:`~biocutils.Factor.Factor` is provided, the order of groups is defined by the existing levels.
 
+        skip:
+            Values of ``f`` to be skipped.
+            The corresponding entries of ``x`` are also omitted from the output.
+
         drop:
             Whether to drop unused levels, if ``f`` is a ``Factor``.
 
@@ -72,10 +86,32 @@ def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) -
     if isinstance(f, Factor):
         if drop:
             f = f.drop_unused_levels()
-        levels = f.get_levels()
-        indices = f.get_codes()
+        if len(skip) > 0:
+            levels = []
+            reindex = []
+            for lev in f.get_levels():
+                ix = -1
+                if lev not in skip:
+                    levels.append(lev)
+                    ix = len(reindex)
+                reindex.append(ix)
+            indices = [] 
+            for code in f.get_codes():
+                if code >= 0:
+                    code = reindex[code]
+                indices.append(code)
+        else:
+            levels = f.get_levels()
+            indices = f.get_codes()
     else:
-        levels = sorted(list(set(f)))
+        if len(skip) > 0:
+            levels = set()
+            for y in f:
+                if y not in skip:
+                    levels.add(y)
+        else:
+            levels = set(f)
+        levels = sorted(list(levels))
         indices = match(f, levels)
 
     if get_height(x) != get_height(f):
@@ -85,7 +121,8 @@ def split(x: Any, f: Sequence, drop: bool = False, as_NamedList: bool = False) -
     for lev in levels:
         collected.append([])
     for i, j in enumerate(indices):
-        collected[j].append(i)
+        if j >= 0:
+            collected[j].append(i)
     for i, c in enumerate(collected):
         collected[i] = subset(x, c)
 
diff --git a/tests/test_split.py b/tests/test_split.py
index 46fe7c3..c663155 100644
--- a/tests/test_split.py
+++ b/tests/test_split.py
@@ -5,22 +5,54 @@
 def test_split_basic():
     x = numpy.random.rand(10)
     f = ["B", "A"] * 5
+
     frag = biocutils.split(x, f)
     assert list(frag.keys()) == ["A", "B"]
     assert (frag["A"] == x[1:10:2]).all()
     assert (frag["B"] == x[0:10:2]).all()
 
-    frag = biocutils.split(x, f, as_NamedList=True)
-    assert frag.get_names().as_list() == ["A", "B"]
+    frag2 = biocutils.split(x, f, skip=[])
+    assert list(frag.keys()) == list(frag2.keys())
+    assert (frag["A"] == frag2["A"]).all()
+    assert (frag["B"] == frag2["B"]).all()
+
+    nfrag = biocutils.split(x, f, as_NamedList=True)
+    assert nfrag.get_names().as_list() == ["A", "B"]
+
+
+def test_split_basic_none():
+    x = numpy.random.rand(15)
+    f = ["A", "B", None] * 5
+
+    frag = biocutils.split(x, f)
+    assert list(frag.keys()) == ["A", "B"]
+    assert (frag["A"] == x[0:15:3]).all()
+    assert (frag["B"] == x[1:15:3]).all()
 
 
 def test_split_Factor():
     x = numpy.random.rand(10)
     f = biocutils.Factor.from_sequence(["B", "D"] * 5, levels=["E", "D", "C", "B", "A"])
+
     frag = biocutils.split(x, f, drop=True)
     assert list(frag.keys()) == ["D", "B"]
     assert (frag["B"] == x[0:10:2]).all()
     assert (frag["D"] == x[1:10:2]).all()
 
+    frag2 = biocutils.split(x, f, skip=[], drop=True)
+    assert list(frag.keys()) == list(frag2.keys())
+    assert (frag["B"] == frag2["B"]).all()
+    assert (frag["D"] == frag2["D"]).all()
+
     frag = biocutils.split(x, f, drop=False)
     assert list(frag.keys()) == ["E", "D", "C", "B", "A"]
+
+
+def test_split_Factor_none():
+    x = numpy.random.rand(15)
+    f = biocutils.Factor.from_sequence(["A", "B", None] * 5)
+
+    frag = biocutils.split(x, f)
+    assert list(frag.keys()) == ["A", "B"]
+    assert (frag["A"] == x[0:15:3]).all()
+    assert (frag["B"] == x[1:15:3]).all()

From ccbbfd382caff3dab379e363fc963254d08bbbec Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 8 Jan 2026 13:15:50 +0000
Subject: [PATCH 5/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/biocutils/split.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/biocutils/split.py b/src/biocutils/split.py
index 44ea321..b60a96c 100644
--- a/src/biocutils/split.py
+++ b/src/biocutils/split.py
@@ -16,7 +16,7 @@ def split(
     f: Sequence,
     skip: Union[set, Sequence] = [None, numpy.ma.masked],
     drop: bool = False,
-    as_NamedList: bool = False
+    as_NamedList: bool = False,
 ) -> Union[dict, NamedList]:
     """
     Split a sequence ``x`` into groups defined by a categorical factor ``f``.
@@ -95,7 +95,7 @@ def split(
                     levels.append(lev)
                     ix = len(reindex)
                 reindex.append(ix)
-            indices = [] 
+            indices = []
             for code in f.get_codes():
                 if code >= 0:
                     code = reindex[code]

From 25cb182b9dd0d4dd1d3770631efcea3620237671 Mon Sep 17 00:00:00 2001
From: LTLA <infinite.monkeys.with.keyboards@gmail.com>
Date: Fri, 9 Jan 2026 00:30:07 +1100
Subject: [PATCH 6/6] Bugfix for skippable levels of a Factor.

---
 src/biocutils/split.py | 2 +-
 tests/test_split.py    | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/biocutils/split.py b/src/biocutils/split.py
index b60a96c..2ae56c7 100644
--- a/src/biocutils/split.py
+++ b/src/biocutils/split.py
@@ -92,8 +92,8 @@ def split(
             for lev in f.get_levels():
                 ix = -1
                 if lev not in skip:
+                    ix = len(levels)
                     levels.append(lev)
-                    ix = len(reindex)
                 reindex.append(ix)
             indices = []
             for code in f.get_codes():
diff --git a/tests/test_split.py b/tests/test_split.py
index c663155..cd05f1e 100644
--- a/tests/test_split.py
+++ b/tests/test_split.py
@@ -56,3 +56,7 @@ def test_split_Factor_none():
     assert list(frag.keys()) == ["A", "B"]
     assert (frag["A"] == x[0:15:3]).all()
     assert (frag["B"] == x[1:15:3]).all()
+
+    frag = biocutils.split(x, f, skip=set([None, "A"]))
+    assert list(frag.keys()) == ["B"]
+    assert (frag["B"] == x[1:15:3]).all()