diff --git a/package/AUTHORS b/package/AUTHORS index 81ada5e07ca..a81f5cd7e68 100644 --- a/package/AUTHORS +++ b/package/AUTHORS @@ -324,4 +324,4 @@ Logo The MDAnalysis 'Atom' logo was designed by Christian Beckstein; it is Copyright (c) 2011 Christian Beckstein and made available under a -Creative Commons Attribution-NoDerivs 3.0 Unported License. +Creative Commons Attribution-NoDerivs 3.0 Unported License. \ No newline at end of file diff --git a/package/CHANGELOG b/package/CHANGELOG index 1a36fce7284..3d45b256679 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -45,6 +45,9 @@ Fixes DSSP by porting upstream PyDSSP 0.9.1 fix (Issue #4913) Enhancements + * Improved performance of inverse index mapping in AtomGroup using an optimized + Cython implementation in lib._cutils.inverse_int_index() + (Issue #3387, PR #5252) * Added `select=None` in `analysis.rms.RMSD` to perform no selection on the input `atomgroup` and `reference` (Issue #5300, PR #5296) * MOL2Parser now reads unit cell dimensions from @CRYSIN records (Issue #3341) @@ -3627,4 +3630,4 @@ Testsuite licenses 11/12/07 naveen - * prepared for release outside lab + * prepared for release outside lab \ No newline at end of file diff --git a/package/MDAnalysis/core/groups.py b/package/MDAnalysis/core/groups.py index aad2fb02f34..05cab8382a6 100644 --- a/package/MDAnalysis/core/groups.py +++ b/package/MDAnalysis/core/groups.py @@ -122,6 +122,7 @@ from ..exceptions import NoDataError from . import topologyobjects from ._get_readers import get_writer_for, get_converter_for +from ..lib._cutil import inverse_int_index def _unpickle(u, ix): @@ -912,10 +913,7 @@ def _asunique(self, group, sorted=False, set_mask=False): indices = unique_int_1d_unsorted(self.ix) if set_mask: - mask = np.zeros_like(self.ix) - for i, x in enumerate(indices): - values = np.where(self.ix == x)[0] - mask[values] = i + mask = inverse_int_index(self.ix, indices) self._unique_restore_mask = mask issorted = int_array_is_sorted(indices) diff --git a/package/MDAnalysis/lib/_cutil.pyx b/package/MDAnalysis/lib/_cutil.pyx index 5c447eada86..19bc9d354f7 100644 --- a/package/MDAnalysis/lib/_cutil.pyx +++ b/package/MDAnalysis/lib/_cutil.pyx @@ -37,7 +37,7 @@ from cython.operator cimport dereference as deref cnp.import_array() -__all__ = ['unique_int_1d', 'make_whole', 'find_fragments', +__all__ = ['unique_int_1d', 'inverse_int_index', 'make_whole', 'find_fragments', '_sarrus_det_single', '_sarrus_det_multiple'] cdef extern from "calc_distances.h": @@ -91,6 +91,65 @@ def unique_int_1d(cnp.intp_t[:] values): return np.array(result) +@cython.boundscheck(False) +@cython.wraparound(False) +def inverse_int_index(cnp.intp_t[:] values, + cnp.intp_t[:] unique_vals): + r"""Construct an inverse index array (mask) mapping values to unique_vals. + + The returned mask contains the indices such that: + + .. math:: + \text{unique\_vals}[\text{mask}] == \text{values} + + Parameters + ---------- + values : numpy.ndarray + 1D array of integers (can contain duplicates). + unique_vals : numpy.ndarray + 1D array of unique integers corresponding to the elements in `values`. + + Returns + ------- + numpy.ndarray + An integer array `mask` of the same length as `values`, where + ``mask[i]`` is the index of ``values[i]`` in `unique_vals`. + + + Notes + ----- + + + .. versionadded:: 2.11.0 + + + Examples + -------- + >>> import numpy as np + >>> from MDAnalysis.lib._cutil import inverse_int_index + >>> vals = np.array([1, 5, 3, 3, 6], dtype=np.intp) + >>> uniq = np.array([1, 5, 3, 6], dtype=np.intp) + >>> mask = inverse_int_index(vals, uniq) + >>> mask + array([0, 1, 2, 2, 3]) + >>> np.all(uniq[mask] == vals) + True + """ + + cdef Py_ssize_t n = values.shape[0] + cdef Py_ssize_t m = unique_vals.shape[0] + cdef Py_ssize_t i + + cdef dict lookup = {} + cdef cnp.intp_t[:] mask = np.empty(n, dtype=np.intp) + + for i in range(m): + lookup[unique_vals[i]] = i + + for i in range(n): + mask[i] = lookup[values[i]] + + return np.array(mask) @cython.boundscheck(False) def _in2d(cnp.intp_t[:, :] arr1, cnp.intp_t[:, :] arr2): @@ -515,4 +574,4 @@ def find_fragments(atoms, bondlist): # Add fragment to output frags.append(np.asarray(this_frag)) - return frags + return frags \ No newline at end of file diff --git a/testsuite/MDAnalysisTests/lib/test_cutil.py b/testsuite/MDAnalysisTests/lib/test_cutil.py index 47c4d7f905c..f27f925e127 100644 --- a/testsuite/MDAnalysisTests/lib/test_cutil.py +++ b/testsuite/MDAnalysisTests/lib/test_cutil.py @@ -28,6 +28,7 @@ unique_int_1d, find_fragments, _in2d, + inverse_int_index, ) @@ -103,3 +104,52 @@ def test_in2d_VE(arr1, arr2): ValueError, match=r"Both arrays must be \(n, 2\) arrays" ): _in2d(arr1, arr2) + + +def _python_reference_mask(ix, indices): + mask = np.zeros_like(ix) + for i, x in enumerate(indices): + values = np.where(ix == x)[0] + mask[values] = i + return mask + + +@pytest.mark.parametrize( + "ix,indices", + [ + # unsorted and not unique + ( + np.array([1, 5, 3, 3, 6], dtype=np.intp), + np.array([1, 5, 3, 6], dtype=np.intp), + ), + # sorted and not unique + ( + np.array([1, 3, 3, 5, 6], dtype=np.intp), + np.array([1, 3, 5, 6], dtype=np.intp), + ), + # unsorted and unique + ( + np.array([1, 5, 3, 6], dtype=np.intp), + np.array([1, 5, 3, 6], dtype=np.intp), + ), + # sorted and unique + ( + np.array([1, 3, 5, 6], dtype=np.intp), + np.array([1, 3, 5, 6], dtype=np.intp), + ), + # all elements identical + ( + np.array([5, 5, 5], dtype=np.intp), + np.array([5], dtype=np.intp), + ), + # single element + ( + np.array([7], dtype=np.intp), + np.array([7], dtype=np.intp), + ), + ], +) +def test_inverse_int_index(ix, indices): + pyref = _python_reference_mask(ix, indices) + cy = inverse_int_index(ix, indices) + assert_equal(pyref, cy)