From d07e9640cb7ab06d7e9656a6bc816681fbaedbe4 Mon Sep 17 00:00:00 2001 From: Ayush Agarwal Date: Thu, 26 Feb 2026 15:08:49 +0530 Subject: [PATCH 01/13] Optimize inverse index mapping in groups.py (Fixes Issue #3387) --- package/MDAnalysis/core/groups.py | 6 ++--- package/MDAnalysis/lib/_cutil.pyx | 40 +++++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/package/MDAnalysis/core/groups.py b/package/MDAnalysis/core/groups.py index aad2fb02f34..05cab8382a6 100644 --- a/package/MDAnalysis/core/groups.py +++ b/package/MDAnalysis/core/groups.py @@ -122,6 +122,7 @@ from ..exceptions import NoDataError from . import topologyobjects from ._get_readers import get_writer_for, get_converter_for +from ..lib._cutil import inverse_int_index def _unpickle(u, ix): @@ -912,10 +913,7 @@ def _asunique(self, group, sorted=False, set_mask=False): indices = unique_int_1d_unsorted(self.ix) if set_mask: - mask = np.zeros_like(self.ix) - for i, x in enumerate(indices): - values = np.where(self.ix == x)[0] - mask[values] = i + mask = inverse_int_index(self.ix, indices) self._unique_restore_mask = mask issorted = int_array_is_sorted(indices) diff --git a/package/MDAnalysis/lib/_cutil.pyx b/package/MDAnalysis/lib/_cutil.pyx index 5c447eada86..7c350c50990 100644 --- a/package/MDAnalysis/lib/_cutil.pyx +++ b/package/MDAnalysis/lib/_cutil.pyx @@ -37,7 +37,7 @@ from cython.operator cimport dereference as deref cnp.import_array() -__all__ = ['unique_int_1d', 'make_whole', 'find_fragments', +__all__ = ['unique_int_1d', 'inverse_int_index', 'make_whole', 'find_fragments', '_sarrus_det_single', '_sarrus_det_multiple'] cdef extern from "calc_distances.h": @@ -91,6 +91,42 @@ def unique_int_1d(cnp.intp_t[:] values): return np.array(result) +@cython.boundscheck(False) +@cython.wraparound(False) +def inverse_int_index(cnp.intp_t[:] values, + cnp.intp_t[:] unique_vals): + """ + Construct inverse index map such that: + + unique_vals[mask] == values + + Parameters + ---------- + values : numpy.ndarray + 1D array of integers. + unique_vals : numpy.ndarray + 1D array of unique integers (unsorted). + + Returns + ------- + numpy.ndarray + Integer mask mapping values -> index in unique_vals. + """ + + cdef Py_ssize_t n = values.shape[0] + cdef Py_ssize_t m = unique_vals.shape[0] + cdef Py_ssize_t i + + cdef dict lookup = {} + cdef cnp.intp_t[:] mask = np.empty(n, dtype=np.intp) + + for i in range(m): + lookup[unique_vals[i]] = i + + for i in range(n): + mask[i] = lookup[values[i]] + + return np.array(mask) @cython.boundscheck(False) def _in2d(cnp.intp_t[:, :] arr1, cnp.intp_t[:, :] arr2): @@ -515,4 +551,4 @@ def find_fragments(atoms, bondlist): # Add fragment to output frags.append(np.asarray(this_frag)) - return frags + return frags \ No newline at end of file From 463a1ec8706ecc3d2bfda8b5ebdf11e0512be4b6 Mon Sep 17 00:00:00 2001 From: Ayush Agarwal Date: Thu, 26 Feb 2026 16:20:12 +0530 Subject: [PATCH 02/13] Add changelog entry and authors entry for PR #5252 --- package/AUTHORS | 1 + package/CHANGELOG | 3 +++ 2 files changed, 4 insertions(+) diff --git a/package/AUTHORS b/package/AUTHORS index 1d7e1646794..6c9bad0abb0 100644 --- a/package/AUTHORS +++ b/package/AUTHORS @@ -271,6 +271,7 @@ Chronological list of authors - Mohammad Ayaan - Khushi Phougat - Kushagar Garg + - Ayush Agarwal External code ------------- diff --git a/package/CHANGELOG b/package/CHANGELOG index fec72a9729c..e1dcfa710a3 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -41,6 +41,9 @@ Enhancements * Adds support for parsing `.tpr` files produced by GROMACS 2026.0 * Enables parallelization for analysis.diffusionmap.DistanceMatrix (Issue #4679, PR #4745) + * Improve performance of inverse index mapping in AtomGroup by + replacing O(n^2) logic with optimized Cython implementation. + (Issue #3387, PR #5252) Changes * The msd.py inside analysis is changed, and ProgressBar is implemented inside From d62eecd6ab8ae84cf9cd7c088dc178a2f2b583ba Mon Sep 17 00:00:00 2001 From: Ayush Agarwal Date: Mon, 2 Mar 2026 21:28:50 +0530 Subject: [PATCH 03/13] Added thresholding to using cython function --- package/MDAnalysis/core/groups.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/package/MDAnalysis/core/groups.py b/package/MDAnalysis/core/groups.py index 05cab8382a6..2f2159f0f4b 100644 --- a/package/MDAnalysis/core/groups.py +++ b/package/MDAnalysis/core/groups.py @@ -913,7 +913,13 @@ def _asunique(self, group, sorted=False, set_mask=False): indices = unique_int_1d_unsorted(self.ix) if set_mask: - mask = inverse_int_index(self.ix, indices) + mask = np.zeros_like(self.ix) + if len(indices) * 50 > len(self.ix) : + mask = inverse_int_index(self.ix, indices) + else : + for i, x in enumerate(indices): + values = np.where(self.ix == x)[0] + mask[values] = i self._unique_restore_mask = mask issorted = int_array_is_sorted(indices) From fa8ad6c0adc6a29a475d4d4ce8ff43bfc4b18606 Mon Sep 17 00:00:00 2001 From: Ayush Agarwal Date: Mon, 2 Mar 2026 21:54:20 +0530 Subject: [PATCH 04/13] Number of unique values threshold for choosing cython function --- package/MDAnalysis/core/groups.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/MDAnalysis/core/groups.py b/package/MDAnalysis/core/groups.py index 2f2159f0f4b..0106a920d66 100644 --- a/package/MDAnalysis/core/groups.py +++ b/package/MDAnalysis/core/groups.py @@ -914,7 +914,7 @@ def _asunique(self, group, sorted=False, set_mask=False): indices = unique_int_1d_unsorted(self.ix) if set_mask: mask = np.zeros_like(self.ix) - if len(indices) * 50 > len(self.ix) : + if len(indices) * 50 > len(self.ix) and len(indices) > 80: mask = inverse_int_index(self.ix, indices) else : for i, x in enumerate(indices): From 9ae78cbb466d8c3d789f7fdae50e450761158248 Mon Sep 17 00:00:00 2001 From: Ayush Agarwal Date: Mon, 2 Mar 2026 22:21:56 +0530 Subject: [PATCH 05/13] Optimised pythonic calculation of inverse index mapping --- package/MDAnalysis/core/groups.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/package/MDAnalysis/core/groups.py b/package/MDAnalysis/core/groups.py index 0106a920d66..6b4538e2d0e 100644 --- a/package/MDAnalysis/core/groups.py +++ b/package/MDAnalysis/core/groups.py @@ -918,8 +918,7 @@ def _asunique(self, group, sorted=False, set_mask=False): mask = inverse_int_index(self.ix, indices) else : for i, x in enumerate(indices): - values = np.where(self.ix == x)[0] - mask[values] = i + mask[self.ix == x] = i self._unique_restore_mask = mask issorted = int_array_is_sorted(indices) From fd97d1922c36a520666003695f955e7387cfde62 Mon Sep 17 00:00:00 2001 From: Ayush Agarwal Date: Mon, 2 Mar 2026 23:23:27 +0530 Subject: [PATCH 06/13] Removed unique value number threshold for benchmarking --- package/MDAnalysis/core/groups.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/MDAnalysis/core/groups.py b/package/MDAnalysis/core/groups.py index 6b4538e2d0e..29e5a53e283 100644 --- a/package/MDAnalysis/core/groups.py +++ b/package/MDAnalysis/core/groups.py @@ -914,7 +914,7 @@ def _asunique(self, group, sorted=False, set_mask=False): indices = unique_int_1d_unsorted(self.ix) if set_mask: mask = np.zeros_like(self.ix) - if len(indices) * 50 > len(self.ix) and len(indices) > 80: + if len(indices) * 50 > len(self.ix): mask = inverse_int_index(self.ix, indices) else : for i, x in enumerate(indices): From f56d24686b7e94bc873d38559cb57b0c68ea104f Mon Sep 17 00:00:00 2001 From: Ayush Agarwal Date: Mon, 2 Mar 2026 23:52:07 +0530 Subject: [PATCH 07/13] Trying 10 for cython threshold --- package/MDAnalysis/core/groups.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/MDAnalysis/core/groups.py b/package/MDAnalysis/core/groups.py index 29e5a53e283..2877707cda9 100644 --- a/package/MDAnalysis/core/groups.py +++ b/package/MDAnalysis/core/groups.py @@ -914,7 +914,7 @@ def _asunique(self, group, sorted=False, set_mask=False): indices = unique_int_1d_unsorted(self.ix) if set_mask: mask = np.zeros_like(self.ix) - if len(indices) * 50 > len(self.ix): + if len(indices) * 10 > len(self.ix): mask = inverse_int_index(self.ix, indices) else : for i, x in enumerate(indices): From fcb4931904f035ca2bd529e20903f5147e04ab91 Mon Sep 17 00:00:00 2001 From: Ayush Agarwal Date: Tue, 3 Mar 2026 00:13:15 +0530 Subject: [PATCH 08/13] Removed thresholding altogether for benchmarking --- package/MDAnalysis/core/groups.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/package/MDAnalysis/core/groups.py b/package/MDAnalysis/core/groups.py index 2877707cda9..05cab8382a6 100644 --- a/package/MDAnalysis/core/groups.py +++ b/package/MDAnalysis/core/groups.py @@ -913,12 +913,7 @@ def _asunique(self, group, sorted=False, set_mask=False): indices = unique_int_1d_unsorted(self.ix) if set_mask: - mask = np.zeros_like(self.ix) - if len(indices) * 10 > len(self.ix): - mask = inverse_int_index(self.ix, indices) - else : - for i, x in enumerate(indices): - mask[self.ix == x] = i + mask = inverse_int_index(self.ix, indices) self._unique_restore_mask = mask issorted = int_array_is_sorted(indices) From e22709e96d22dbec5d44458731d47e4b4e21f44e Mon Sep 17 00:00:00 2001 From: Ayush Agarwal Date: Wed, 11 Mar 2026 23:25:41 +0530 Subject: [PATCH 09/13] Corrected AUTHORS file --- package/AUTHORS | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/package/AUTHORS b/package/AUTHORS index 1b690fd0dd6..5b56d82dedb 100644 --- a/package/AUTHORS +++ b/package/AUTHORS @@ -271,8 +271,11 @@ Chronological list of authors - Mohammad Ayaan - Khushi Phougat - Kushagar Garg - - Ayush Agarwal - Jeremy M. G. Leung + - Harshit Gajjela + - Kunj Sinha + - Ayush Agarwal + - Parth Uppal External code ------------- @@ -320,4 +323,4 @@ Logo The MDAnalysis 'Atom' logo was designed by Christian Beckstein; it is Copyright (c) 2011 Christian Beckstein and made available under a -Creative Commons Attribution-NoDerivs 3.0 Unported License. +Creative Commons Attribution-NoDerivs 3.0 Unported License. \ No newline at end of file From 97f112c1f10e7d977dbc4241d4fc5a012f98ce78 Mon Sep 17 00:00:00 2001 From: Ayush Agarwal Date: Wed, 11 Mar 2026 23:33:23 +0530 Subject: [PATCH 10/13] Corrected CHANGELOG --- package/CHANGELOG | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index e10fa1a49a8..c12c96d93b8 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -15,11 +15,18 @@ The rules for this file: ------------------------------------------------------------------------------- ??/??/?? IAlibay, orbeckst, marinegor, tylerjereddy, ljwoods2, marinegor, - spyke7, talagayev, tanii1125, BradyAJohnston, hejamu, jeremyleung521 + spyke7, talagayev, tanii1125, BradyAJohnston, hejamu, jeremyleung521, + harshitgajjela-droid, kunjsinha, aygarwal, jauy123 * 2.11.0 Fixes + * Improved performance of inverse index mapping in AtomGroup using an optimized + Cython implementation in lib._cutils.inverse_int_index() + (Issue #3387, PR #5252) + * Fixes TypeError with np.int64 indexing in GSD Reader (Issue #5224) + * Fixed bug in add_transformations allowing non-callable transformations + (Issue #2558, PR #2558) * Drop/Replace lock file test in test_xdr (Issue #5236, PR #5237) * HydrogenBondAnalysis: Fixed `count_by_time()` when using `run(FrameIterator)` that results in `self.start` and `self.end` being None (Issue #5200, PR #5202) @@ -38,13 +45,15 @@ Fixes DSSP by porting upstream PyDSSP 0.9.1 fix (Issue #4913) Enhancements + * Reduces duplication of code in _apply() function (Issue #5247, PR #5294) + * Added new top-level `MDAnalysis.fetch` module (PR #4943) + * Added new function `MDAnalysis.fetch.from_PDB` to download structure files from wwPDB + using `pooch` as optional dependency (Issue #4907, PR #4943) + * Added benchmarks for package.MDAnalysis.analysis.msd.EinsteinMSD (PR #5277) * Improved performance of `AtomGroup.wrap()` with compounds (PR #5220) * Adds support for parsing `.tpr` files produced by GROMACS 2026.0 * Enables parallelization for analysis.diffusionmap.DistanceMatrix (Issue #4679, PR #4745) - * Improve performance of inverse index mapping in AtomGroup by - replacing O(n^2) logic with optimized Cython implementation. - (Issue #3387, PR #5252) Changes * The msd.py inside analysis is changed, and ProgressBar is implemented inside @@ -3615,4 +3624,4 @@ Testsuite licenses 11/12/07 naveen - * prepared for release outside lab + * prepared for release outside lab \ No newline at end of file From 50ac6875bceb07eaba0a5a7b4e1acacc0f3523db Mon Sep 17 00:00:00 2001 From: Ayush Agarwal Date: Fri, 13 Mar 2026 11:36:30 +0530 Subject: [PATCH 11/13] Corrected CHANGELOG --- package/CHANGELOG | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index c12c96d93b8..56fa780cd79 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -21,9 +21,6 @@ The rules for this file: * 2.11.0 Fixes - * Improved performance of inverse index mapping in AtomGroup using an optimized - Cython implementation in lib._cutils.inverse_int_index() - (Issue #3387, PR #5252) * Fixes TypeError with np.int64 indexing in GSD Reader (Issue #5224) * Fixed bug in add_transformations allowing non-callable transformations (Issue #2558, PR #2558) @@ -45,6 +42,9 @@ Fixes DSSP by porting upstream PyDSSP 0.9.1 fix (Issue #4913) Enhancements + * Improved performance of inverse index mapping in AtomGroup using an optimized + Cython implementation in lib._cutils.inverse_int_index() + (Issue #3387, PR #5252) * Reduces duplication of code in _apply() function (Issue #5247, PR #5294) * Added new top-level `MDAnalysis.fetch` module (PR #4943) * Added new function `MDAnalysis.fetch.from_PDB` to download structure files from wwPDB From e1df729f8322a64f3937785456eef616282529ac Mon Sep 17 00:00:00 2001 From: Ayush Agarwal Date: Tue, 24 Mar 2026 22:03:34 +0530 Subject: [PATCH 12/13] Added test for cython function correctness --- testsuite/MDAnalysisTests/lib/test_cutil.py | 50 +++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/testsuite/MDAnalysisTests/lib/test_cutil.py b/testsuite/MDAnalysisTests/lib/test_cutil.py index 47c4d7f905c..f27f925e127 100644 --- a/testsuite/MDAnalysisTests/lib/test_cutil.py +++ b/testsuite/MDAnalysisTests/lib/test_cutil.py @@ -28,6 +28,7 @@ unique_int_1d, find_fragments, _in2d, + inverse_int_index, ) @@ -103,3 +104,52 @@ def test_in2d_VE(arr1, arr2): ValueError, match=r"Both arrays must be \(n, 2\) arrays" ): _in2d(arr1, arr2) + + +def _python_reference_mask(ix, indices): + mask = np.zeros_like(ix) + for i, x in enumerate(indices): + values = np.where(ix == x)[0] + mask[values] = i + return mask + + +@pytest.mark.parametrize( + "ix,indices", + [ + # unsorted and not unique + ( + np.array([1, 5, 3, 3, 6], dtype=np.intp), + np.array([1, 5, 3, 6], dtype=np.intp), + ), + # sorted and not unique + ( + np.array([1, 3, 3, 5, 6], dtype=np.intp), + np.array([1, 3, 5, 6], dtype=np.intp), + ), + # unsorted and unique + ( + np.array([1, 5, 3, 6], dtype=np.intp), + np.array([1, 5, 3, 6], dtype=np.intp), + ), + # sorted and unique + ( + np.array([1, 3, 5, 6], dtype=np.intp), + np.array([1, 3, 5, 6], dtype=np.intp), + ), + # all elements identical + ( + np.array([5, 5, 5], dtype=np.intp), + np.array([5], dtype=np.intp), + ), + # single element + ( + np.array([7], dtype=np.intp), + np.array([7], dtype=np.intp), + ), + ], +) +def test_inverse_int_index(ix, indices): + pyref = _python_reference_mask(ix, indices) + cy = inverse_int_index(ix, indices) + assert_equal(pyref, cy) From fc3f507bf81d73df91a789d632b17f9472d0e8bc Mon Sep 17 00:00:00 2001 From: Ayush Agarwal Date: Wed, 25 Mar 2026 20:58:07 +0530 Subject: [PATCH 13/13] Added standard numpy docstring to inverse_int_index --- package/MDAnalysis/lib/_cutil.pyx | 35 +++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/package/MDAnalysis/lib/_cutil.pyx b/package/MDAnalysis/lib/_cutil.pyx index 7c350c50990..19bc9d354f7 100644 --- a/package/MDAnalysis/lib/_cutil.pyx +++ b/package/MDAnalysis/lib/_cutil.pyx @@ -95,22 +95,45 @@ def unique_int_1d(cnp.intp_t[:] values): @cython.wraparound(False) def inverse_int_index(cnp.intp_t[:] values, cnp.intp_t[:] unique_vals): - """ - Construct inverse index map such that: + r"""Construct an inverse index array (mask) mapping values to unique_vals. - unique_vals[mask] == values + The returned mask contains the indices such that: + + .. math:: + \text{unique\_vals}[\text{mask}] == \text{values} Parameters ---------- values : numpy.ndarray - 1D array of integers. + 1D array of integers (can contain duplicates). unique_vals : numpy.ndarray - 1D array of unique integers (unsorted). + 1D array of unique integers corresponding to the elements in `values`. Returns ------- numpy.ndarray - Integer mask mapping values -> index in unique_vals. + An integer array `mask` of the same length as `values`, where + ``mask[i]`` is the index of ``values[i]`` in `unique_vals`. + + + Notes + ----- + + + .. versionadded:: 2.11.0 + + + Examples + -------- + >>> import numpy as np + >>> from MDAnalysis.lib._cutil import inverse_int_index + >>> vals = np.array([1, 5, 3, 3, 6], dtype=np.intp) + >>> uniq = np.array([1, 5, 3, 6], dtype=np.intp) + >>> mask = inverse_int_index(vals, uniq) + >>> mask + array([0, 1, 2, 2, 3]) + >>> np.all(uniq[mask] == vals) + True """ cdef Py_ssize_t n = values.shape[0]