diff --git a/package/MDAnalysis/core/groups.py b/package/MDAnalysis/core/groups.py index ddcf9b792ee..66414fa8f17 100644 --- a/package/MDAnalysis/core/groups.py +++ b/package/MDAnalysis/core/groups.py @@ -2689,6 +2689,10 @@ def select_atoms(self, sel, *othersel, **selgroups): record_type *record_type* for selecting either ATOM or HETATM from PDB-like files. e.g. ``select_atoms('name CA and not record_type HETATM')`` + smarts *SMARTS-query* + select atoms using Daylight's SMARTS queries, e.g. ``smarts + [#7;R]`` to find nitrogen atoms in rings. Requires RDKit. + All matches (max 1000) are combined as a unique match **Boolean** @@ -2835,6 +2839,8 @@ def select_atoms(self, sel, *othersel, **selgroups): equivalent ``global group`` selection. Removed flags affecting default behaviour for periodic selections; periodic are now on by default (as with default flags) + .. versionchanged:: 2.0.0 + Added the *smarts* selection. """ if not sel: diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py index bdb156ff249..c9de1a25862 100644 --- a/package/MDAnalysis/core/selection.py +++ b/package/MDAnalysis/core/selection.py @@ -600,6 +600,60 @@ def apply(self, group): return group[group.aromaticities].unique +class SmartsSelection(Selection): + """Select atoms based on SMARTS queries. + + Uses RDKit to run the query and converts the result to MDAnalysis. + Supports chirality. + """ + token = 'smarts' + + def __init__(self, parser, tokens): + # The parser will add spaces around parentheses and then split the + # selection based on spaces to create the tokens + # If the input SMARTS query contained parentheses, the query will be + # split because of that and we need to reconstruct it + # We also need to keep the parentheses that are not part of the smarts + # query intact + pattern = [] + counter = {"(": 0, ")": 0} + # loop until keyword but ignore parentheses as a keyword + while tokens[0] in ["(", ")"] or not is_keyword(tokens[0]): + # keep track of the number of open and closed parentheses + if tokens[0] in ["(", ")"]: + counter[tokens[0]] += 1 + # if the char is a closing ")" but there's no corresponding + # open "(" then we've reached then end of the smarts query and + # the current token ")" is part of a grouping parenthesis + if tokens[0] == ")" and counter["("] < (counter[")"]): + break + # add the token to the pattern and remove it from the tokens + val = tokens.popleft() + pattern.append(val) + self.pattern = "".join(pattern) + + def apply(self, group): + try: + from rdkit import Chem + except ImportError: + raise ImportError("RDKit is required for SMARTS-based atom " + "selection but it's not installed. Try " + "installing it with \n" + "conda install -c conda-forge rdkit") + pattern = Chem.MolFromSmarts(self.pattern) + if not pattern: + raise ValueError(f"{self.pattern!r} is not a valid SMARTS query") + mol = group.convert_to("RDKIT") + matches = mol.GetSubstructMatches(pattern, useChirality=True) + # convert rdkit indices to mdanalysis' + indices = [ + mol.GetAtomWithIdx(idx).GetIntProp("_MDAnalysis_index") + for match in matches for idx in match] + # create boolean mask for atoms based on index + mask = np.in1d(range(group.n_atoms), np.unique(indices)) + return group[mask] + + class ResidSelection(Selection): """Select atoms based on numerical fields diff --git a/package/doc/sphinx/source/documentation_pages/selections.rst b/package/doc/sphinx/source/documentation_pages/selections.rst index 80dc3aa8b60..88a06924998 100644 --- a/package/doc/sphinx/source/documentation_pages/selections.rst +++ b/package/doc/sphinx/source/documentation_pages/selections.rst @@ -99,6 +99,12 @@ moltype *molecule-type* select by molecule type, e.g. ``moltype Protein_A``. At the moment, only the TPR format defines the molecule type. +smarts *SMARTS-query* + select atoms using Daylight's SMARTS queries, e.g. ``smarts [#7;R]`` to + find nitrogen atoms in rings. Requires RDKit. All matches (max 1000) are + combined as a unique match. + + Pattern matching ---------------- diff --git a/testsuite/MDAnalysisTests/core/test_atomselections.py b/testsuite/MDAnalysisTests/core/test_atomselections.py index b7d0f515b7f..cee4d63f315 100644 --- a/testsuite/MDAnalysisTests/core/test_atomselections.py +++ b/testsuite/MDAnalysisTests/core/test_atomselections.py @@ -42,6 +42,7 @@ TRZ_psf, TRZ, PDB_icodes, PDB_HOLE, + PDB_helix, ) from MDAnalysisTests import make_Universe @@ -526,17 +527,43 @@ def u(self): u = MDAnalysis.Universe.from_smiles(smi, addHs=False, generate_coordinates=False) return u - + + @pytest.fixture + def u2(self): + u = MDAnalysis.Universe.from_smiles("Nc1cc(C[C@H]([O-])C=O)c[nH]1") + return u + @pytest.mark.parametrize("sel_str, n_atoms", [ ("aromatic", 5), ("not aromatic", 1), ("type N and aromatic", 1), ("type C and aromatic", 4), ]) - def test_selection(self, u, sel_str, n_atoms): + def test_aromatic_selection(self, u, sel_str, n_atoms): sel = u.select_atoms(sel_str) assert sel.n_atoms == n_atoms + @pytest.mark.parametrize("sel_str, indices", [ + ("smarts n", [10]), + ("smarts [#7]", [0, 10]), + ("smarts a", [1, 2, 3, 9, 10]), + ("smarts c", [1, 2, 3, 9]), + ("smarts [*-]", [6]), + ("smarts [$([!#1]);$([!R][R])]", [0, 4]), + ("smarts [$([C@H](-[CH2])(-[O-])-C=O)]", [5]), + ("smarts [$([C@@H](-[CH2])(-[O-])-C=O)]", []), + ("smarts a and type C", [1, 2, 3, 9]), + ("(smarts a) and (type C)", [1, 2, 3, 9]), + ("smarts a and type N", [10]), + ]) + def test_smarts_selection(self, u2, sel_str, indices): + sel = u2.select_atoms(sel_str) + assert_equal(sel.indices, indices) + + def test_invalid_smarts_sel_raises_error(self, u2): + with pytest.raises(ValueError, match="not a valid SMARTS"): + u2.select_atoms("smarts foo") + class TestSelectionsNucleicAcids(object): @pytest.fixture(scope='class')