Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions package/MDAnalysis/core/groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -2689,6 +2689,10 @@ def select_atoms(self, sel, *othersel, **selgroups):
record_type *record_type*
for selecting either ATOM or HETATM from PDB-like files.
e.g. ``select_atoms('name CA and not record_type HETATM')``
smarts *SMARTS-query*
select atoms using Daylight's SMARTS queries, e.g. ``smarts
[#7;R]`` to find nitrogen atoms in rings. Requires RDKit.
All matches (max 1000) are combined as a unique match

**Boolean**

Expand Down Expand Up @@ -2835,6 +2839,8 @@ def select_atoms(self, sel, *othersel, **selgroups):
equivalent ``global group`` selection.
Removed flags affecting default behaviour for periodic selections;
periodic are now on by default (as with default flags)
.. versionchanged:: 2.0.0
Added the *smarts* selection.
"""

if not sel:
Expand Down
54 changes: 54 additions & 0 deletions package/MDAnalysis/core/selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,60 @@ def apply(self, group):
return group[group.aromaticities].unique


class SmartsSelection(Selection):
"""Select atoms based on SMARTS queries.

Uses RDKit to run the query and converts the result to MDAnalysis.
Supports chirality.
"""
token = 'smarts'

def __init__(self, parser, tokens):
# The parser will add spaces around parentheses and then split the
# selection based on spaces to create the tokens
# If the input SMARTS query contained parentheses, the query will be
# split because of that and we need to reconstruct it
# We also need to keep the parentheses that are not part of the smarts
# query intact
pattern = []
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it simpler to enforce no white space allowed in the pattern?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

White spaces are added automatically by the parser around parentheses:
https://github.com/cbouy/mdanalysis/blob/rdkit-converter/package/MDAnalysis/core/selection.py#L1182

counter = {"(": 0, ")": 0}
# loop until keyword but ignore parentheses as a keyword
while tokens[0] in ["(", ")"] or not is_keyword(tokens[0]):
# keep track of the number of open and closed parentheses
if tokens[0] in ["(", ")"]:
counter[tokens[0]] += 1
# if the char is a closing ")" but there's no corresponding
# open "(" then we've reached then end of the smarts query and
# the current token ")" is part of a grouping parenthesis
if tokens[0] == ")" and counter["("] < (counter[")"]):
break
# add the token to the pattern and remove it from the tokens
val = tokens.popleft()
pattern.append(val)
self.pattern = "".join(pattern)

def apply(self, group):
try:
from rdkit import Chem
except ImportError:
raise ImportError("RDKit is required for SMARTS-based atom "
"selection but it's not installed. Try "
"installing it with \n"
"conda install -c conda-forge rdkit")
pattern = Chem.MolFromSmarts(self.pattern)
if not pattern:
raise ValueError(f"{self.pattern!r} is not a valid SMARTS query")
mol = group.convert_to("RDKIT")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be nice to try and cache this in Topology.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The mol (topology, not coordinates) is already cached in the converter so that we don't need to rebuild it for every frame of a trajectory:
https://github.com/cbouy/mdanalysis/blob/rdkit-converter/package/MDAnalysis/coordinates/RDKit.py#L253-L264

matches = mol.GetSubstructMatches(pattern, useChirality=True)
# convert rdkit indices to mdanalysis'
indices = [
mol.GetAtomWithIdx(idx).GetIntProp("_MDAnalysis_index")
for match in matches for idx in match]
# create boolean mask for atoms based on index
mask = np.in1d(range(group.n_atoms), np.unique(indices))
return group[mask]


class ResidSelection(Selection):
"""Select atoms based on numerical fields

Expand Down
6 changes: 6 additions & 0 deletions package/doc/sphinx/source/documentation_pages/selections.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ moltype *molecule-type*
select by molecule type, e.g. ``moltype Protein_A``. At the moment, only
the TPR format defines the molecule type.

smarts *SMARTS-query*
select atoms using Daylight's SMARTS queries, e.g. ``smarts [#7;R]`` to
find nitrogen atoms in rings. Requires RDKit. All matches (max 1000) are
combined as a unique match.


Pattern matching
----------------

Expand Down
31 changes: 29 additions & 2 deletions testsuite/MDAnalysisTests/core/test_atomselections.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
TRZ_psf, TRZ,
PDB_icodes,
PDB_HOLE,
PDB_helix,
)
from MDAnalysisTests import make_Universe

Expand Down Expand Up @@ -526,17 +527,43 @@ def u(self):
u = MDAnalysis.Universe.from_smiles(smi, addHs=False,
generate_coordinates=False)
return u


@pytest.fixture
def u2(self):
u = MDAnalysis.Universe.from_smiles("Nc1cc(C[C@H]([O-])C=O)c[nH]1")
return u

@pytest.mark.parametrize("sel_str, n_atoms", [
("aromatic", 5),
("not aromatic", 1),
("type N and aromatic", 1),
("type C and aromatic", 4),
])
def test_selection(self, u, sel_str, n_atoms):
def test_aromatic_selection(self, u, sel_str, n_atoms):
sel = u.select_atoms(sel_str)
assert sel.n_atoms == n_atoms

@pytest.mark.parametrize("sel_str, indices", [
("smarts n", [10]),
("smarts [#7]", [0, 10]),
("smarts a", [1, 2, 3, 9, 10]),
("smarts c", [1, 2, 3, 9]),
("smarts [*-]", [6]),
("smarts [$([!#1]);$([!R][R])]", [0, 4]),
("smarts [$([C@H](-[CH2])(-[O-])-C=O)]", [5]),
("smarts [$([C@@H](-[CH2])(-[O-])-C=O)]", []),
("smarts a and type C", [1, 2, 3, 9]),
("(smarts a) and (type C)", [1, 2, 3, 9]),
("smarts a and type N", [10]),
])
def test_smarts_selection(self, u2, sel_str, indices):
sel = u2.select_atoms(sel_str)
assert_equal(sel.indices, indices)

def test_invalid_smarts_sel_raises_error(self, u2):
with pytest.raises(ValueError, match="not a valid SMARTS"):
u2.select_atoms("smarts foo")


class TestSelectionsNucleicAcids(object):
@pytest.fixture(scope='class')
Expand Down