-
Notifications
You must be signed in to change notification settings - Fork 4
/
_md_compute_contacts.py
331 lines (301 loc) · 14.8 KB
/
_md_compute_contacts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
##############################################################################
# This file is part of mdciao.
#
# Copyright 2024 Charité Universitätsmedizin Berlin and the Authors
#
# Authors: Guillermo Pérez-Hernandez
# Contributors:
#
# mdciao is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# mdciao is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with mdciao. If not, see <https://www.gnu.org/licenses/>.
##############################################################################
##############################################################################
# Please note: The following method "compute_contacts" is a
# modified version of the one found in the original MDTraj
# Python Library, whose original authors and copyright
# holders are listed below.
#
# The modifications consist in:
# 1. including the indices of the closest atom-pairs in the returned values.
# 2. adapting 'sidechain' and 'sidechain-heavy' schemes for the cases of hydrogen-less glycines
# and non-protein residues.
#
# The modified lines are in the documentation for the returned value `atom_pairs`
# and the 'sidechain' and 'sidechain-heavy' arguments, and elsewhere marked with the comment
# '#mdciao' in the file.
#
# The modifications were applied on mdtraj release v1.10.0rc1
# commit 4c9bb6e8bc7d6e86890ff0d57814c3c76f7cf792
##############################################################################
# MDTraj: A Python Library for Loading, Saving, and Manipulating
# Molecular Dynamics Trajectories.
# Copyright 2012-2013 Stanford University and the Authors
#
# Authors: Christian Schwantes
# Contributors: Robert McGibbon
#
# MDTraj is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 2.1
# of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with MDTraj. If not, see <http://www.gnu.org/licenses/>.
##############################################################################
# opened an issue for this:
# https://github.com/mdtraj/mdtraj/issues/1569
# Further reading
# https://www.oreilly.com/library/view/understanding-open-source/0596005814/ch03.html
# https://tldrlegal.com/license/gnu-lesser-general-public-license-v3-(lgpl-3)
# http://oss-watch.ac.uk/resources/lgpl
#https://www.gnu.org/licenses/gpl-faq.html#AllCompatibility
import itertools
import numpy as np
import mdtraj as md
from mdtraj.core import element
from mdtraj.utils import ensure_type
from mdciao.utils.residue_and_atom import _residue_sidechain_membership #mdciao
def compute_contacts(
traj,
contacts="all",
scheme="closest-heavy",
ignore_nonprotein=True,
periodic=True,
soft_min=False,
soft_min_beta=20,
):
"""Compute the distance between pairs of residues in a trajectory.
Parameters
----------
traj : md.Trajectory
An mdtraj trajectory. It must contain topology information.
contacts : array-like, ndim=2 or 'all'
An array containing pairs of indices (0-indexed) of residues to
compute the contacts between, or 'all'. The string 'all' will
select all pairs of residues separated by two or more residues
(i.e. the i to i+1 and i to i+2 pairs will be excluded).
scheme : {'ca', 'closest', 'closest-heavy', 'sidechain', 'sidechain-heavy'}
scheme to determine the distance between two residues:
'ca' : distance between two residues is given by the distance
between their alpha carbons
'closest' : distance is the closest distance between any
two atoms in the residues
'closest-heavy' : distance is the closest distance between
any two non-hydrogen atoms in the residues
'sidechain' : distance is the closest distance between any
two atoms in residue sidechains. For glycine,
'sidechain' tries to use sidechain hydrogens first
and if none are present fall back to any atom of the glycine.
'sidechain-heavy' : distance is the closest distance between
any two non-hydrogen atoms in residue sidechains
For glycine, 'sidechain-heavy' tries to fall back
first to sidechain hydrogens and if none are present
to any atom of the glycine.
ignore_nonprotein : bool
When using `contact==all`, don't compute contacts between
"residues" which are not protein (i.e. do not contain an alpha
carbon).
periodic : bool, default=True
If periodic is True and the trajectory contains unitcell information,
we will compute distances under the minimum image convention.
soft_min : bool, default=False
If soft_min is true, we will use a diffrentiable version of
the scheme. The exact expression used
is d = \frac{\beta}{log\\sum_i{exp(\frac{\beta}{d_i}})} where
beta is user parameter which defaults to 20nm. The expression
we use is copied from the plumed mindist calculator.
http://plumed.github.io/doc-v2.0/user-doc/html/mindist.html
soft_min_beta : float, default=20nm
The value of beta to use for the soft_min distance option.
Very large values might cause small contact distances to go to 0.
Returns
-------
distances : np.ndarray, shape=(n_frames, n_pairs), dtype=np.float32
Distances for each residue-residue contact in each frame
of the trajectory
residue_pairs : np.ndarray, shape=(n_pairs, 2), dtype=int
Each row of this return value gives the indices of the residues
involved in the contact. This argument mirrors the `contacts` input
parameter. When `all` is specified as input, this return value
gives the actual residue pairs resolved from `all`. Furthermore,
when scheme=='ca', any contact pair supplied as input corresponding
to a residue without an alpha carbon (e.g. HOH) is ignored from the
input contacts list, meanings that the indexing of the
output `distances` may not match up with the indexing of the input
`contacts`. But the indexing of `distances` *will* match up with
the indexing of `residue_pairs`
atom_pairs : np.ndarray, shape=(n_pairs, 2), dtype=int
Each row of this return value gives the indices of the atoms
involved in the contact
Examples
--------
>>> # To compute the contact distance between residue 0 and 10 and
>>> # residues 0 and 11
>>> md.compute_contacts(t, [[0, 10], [0, 11]])
>>> # the itertools library can be useful to generate the arrays of indices
>>> group_1 = [0, 1, 2]
>>> group_2 = [10, 11]
>>> pairs = list(itertools.product(group_1, group_2))
>>> print(pairs)
[(0, 10), (0, 11), (1, 10), (1, 11), (2, 10), (2, 11)]
>>> md.compute_contacts(t, pairs)
See Also
--------
mdtraj.geometry.squareform : turn the result from this function
into a square "contact map"
Topology.residue : Get residues from the topology by index
"""
if traj.topology is None:
raise ValueError("contact calculation requires a topology")
if isinstance(contacts, str):
if contacts.lower() != "all":
raise ValueError(
"(%s) is not a valid contacts specifier" % contacts.lower(),
)
residue_pairs = []
for i in range(traj.n_residues):
residue_i = traj.topology.residue(i)
if ignore_nonprotein and not any(a for a in residue_i.atoms if a.name.lower() == "ca"):
continue
for j in range(i + 3, traj.n_residues):
residue_j = traj.topology.residue(j)
if ignore_nonprotein and not any(a for a in residue_j.atoms if a.name.lower() == "ca"):
continue
if residue_i.chain == residue_j.chain:
residue_pairs.append((i, j))
residue_pairs = np.array(residue_pairs)
if len(residue_pairs) == 0:
raise ValueError("No acceptable residue pairs found")
else:
residue_pairs = ensure_type(
np.asarray(contacts),
dtype=int,
ndim=2,
name="contacts",
shape=(None, 2),
warn_on_cast=False,
)
if not np.all((residue_pairs >= 0) * (residue_pairs < traj.n_residues)):
raise ValueError(
"contacts requests a residue that is not in the permitted range",
)
# now the bulk of the function. This will calculate atom distances and then
# re-work them in the required scheme to get residue distances
scheme = scheme.lower()
if scheme not in ["ca", "closest", "closest-heavy", "sidechain", "sidechain-heavy"]:
raise ValueError(
"scheme must be one of [ca, closest, closest-heavy, sidechain, sidechain-heavy]",
)
if scheme == "ca":
if soft_min:
import warnings
warnings.warn(
"The soft_min=True option with scheme=ca gives" "the same results as soft_min=False",
)
filtered_residue_pairs = []
atom_pairs = []
for r0, r1 in residue_pairs:
ca_atoms_0 = [a.index for a in traj.top.residue(r0).atoms if a.name.lower() == "ca"]
ca_atoms_1 = [a.index for a in traj.top.residue(r1).atoms if a.name.lower() == "ca"]
if len(ca_atoms_0) == 1 and len(ca_atoms_1) == 1:
atom_pairs.append((ca_atoms_0[0], ca_atoms_1[0]))
filtered_residue_pairs.append((r0, r1))
elif len(ca_atoms_0) == 0 or len(ca_atoms_1) == 0:
# residue does not contain a CA atom, skip it
if contacts != "all":
# if the user manually asked for this residue, and didn't use "all"
import warnings
warnings.warn(
"Ignoring contacts pair %d-%d. No alpha carbon." % (r0, r1),
)
else:
raise ValueError(
"More than 1 alpha carbon detected in residue %d or %d" % (r0, r1),
)
residue_pairs = np.array(filtered_residue_pairs)
distances = md.compute_distances(traj, atom_pairs, periodic=periodic)
aa_pairs = [[pair] * traj.n_frames for pair in atom_pairs]
elif scheme in ["closest", "closest-heavy", "sidechain", "sidechain-heavy"]:
if scheme == "closest":
residue_membership = [[atom.index for atom in residue.atoms] for residue in traj.topology.residues]
elif scheme == "closest-heavy":
# then remove the hydrogens from the above list
residue_membership = [
[atom.index for atom in residue.atoms if not (atom.element == element.hydrogen)]
for residue in traj.topology.residues
]
elif scheme == "sidechain":
if "GLY" in [residue.name for residue in traj.topology.residues]: #mdciao
import warnings #mdciao
warnings.warn( #mdciao
"selected topology includes at least one glycine residue. Where no sidechain hydrogen " #mdciao
"is present, the distances involving glycine residues will be " #mdciao
"computed using the all glycine atoms." # mdciao
) #mdciao
residue_membership = [_residue_sidechain_membership(scheme,residue) for residue in traj.topology.residues]
elif scheme == "sidechain-heavy":
# then remove the hydrogens from the above list
if "GLY" in [residue.name for residue in traj.topology.residues]:
import warnings
warnings.warn(
"selected topology includes at least one glycine residue, which has no heavy "
"atoms in its sidechain. The distances involving glycine residues will be "
"computed using the sidechain hydrogen instead. If no sidechain hydrogen is present," #mdciao
" all glycine atoms will be used." #mdciao
)
residue_membership = [_residue_sidechain_membership(scheme, residue) for residue in traj.topology.residues] #mdciao
residue_lens = [len(ainds) for ainds in residue_membership]
atom_pairs = []
n_atom_pairs_per_residue_pair = []
for pair in residue_pairs:
atom_pairs.extend(
list(
itertools.product(
residue_membership[pair[0]],
residue_membership[pair[1]],
),
),
)
n_atom_pairs_per_residue_pair.append(
residue_lens[pair[0]] * residue_lens[pair[1]],
)
atom_distances = md.compute_distances(traj, atom_pairs, periodic=periodic)
# now squash the results based on residue membership
n_residue_pairs = len(residue_pairs)
distances = np.zeros((len(traj), n_residue_pairs), dtype=np.float32)
n_atom_pairs_per_residue_pair = np.asarray(n_atom_pairs_per_residue_pair)
aa_pairs = [] #mdciao
for i in range(n_residue_pairs):
index = int(np.sum(n_atom_pairs_per_residue_pair[:i]))
n = n_atom_pairs_per_residue_pair[i]
idx_min = atom_distances[:, index: index + n].argmin(axis=1) #mdciao
aa_pairs.append(np.array(atom_pairs[index: index + n])[idx_min]) #mdciao
if not soft_min:
distances[:, i] = atom_distances[:, index : index + n].min(axis=1)
else:
distances[:, i] = soft_min_beta / np.log(
np.sum(
np.exp(
soft_min_beta / atom_distances[:, index : index + n],
),
axis=1,
),
)
else:
raise ValueError("This is not supposed to happen!")
return distances, residue_pairs, np.hstack(aa_pairs) #mdciao