1+ {
2+ "nbformat" : 4 ,
3+ "nbformat_minor" : 0 ,
4+ "metadata" : {
5+ "colab" : {
6+ "provenance" : []
7+ },
8+ "kernelspec" : {
9+ "name" : " python3" ,
10+ "display_name" : " Python 3"
11+ },
12+ "language_info" : {
13+ "name" : " python"
14+ }
15+ },
16+ "cells" : [
17+ {
18+ "cell_type" : " markdown" ,
19+ "source" : [
20+ " # ModLoop\n " ,
21+ " \n " ,
22+ " ModLoop is a protocol for automated modeling of loops in protein structures. The server relies on the loop modeling routine in [MODELLER](https://salilab.org/modeller/) that predicts the loop conformations by satisfaction of spatial restraints, without relying on a database of known protein structures.\n " ,
23+ " \n " ,
24+ " The ModLoop protocol can be run in Google Colab. The first step is to install the Modeller software by running the cell below:"
25+ ],
26+ "metadata" : {
27+ "id" : " NtLLmo9CCCOA"
28+ }
29+ },
30+ {
31+ "cell_type" : " code" ,
32+ "execution_count" : null ,
33+ "metadata" : {
34+ "id" : " 5uAWPuZzXlUq"
35+ },
36+ "outputs" : [],
37+ "source" : [
38+ " # Install Modeller from Sali lab website\n " ,
39+ " modver = \" 10.8\"\n " ,
40+ " !wget \" https://salilab.org/modeller/{modver}/modeller_{modver}-1_amd64.deb\"\n " ,
41+ " !apt install \" ./modeller_{modver}-1_amd64.deb\"\n " ,
42+ " !rm \" modeller_{modver}-1_amd64.deb\"\n " ,
43+ " # Add Modeller Python modules to Colab's Python path\n " ,
44+ " import sys\n " ,
45+ " sys.path.append(\" /usr/lib/python3.9/dist-packages\" )"
46+ ]
47+ },
48+ {
49+ "cell_type" : " markdown" ,
50+ "source" : [
51+ " Next, configure the protocol by running the following cell and giving the [MODELLER license key](https://modbase.compbio.ucsf.edu/modloop/help#modkey), uploading the [starting structure](https://modbase.compbio.ucsf.edu/modloop/help#file) in PDB or mmCIF format, and specifying the [loops to refine](https://modbase.compbio.ucsf.edu/modloop/help#loop):"
52+ ],
53+ "metadata" : {
54+ "id" : " 8G2BFSeTDSy9"
55+ }
56+ },
57+ {
58+ "cell_type" : " code" ,
59+ "source" : [
# @title
import ipywidgets as widgets
from ipywidgets import GridspecLayout

# Input widgets; later cells read key.value, coord.value and loops.value.
key = widgets.Text()
coord = widgets.FileUpload(multiple=False)
loops = widgets.Textarea()

# Three-row, two-column form: caption on the left, input widget on the right.
grid = GridspecLayout(3, 2)
grid[0, 0], grid[0, 1] = widgets.Label("Modeller license key"), key
grid[1, 0], grid[1, 1] = widgets.Label("Upload coordinate file"), coord
grid[2, 0], grid[2, 1] = widgets.Label("Enter loop segments"), loops

# Leaving the grid as the last expression makes the notebook display the form.
grid
71+ ],
72+ "metadata" : {
73+ "cellView" : " form" ,
74+ "id" : " -nW-3oUNYadt"
75+ },
76+ "execution_count" : null ,
77+ "outputs" : []
78+ },
79+ {
80+ "cell_type" : " markdown" ,
81+ "source" : [
82+ " We use the input data to set up the protocol:"
83+ ],
84+ "metadata" : {
85+ "id" : " 1atrM6BBEt6o"
86+ }
87+ },
88+ {
89+ "cell_type" : " code" ,
90+ "source" : [
# Add Modeller license key
# Only the first line of the installed config.py is kept (read into inst_dir);
# the file is then rewritten as that line followed by the license entered in
# the form, so re-running this cell does not pile up "license" lines.
config_path = f"/usr/lib/modeller{modver}/modlib/modeller/config.py"
with open(config_path) as fh:
    inst_dir = fh.readline()
with open(config_path, "w") as fh:
    fh.write(inst_dir)
    fh.write(f'license = {key.value!r}\n')

# Save uploaded file to local disk
uploads = coord.value
if not uploads:
    # Without this check an empty upload surfaced as an unhelpful IndexError.
    raise ValueError(
        "No coordinate file has been uploaded; use the upload button in "
        "the form above and then re-run this cell")
if isinstance(uploads, dict):
    # ipywidgets 7: a dict keyed by file name, each value holding 'content'
    in_fname, upload = next(iter(uploads.items()))
else:
    # ipywidgets 8: a tuple of records carrying 'name' and 'content'
    upload = uploads[0]
    in_fname = upload['name']
# in_fname is used by later cells as the input model for loop refinement.
with open(in_fname, 'wb') as fh:
    fh.write(upload['content'])
102+ " \n " ,
def parse_loop_selection(loops):
    """Validate a loop selection string and split it into its fields.

    The text is upper-cased, stripped of all whitespace and split on
    colons.  Every loop occupies four consecutive fields (start residue,
    start chain, end residue, end chain) and each field, the last one
    included, must be followed by a colon.  An empty chain ID ("::") is
    turned into a single space.

    Returns the flat list of field strings.

    Raises ValueError when the number of fields is not a multiple of four,
    a residue number is not an integer, a loop starts and ends in different
    chains, ends before it starts, has an end residue more than 20 past its
    start, or when the loops together select no residues or more than 20.
    """
    import re

    # capitalize and remove spaces
    compact = re.sub(r'\s+', '', loops.upper())
    # replace null chain IDs with a single space
    compact = compact.replace("::", ": :")

    # Whatever follows the final colon is dropped here.
    loop_data = compact.split(":")[:-1]

    # Make sure correct number of colons were given
    if len(loop_data) % 4:
        raise ValueError(
            "Syntax error in loop selection: check to make sure you "
            "have colons in the correct place (there should be a "
            "multiple of 4 colons)")

    total_res = 0
    for first in range(0, len(loop_data), 4):
        start_txt, start_id, end_txt, end_id = loop_data[first:first + 4]
        # An empty start residue ends the scan of the selection
        if start_txt == "":
            break
        try:
            start_res = int(start_txt)
            end_res = int(end_txt)
        except ValueError:
            raise ValueError(
                "Residue indices are not numeric")
        span = end_res - start_res
        # all the selected residues
        total_res += span + 1

        # too long, reversed or cross-chain loops rejected
        if span > 20 or start_id != end_id or span < 0:
            raise ValueError(
                "The loop selected is too long (>20 residues) or "
                "shorter than 1 residue or not selected properly "
                "(syntax problem?) "
                "starting position %d:%s, ending position: %d:%s"
                % (start_res, start_id, end_res, end_id))

    # too many or no residues rejected
    if total_res > 20:
        raise ValueError(
            "Too many loop residues have been selected "
            "(selected: %d > limit:20)!" % total_res)
    if total_res <= 0:
        raise ValueError(
            "No loop residues selected!")
    return loop_data
161+ " \n " ,
def get_output_header(loop_data, nmodel):
    """Build the explanatory text placed at the top of an output model file.

    loop_data is the flat list of loop fields, four per loop; each group of
    four is listed on its own line as "start:chain-end:chain".  nmodel is
    the number of models sampled and is quoted in the text.

    Returns the header as a single multi-line string.
    """
    looplist = "\n".join(
        "%s:%s-%s:%s" % tuple(loop_data[pos:pos + 4])
        for pos in range(0, len(loop_data), 4))
    return f"""
Dear User,

Coordinates for the lowest energy model (out of {nmodel} sampled)
are returned with the optimized loop regions, listed below:
{looplist}

for references please cite these two articles:

A Fiser, RKG Do and A Sali,
Modeling of loops in protein structures
Prot. Sci. (2000) 9, 1753-1773

A Fiser and A Sali,
ModLoop: Automated modeling of loops in protein structures
Bioinformatics. (2003) 18(19) 2500-01


For further inquiries, please contact: modloop@ucsf.edu

with best regards,
Andras Fiser


"""
193+ " \n " ,
def add_loop_header(model, loop_data, nmodel):
    """Prepend the ModLoop header to a model file, rewriting it in place.

    model is the path of the PDB or mmCIF file.  Every header line is
    written as a comment: the prefix is "#" when the file name ends in
    ".cif" and "REMARK" otherwise, and an empty header line becomes the
    bare prefix.  The file's previous contents follow unchanged.

    loop_data and nmodel are passed straight to get_output_header().
    """
    with open(model) as fin:
        contents = fin.read()
    prefix = '#' if model.endswith('.cif') else 'REMARK'
    # Build the header before reopening the file for writing: opening with
    # 'w' truncates it, so a failure while generating the header would
    # otherwise leave the model file empty.
    header_lines = get_output_header(loop_data, nmodel).split('\n')
    with open(model, 'w') as fout:
        for line in header_lines:
            if line == '':
                fout.write(prefix + '\n')
            else:
                fout.write(f'{prefix} {line}\n')
        fout.write(contents)
206+ " \n " ,
207+ " # Check the provided set of loop residues to refine\n " ,
208+ " loop_data = parse_loop_selection(loops.value)"
209+ ],
210+ "metadata" : {
211+ "id" : " SMGFauYYZlu6"
212+ },
213+ "execution_count" : null ,
214+ "outputs" : []
215+ },
216+ {
217+ "cell_type" : " markdown" ,
218+ "source" : [
219+ " The loop modeling protocol itself is just a short Python script that runs MODELLER on the input file uploaded earlier, selecting the loop residues given:"
220+ ],
221+ "metadata" : {
222+ "id" : " hek0hWnsFJgR"
223+ }
224+ },
225+ {
226+ "cell_type" : " code" ,
227+ "source" : [
228+ " from modeller import Environ, Selection, ModellerError\n " ,
229+ " from modeller.automodel import LoopModel, refine\n " ,
230+ " import sys\n " ,
231+ " \n " ,
class MyLoop(LoopModel):
    """LoopModel whose loop selection comes from the notebook's loop_data."""

    def select_loop_atoms(self):
        """Return a Selection built from every loop listed in loop_data.

        loop_data holds four fields per loop; the first two and the last
        two are each joined with a colon to name the start and end of a
        residue range.  Raises ModellerError as soon as one range has a
        length above 30, or if only_no_topology() on the combined
        selection is non-empty.
        """
        segments = []
        for pos in range(0, len(loop_data), 4):
            first = "%s:%s" % tuple(loop_data[pos:pos + 2])
            last = "%s:%s" % tuple(loop_data[pos + 2:pos + 4])
            segment = self.residue_range(first, last)
            segments.append(segment)
            if len(segment) > 30:
                raise ModellerError("loop too long")
        selected = Selection(segments)
        if len(selected.only_no_topology()) > 0:
            raise ModellerError("some selected residues have no topology")
        return selected
244+ " \n " ,
def make_loop(taskid):
    """Build loop model number taskid and return its output record.

    While the model is built, sys.stdout is pointed at "<taskid>.log" and
    is restored afterwards, whether or not the build succeeds.  The random
    seed is computed from taskid, and taskid is also used as both the
    starting and ending loop model number, so one call builds one model.
    The output format follows the input: mmCIF when in_fname ends in
    ".cif", PDB otherwise.

    Returns the first entry of m.loop.outputs; the cell that runs the
    protocol reads its 'molpdf' and 'name' keys.
    """
    logfile = f'{taskid}.log'
    print(f'Logging output to {logfile}')
    old_sys_stdout = sys.stdout
    # The with-block closes (and so flushes) the log when the task ends;
    # previously the file object assigned to sys.stdout was never closed.
    with open(logfile, 'w') as log_fh:
        try:
            sys.stdout = log_fh
            env = Environ(rand_seed=-1000-taskid)
            m = MyLoop(env, inimodel=in_fname, sequence='loop')
            if in_fname.endswith('.cif'):
                m.set_output_model_format('MMCIF')
            else:
                m.set_output_model_format('PDB')
            m.loop.md_level = refine.slow
            m.loop.starting_model = m.loop.ending_model = taskid
            m.make()
            return m.loop.outputs[0]
        finally:
            sys.stdout = old_sys_stdout
263+ ],
264+ "metadata" : {
265+ "id" : " gCfs5jsRhXPo"
266+ },
267+ "execution_count" : null ,
268+ "outputs" : []
269+ },
270+ {
271+ "cell_type" : " markdown" ,
272+ "source" : [
273+ " Finally, we can run the protocol in parallel. The exact same protocol is run 300 times on the same inputs, each time with a different random seed. (This will run faster if given a CPU with more cores.) The single structure with the lowest molecular PDF (molpdf, the value of the MODELLER objective function) is then selected."
274+ ],
275+ "metadata" : {
276+ "id" : " 4n58sMJMFVtL"
277+ }
278+ },
279+ {
280+ "cell_type" : " code" ,
281+ "source" : [
import multiprocessing
import operator

# Number of independent loop models to sample; one make_loop task per model.
nmodel = 300
with multiprocessing.Pool() as pool:
    results = list(pool.imap_unordered(make_loop, range(1, nmodel+1)))

# Rank only the models that were built successfully.
# NOTE(review): this follows the pattern in the Modeller manual of filtering
# output records on their 'failure' key before ranking; records of failed
# models are presumed to lack a usable 'molpdf' -- confirm against Modeller.
ok_models = [result for result in results if result.get('failure') is None]
if len(ok_models) < len(results):
    print(f"{len(results) - len(ok_models)} of {len(results)} models failed "
          f"and were skipped")
if not ok_models:
    raise RuntimeError(
        "None of the loop models could be built; see the .log files "
        "for details")
best_model = min(ok_models, key=operator.itemgetter('molpdf'))
print(f"Best model is {best_model['name']}")

# Add an informative ModLoop header to the best model PDB/mmCIF file
add_loop_header(best_model['name'], loop_data, nmodel)
293+ ],
294+ "metadata" : {
295+ "collapsed" : true ,
296+ "id" : " 6lejFefP0w-H"
297+ },
298+ "execution_count" : null ,
299+ "outputs" : []
300+ },
301+ {
302+ "cell_type" : " markdown" ,
303+ "source" : [
304+ " Run the cell below to download the selected structure:"
305+ ],
306+ "metadata" : {
307+ "id" : " 0vpeZJEBGHtu"
308+ }
309+ },
310+ {
311+ "cell_type" : " code" ,
312+ "source" : [
313+ " from google.colab import files\n " ,
314+ " files.download(best_model['name'])\n "
315+ ],
316+ "metadata" : {
317+ "id" : " 9_sIwcR69lww"
318+ },
319+ "execution_count" : null ,
320+ "outputs" : []
321+ }
322+ ]
323+ }
0 commit comments