Make PBC type enumeration into PbcType enum class
[gromacs.git] / src / gromacs / gmxpreprocess / pdb2gmx.cpp
blobf3885d21edd3934862df84308a7384f2ea8959fc
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
8 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9 * and including many others, as listed in the AUTHORS file in the
10 * top-level source directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
38 #include "gmxpre.h"
40 #include "pdb2gmx.h"
42 #include <cctype>
43 #include <cstdlib>
44 #include <cstring>
45 #include <ctime>
47 #include <algorithm>
48 #include <memory>
49 #include <string>
50 #include <vector>
52 #include "gromacs/commandline/cmdlineoptionsmodule.h"
53 #include "gromacs/fileio/confio.h"
54 #include "gromacs/fileio/filetypes.h"
55 #include "gromacs/fileio/gmxfio.h"
56 #include "gromacs/fileio/pdbio.h"
57 #include "gromacs/gmxlib/conformation_utilities.h"
58 #include "gromacs/gmxpreprocess/fflibutil.h"
59 #include "gromacs/gmxpreprocess/genhydro.h"
60 #include "gromacs/gmxpreprocess/gpp_atomtype.h"
61 #include "gromacs/gmxpreprocess/grompp_impl.h"
62 #include "gromacs/gmxpreprocess/h_db.h"
63 #include "gromacs/gmxpreprocess/hizzie.h"
64 #include "gromacs/gmxpreprocess/pdb2top.h"
65 #include "gromacs/gmxpreprocess/pgutil.h"
66 #include "gromacs/gmxpreprocess/specbond.h"
67 #include "gromacs/gmxpreprocess/ter_db.h"
68 #include "gromacs/gmxpreprocess/toputil.h"
69 #include "gromacs/gmxpreprocess/xlate.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/options/basicoptions.h"
72 #include "gromacs/options/filenameoption.h"
73 #include "gromacs/options/ioptionscontainer.h"
74 #include "gromacs/topology/atomprop.h"
75 #include "gromacs/topology/block.h"
76 #include "gromacs/topology/index.h"
77 #include "gromacs/topology/residuetypes.h"
78 #include "gromacs/topology/symtab.h"
79 #include "gromacs/topology/topology.h"
80 #include "gromacs/utility/dir_separator.h"
81 #include "gromacs/utility/exceptions.h"
82 #include "gromacs/utility/fatalerror.h"
83 #include "gromacs/utility/path.h"
84 #include "gromacs/utility/smalloc.h"
85 #include "gromacs/utility/strdb.h"
86 #include "gromacs/utility/stringutil.h"
88 #include "hackblock.h"
89 #include "resall.h"
91 struct RtpRename
93 RtpRename(const char* newGmx, const char* newMain, const char* newNter, const char* newCter, const char* newBter) :
94 gmx(newGmx),
95 main(newMain),
96 nter(newNter),
97 cter(newCter),
98 bter(newBter)
101 std::string gmx;
102 std::string main;
103 std::string nter;
104 std::string cter;
105 std::string bter;
108 static const char* res2bb_notermini(const std::string& name, gmx::ArrayRef<const RtpRename> rr)
110 /* NOTE: This function returns the main building block name,
111 * it does not take terminal renaming into account.
113 auto found = std::find_if(rr.begin(), rr.end(), [&name](const auto& rename) {
114 return gmx::equalCaseInsensitive(name, rename.gmx);
116 return found != rr.end() ? found->main.c_str() : name.c_str();
119 static const char* select_res(int nr,
120 int resnr,
121 const char* name[],
122 const char* expl[],
123 const char* title,
124 gmx::ArrayRef<const RtpRename> rr)
126 printf("Which %s type do you want for residue %d\n", title, resnr + 1);
127 for (int sel = 0; (sel < nr); sel++)
129 printf("%d. %s (%s)\n", sel, expl[sel], res2bb_notermini(name[sel], rr));
131 printf("\nType a number:");
132 fflush(stdout);
134 int userSelection;
135 if (scanf("%d", &userSelection) != 1)
137 gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr + 1);
140 return name[userSelection];
143 static const char* get_asptp(int resnr, gmx::ArrayRef<const RtpRename> rr)
145 enum
147 easp,
148 easpH,
149 easpNR
151 const char* lh[easpNR] = { "ASP", "ASPH" };
152 const char* expl[easpNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
154 return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", rr);
157 static const char* get_glutp(int resnr, gmx::ArrayRef<const RtpRename> rr)
159 enum
161 eglu,
162 egluH,
163 egluNR
165 const char* lh[egluNR] = { "GLU", "GLUH" };
166 const char* expl[egluNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
168 return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", rr);
171 static const char* get_glntp(int resnr, gmx::ArrayRef<const RtpRename> rr)
173 enum
175 egln,
176 eglnH,
177 eglnNR
179 const char* lh[eglnNR] = { "GLN", "QLN" };
180 const char* expl[eglnNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
182 return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", rr);
185 static const char* get_lystp(int resnr, gmx::ArrayRef<const RtpRename> rr)
187 enum
189 elys,
190 elysH,
191 elysNR
193 const char* lh[elysNR] = { "LYSN", "LYS" };
194 const char* expl[elysNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
196 return select_res(elysNR, resnr, lh, expl, "LYSINE", rr);
199 static const char* get_argtp(int resnr, gmx::ArrayRef<const RtpRename> rr)
201 enum
203 earg,
204 eargH,
205 eargNR
207 const char* lh[eargNR] = { "ARGN", "ARG" };
208 const char* expl[eargNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
210 return select_res(eargNR, resnr, lh, expl, "ARGININE", rr);
213 static const char* get_histp(int resnr, gmx::ArrayRef<const RtpRename> rr)
215 const char* expl[ehisNR] = { "H on ND1 only", "H on NE2 only", "H on ND1 and NE2",
216 "Coupled to Heme" };
218 return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", rr);
221 static void read_rtprename(const char* fname, FILE* fp, std::vector<RtpRename>* rtprename)
223 char line[STRLEN], buf[STRLEN];
225 int ncol = 0;
226 while (get_a_line(fp, line, STRLEN))
228 /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
229 * For other args, we read up to 6 chars (so we can detect if the length is > 5).
230 * Note that the buffer length has been increased to 7 to allow this,
231 * so we just need to make sure the strings have zero-length initially.
233 char gmx[STRLEN];
234 char main[STRLEN];
235 char nter[STRLEN];
236 char cter[STRLEN];
237 char bter[STRLEN];
238 gmx[0] = '\0';
239 main[0] = '\0';
240 nter[0] = '\0';
241 cter[0] = '\0';
242 bter[0] = '\0';
243 int nc = sscanf(line, "%6s %6s %6s %6s %6s %s", gmx, main, nter, cter, bter, buf);
244 RtpRename newEntry(gmx, main, nter, cter, bter);
245 if (ncol == 0)
247 if (nc != 2 && nc != 5)
249 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d or %d",
250 fname, ncol, 2, 5);
252 ncol = nc;
254 else if (nc != ncol)
256 gmx_fatal(FARGS,
257 "A line in residue renaming database '%s' has %d columns, while previous "
258 "lines have %d columns",
259 fname, nc, ncol);
262 if (nc == 2)
264 /* This file does not have special termini names, copy them from main */
265 newEntry.nter = newEntry.main;
266 newEntry.cter = newEntry.main;
267 newEntry.bter = newEntry.main;
269 rtprename->push_back(newEntry);
273 static std::string search_resrename(gmx::ArrayRef<const RtpRename> rr,
274 const char* name,
275 bool bStart,
276 bool bEnd,
277 bool bCompareFFRTPname)
279 auto found = std::find_if(rr.begin(), rr.end(), [&name, &bCompareFFRTPname](const auto& rename) {
280 return ((!bCompareFFRTPname && (name == rename.gmx))
281 || (bCompareFFRTPname && (name == rename.main)));
284 std::string newName;
285 /* If found in the database, rename this residue's rtp building block,
286 * otherwise keep the old name.
288 if (found != rr.end())
290 if (bStart && bEnd)
292 newName = found->bter;
294 else if (bStart)
296 newName = found->nter;
298 else if (bEnd)
300 newName = found->cter;
302 else
304 newName = found->main;
307 if (newName[0] == '-')
309 gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name,
310 bStart ? (bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus")
311 : (bEnd ? " as an ending terminus" : ""));
315 return newName;
318 static void rename_resrtp(t_atoms* pdba,
319 int nterpairs,
320 gmx::ArrayRef<const int> r_start,
321 gmx::ArrayRef<const int> r_end,
322 gmx::ArrayRef<const RtpRename> rr,
323 t_symtab* symtab,
324 bool bVerbose)
326 bool bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
328 for (int r = 0; r < pdba->nres; r++)
330 bool bStart = false;
331 bool bEnd = false;
332 for (int j = 0; j < nterpairs; j++)
334 if (r == r_start[j])
336 bStart = true;
339 for (int j = 0; j < nterpairs; j++)
341 if (r == r_end[j])
343 bEnd = true;
347 std::string newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
349 if (bFFRTPTERRNM && newName.empty() && (bStart || bEnd))
351 /* This is a terminal residue, but the residue name,
352 * currently stored in .rtp, is not a standard residue name,
353 * but probably a force field specific rtp name.
354 * Check if we need to rename it because it is terminal.
356 newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, true);
359 if (!newName.empty() && newName != *pdba->resinfo[r].rtp)
361 if (bVerbose)
363 printf("Changing rtp entry of residue %d %s to '%s'\n", pdba->resinfo[r].nr,
364 *pdba->resinfo[r].name, newName.c_str());
366 pdba->resinfo[r].rtp = put_symtab(symtab, newName.c_str());
371 static void pdbres_to_gmxrtp(t_atoms* pdba)
373 int i;
375 for (i = 0; (i < pdba->nres); i++)
377 if (pdba->resinfo[i].rtp == nullptr)
379 pdba->resinfo[i].rtp = pdba->resinfo[i].name;
384 static void rename_pdbres(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
386 char* resnm;
387 int i;
389 for (i = 0; (i < pdba->nres); i++)
391 resnm = *pdba->resinfo[i].name;
392 if ((bFullCompare && (gmx::equalCaseInsensitive(resnm, oldnm)))
393 || (!bFullCompare && strstr(resnm, oldnm) != nullptr))
395 /* Rename the residue name (not the rtp name) */
396 pdba->resinfo[i].name = put_symtab(symtab, newnm);
401 static void rename_bb(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
403 char* bbnm;
404 int i;
406 for (i = 0; (i < pdba->nres); i++)
408 /* We have not set the rtp name yes, use the residue name */
409 bbnm = *pdba->resinfo[i].name;
410 if ((bFullCompare && (gmx::equalCaseInsensitive(bbnm, oldnm)))
411 || (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
413 /* Change the rtp builing block name */
414 pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
419 static void rename_bbint(t_atoms* pdba,
420 const char* oldnm,
421 const char* gettp(int, gmx::ArrayRef<const RtpRename>),
422 bool bFullCompare,
423 t_symtab* symtab,
424 gmx::ArrayRef<const RtpRename> rr)
426 int i;
427 const char* ptr;
428 char* bbnm;
430 for (i = 0; i < pdba->nres; i++)
432 /* We have not set the rtp name yet, use the residue name */
433 bbnm = *pdba->resinfo[i].name;
434 if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) || (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
436 ptr = gettp(i, rr);
437 pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
442 static void check_occupancy(t_atoms* atoms, const char* filename, bool bVerbose)
444 int i, ftp;
445 int nzero = 0;
446 int nnotone = 0;
448 ftp = fn2ftp(filename);
449 if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
451 fprintf(stderr, "No occupancies in %s\n", filename);
453 else
455 for (i = 0; (i < atoms->nr); i++)
457 if (atoms->pdbinfo[i].occup != 1)
459 if (bVerbose)
461 fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
462 *atoms->resinfo[atoms->atom[i].resind].name,
463 atoms->resinfo[atoms->atom[i].resind].nr, *atoms->atomname[i],
464 atoms->pdbinfo[i].occup);
466 if (atoms->pdbinfo[i].occup == 0)
468 nzero++;
470 else
472 nnotone++;
476 if (nzero == atoms->nr)
478 fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
480 else if ((nzero > 0) || (nnotone > 0))
482 fprintf(stderr,
483 "\n"
484 "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
485 " occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
486 "\n",
487 nzero, nnotone, atoms->nr);
489 else
491 fprintf(stderr, "All occupancies are one\n");
496 static void write_posres(const char* fn, t_atoms* pdba, real fc)
498 FILE* fp;
499 int i;
501 fp = gmx_fio_fopen(fn, "w");
502 fprintf(fp,
503 "; In this topology include file, you will find position restraint\n"
504 "; entries for all the heavy atoms in your original pdb file.\n"
505 "; This means that all the protons which were added by pdb2gmx are\n"
506 "; not restrained.\n"
507 "\n"
508 "[ position_restraints ]\n"
509 "; %4s%6s%8s%8s%8s\n",
510 "atom", "type", "fx", "fy", "fz");
511 for (i = 0; (i < pdba->nr); i++)
513 if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
515 fprintf(fp, "%6d%6d %g %g %g\n", i + 1, 1, fc, fc, fc);
518 gmx_fio_fclose(fp);
521 static int read_pdball(const char* inf,
522 bool bOutput,
523 const char* outf,
524 char** title,
525 t_atoms* atoms,
526 rvec** x,
527 PbcType* pbcType,
528 matrix box,
529 bool bRemoveH,
530 t_symtab* symtab,
531 ResidueType* rt,
532 const char* watres,
533 AtomProperties* aps,
534 bool bVerbose)
535 /* Read a pdb file. (containing proteins) */
537 int natom, new_natom, i;
539 /* READ IT */
540 printf("Reading %s...\n", inf);
541 readConfAndAtoms(inf, symtab, title, atoms, pbcType, x, nullptr, box);
542 natom = atoms->nr;
543 if (atoms->pdbinfo == nullptr)
545 snew(atoms->pdbinfo, atoms->nr);
547 if (fn2ftp(inf) == efPDB)
549 get_pdb_atomnumber(atoms, aps);
551 if (bRemoveH)
553 new_natom = 0;
554 for (i = 0; i < atoms->nr; i++)
556 if (!is_hydrogen(*atoms->atomname[i]))
558 atoms->atom[new_natom] = atoms->atom[i];
559 atoms->atomname[new_natom] = atoms->atomname[i];
560 atoms->pdbinfo[new_natom] = atoms->pdbinfo[i];
561 copy_rvec((*x)[i], (*x)[new_natom]);
562 new_natom++;
565 atoms->nr = new_natom;
566 natom = new_natom;
569 printf("Read");
570 if (title[0])
572 printf(" '%s',", *title);
574 printf(" %d atoms\n", natom);
576 /* Rename residues */
577 rename_pdbres(atoms, "HOH", watres, false, symtab);
578 rename_pdbres(atoms, "SOL", watres, false, symtab);
579 rename_pdbres(atoms, "WAT", watres, false, symtab);
581 rename_atoms("xlateat.dat", nullptr, atoms, symtab, {}, true, rt, true, bVerbose);
583 if (natom == 0)
585 return 0;
587 if (bOutput)
589 write_sto_conf(outf, *title, atoms, *x, nullptr, *pbcType, box);
592 return natom;
595 static void process_chain(t_atoms* pdba,
596 gmx::ArrayRef<gmx::RVec> x,
597 bool bTrpU,
598 bool bPheU,
599 bool bTyrU,
600 bool bLysMan,
601 bool bAspMan,
602 bool bGluMan,
603 bool bHisMan,
604 bool bArgMan,
605 bool bGlnMan,
606 real angle,
607 real distance,
608 t_symtab* symtab,
609 gmx::ArrayRef<const RtpRename> rr)
611 /* Rename aromatics, lys, asp and histidine */
612 if (bTyrU)
614 rename_bb(pdba, "TYR", "TYRU", false, symtab);
616 if (bTrpU)
618 rename_bb(pdba, "TRP", "TRPU", false, symtab);
620 if (bPheU)
622 rename_bb(pdba, "PHE", "PHEU", false, symtab);
624 if (bLysMan)
626 rename_bbint(pdba, "LYS", get_lystp, false, symtab, rr);
628 if (bArgMan)
630 rename_bbint(pdba, "ARG", get_argtp, false, symtab, rr);
632 if (bGlnMan)
634 rename_bbint(pdba, "GLN", get_glntp, false, symtab, rr);
636 if (bAspMan)
638 rename_bbint(pdba, "ASP", get_asptp, false, symtab, rr);
640 else
642 rename_bb(pdba, "ASPH", "ASP", false, symtab);
644 if (bGluMan)
646 rename_bbint(pdba, "GLU", get_glutp, false, symtab, rr);
648 else
650 rename_bb(pdba, "GLUH", "GLU", false, symtab);
653 if (!bHisMan)
655 set_histp(pdba, gmx::as_rvec_array(x.data()), symtab, angle, distance);
657 else
659 rename_bbint(pdba, "HIS", get_histp, true, symtab, rr);
662 /* Initialize the rtp builing block names with the residue names
663 * for the residues that have not been processed above.
665 pdbres_to_gmxrtp(pdba);
667 /* Now we have all rtp names set.
668 * The rtp names will conform to Gromacs naming,
669 * unless the input pdb file contained one or more force field specific
670 * rtp names as residue names.
674 /* struct for sorting the atoms from the pdb file */
675 typedef struct
677 int resnr; /* residue number */
678 int j; /* database order index */
679 int index; /* original atom number */
680 char anm1; /* second letter of atom name */
681 char altloc; /* alternate location indicator */
682 } t_pdbindex;
684 static bool pdbicomp(const t_pdbindex& a, const t_pdbindex& b)
686 int d = (a.resnr - b.resnr);
687 if (d == 0)
689 d = (a.j - b.j);
690 if (d == 0)
692 d = (a.anm1 - b.anm1);
693 if (d == 0)
695 d = (a.altloc - b.altloc);
699 return d < 0;
702 static void sort_pdbatoms(gmx::ArrayRef<const PreprocessResidue> restp_chain,
703 int natoms,
704 t_atoms** pdbaptr,
705 t_atoms** newPdbAtoms,
706 std::vector<gmx::RVec>* x,
707 t_blocka* block,
708 char*** gnames)
710 t_atoms* pdba = *pdbaptr;
711 std::vector<gmx::RVec> xnew;
712 t_pdbindex* pdbi;
713 char* atomnm;
715 natoms = pdba->nr;
716 snew(pdbi, natoms);
718 for (int i = 0; i < natoms; i++)
720 atomnm = *pdba->atomname[i];
721 const PreprocessResidue* localPpResidue = &restp_chain[pdba->atom[i].resind];
722 auto found =
723 std::find_if(localPpResidue->atomname.begin(), localPpResidue->atomname.end(),
724 [&atomnm](char** it) { return gmx::equalCaseInsensitive(atomnm, *it); });
725 if (found == localPpResidue->atomname.end())
727 char buf[STRLEN];
729 sprintf(buf,
730 "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
731 "while sorting atoms.\n%s",
732 atomnm, *pdba->resinfo[pdba->atom[i].resind].name,
733 pdba->resinfo[pdba->atom[i].resind].nr, localPpResidue->resname.c_str(),
734 localPpResidue->natom(),
735 is_hydrogen(atomnm)
736 ? "\nFor a hydrogen, this can be a different protonation state, or it\n"
737 "might have had a different number in the PDB file and was rebuilt\n"
738 "(it might for instance have been H3, and we only expected H1 & "
739 "H2).\n"
740 "Note that hydrogens might have been added to the entry for the "
741 "N-terminus.\n"
742 "Remove this hydrogen or choose a different protonation state to "
743 "solve it.\n"
744 "Option -ignh will ignore all hydrogens in the input."
745 : ".");
746 gmx_fatal(FARGS, "%s", buf);
748 /* make shadow array to be sorted into indexgroup */
749 pdbi[i].resnr = pdba->atom[i].resind;
750 pdbi[i].j = std::distance(localPpResidue->atomname.begin(), found);
751 pdbi[i].index = i;
752 pdbi[i].anm1 = atomnm[1];
753 pdbi[i].altloc = pdba->pdbinfo[i].altloc;
755 std::sort(pdbi, pdbi + natoms, pdbicomp);
757 /* pdba is sorted in pdbnew using the pdbi index */
758 std::vector<int> a(natoms);
759 srenew(*newPdbAtoms, 1);
760 init_t_atoms((*newPdbAtoms), natoms, true);
761 (*newPdbAtoms)->nr = pdba->nr;
762 (*newPdbAtoms)->nres = pdba->nres;
763 srenew((*newPdbAtoms)->resinfo, pdba->nres);
764 std::copy(pdba->resinfo, pdba->resinfo + pdba->nres, (*newPdbAtoms)->resinfo);
765 for (int i = 0; i < natoms; i++)
767 (*newPdbAtoms)->atom[i] = pdba->atom[pdbi[i].index];
768 (*newPdbAtoms)->atomname[i] = pdba->atomname[pdbi[i].index];
769 (*newPdbAtoms)->pdbinfo[i] = pdba->pdbinfo[pdbi[i].index];
770 xnew.push_back(x->at(pdbi[i].index));
771 /* make indexgroup in block */
772 a[i] = pdbi[i].index;
774 /* clean up */
775 done_atom(pdba);
776 sfree(pdba);
777 /* copy the sorted pdbnew back to pdba */
778 *pdbaptr = *newPdbAtoms;
779 *x = xnew;
780 add_grp(block, gnames, a, "prot_sort");
781 sfree(pdbi);
784 static int remove_duplicate_atoms(t_atoms* pdba, gmx::ArrayRef<gmx::RVec> x, bool bVerbose)
786 int i, j, oldnatoms, ndel;
787 t_resinfo* ri;
789 printf("Checking for duplicate atoms....\n");
790 oldnatoms = pdba->nr;
791 ndel = 0;
792 /* NOTE: pdba->nr is modified inside the loop */
793 for (i = 1; (i < pdba->nr); i++)
795 /* compare 'i' and 'i-1', throw away 'i' if they are identical
796 this is a 'while' because multiple alternate locations can be present */
797 while ((i < pdba->nr) && (pdba->atom[i - 1].resind == pdba->atom[i].resind)
798 && (strcmp(*pdba->atomname[i - 1], *pdba->atomname[i]) == 0))
800 ndel++;
801 if (bVerbose)
803 ri = &pdba->resinfo[pdba->atom[i].resind];
804 printf("deleting duplicate atom %4s %s%4d%c", *pdba->atomname[i], *ri->name,
805 ri->nr, ri->ic);
806 if (ri->chainid && (ri->chainid != ' '))
808 printf(" ch %c", ri->chainid);
810 if (pdba->pdbinfo)
812 if (pdba->pdbinfo[i].atomnr)
814 printf(" pdb nr %4d", pdba->pdbinfo[i].atomnr);
816 if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
818 printf(" altloc %c", pdba->pdbinfo[i].altloc);
821 printf("\n");
823 pdba->nr--;
824 /* We can not free, since it might be in the symtab */
825 /* sfree(pdba->atomname[i]); */
826 for (j = i; j < pdba->nr; j++)
828 pdba->atom[j] = pdba->atom[j + 1];
829 pdba->atomname[j] = pdba->atomname[j + 1];
830 if (pdba->pdbinfo)
832 pdba->pdbinfo[j] = pdba->pdbinfo[j + 1];
834 copy_rvec(x[j + 1], x[j]);
836 srenew(pdba->atom, pdba->nr);
837 /* srenew(pdba->atomname, pdba->nr); */
838 srenew(pdba->pdbinfo, pdba->nr);
841 if (pdba->nr != oldnatoms)
843 printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
846 return pdba->nr;
849 static void checkResidueTypeSanity(t_atoms* pdba, int r0, int r1, ResidueType* rt)
851 std::string startResidueString =
852 gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
853 std::string endResidueString =
854 gmx::formatString("%s%d", *pdba->resinfo[r1 - 1].name, pdba->resinfo[r1 - 1].nr);
856 // Check whether all residues in chain have the same chain ID.
857 bool allResiduesHaveSameChainID = true;
858 char chainID0 = pdba->resinfo[r0].chainid;
859 char chainID;
860 std::string residueString;
862 for (int i = r0 + 1; i < r1; i++)
864 chainID = pdba->resinfo[i].chainid;
865 if (chainID != chainID0)
867 allResiduesHaveSameChainID = false;
868 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
869 break;
873 if (!allResiduesHaveSameChainID)
875 gmx_fatal(FARGS,
876 "The chain covering the range %s--%s does not have a consistent chain ID. "
877 "The first residue has ID '%c', while residue %s has ID '%c'.",
878 startResidueString.c_str(), endResidueString.c_str(), chainID0,
879 residueString.c_str(), chainID);
882 // At this point all residues have the same ID. If they are also non-blank
883 // we can be a bit more aggressive and require the types match too.
884 if (chainID0 != ' ')
886 bool allResiduesHaveSameType = true;
887 std::string restype;
888 std::string restype0 = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[r0].name);
890 for (int i = r0 + 1; i < r1; i++)
892 restype = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
893 if (!gmx::equalCaseInsensitive(restype, restype0))
895 allResiduesHaveSameType = false;
896 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
897 break;
901 if (!allResiduesHaveSameType)
903 gmx_fatal(FARGS,
904 "The residues in the chain %s--%s do not have a consistent type. "
905 "The first residue has type '%s', while residue %s is of type '%s'. "
906 "Either there is a mistake in your chain, or it includes nonstandard "
907 "residue names that have not yet been added to the residuetypes.dat "
908 "file in the GROMACS library directory. If there are other molecules "
909 "such as ligands, they should not have the same chain ID as the "
910 "adjacent protein chain since it's a separate molecule.",
911 startResidueString.c_str(), endResidueString.c_str(), restype0.c_str(),
912 residueString.c_str(), restype.c_str());
917 static void find_nc_ter(t_atoms* pdba, int r0, int r1, int* r_start, int* r_end, ResidueType* rt)
919 int i;
920 gmx::compat::optional<std::string> startrestype;
922 *r_start = -1;
923 *r_end = -1;
925 int startWarnings = 0;
926 int endWarnings = 0;
927 int ionNotes = 0;
929 // Check that all residues have the same chain identifier, and if it is
930 // non-blank we also require the residue types to match.
931 checkResidueTypeSanity(pdba, r0, r1, rt);
933 // If we return correctly from checkResidueTypeSanity(), the only
934 // remaining cases where we can have non-matching residue types is if
935 // the chain ID was blank, which could be the case e.g. for a structure
936 // read from a GRO file or other file types without chain information.
937 // In that case we need to be a bit more liberal and detect chains based
938 // on the residue type.
940 // If we get here, the chain ID must be identical for all residues
941 char chainID = pdba->resinfo[r0].chainid;
943 /* Find the starting terminus (typially N or 5') */
944 for (i = r0; i < r1 && *r_start == -1; i++)
946 startrestype = rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
947 if (!startrestype)
949 continue;
951 if (gmx::equalCaseInsensitive(*startrestype, "Protein")
952 || gmx::equalCaseInsensitive(*startrestype, "DNA")
953 || gmx::equalCaseInsensitive(*startrestype, "RNA"))
955 printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name,
956 pdba->resinfo[i].nr);
957 *r_start = i;
959 else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
961 if (ionNotes < 5)
963 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n",
964 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
966 if (ionNotes == 4)
968 printf("Disabling further notes about ions.\n");
970 ionNotes++;
972 else
974 // Either no known residue type, or one not needing special handling
975 if (startWarnings < 5)
977 if (chainID == ' ')
979 printf("\nWarning: Starting residue %s%d in chain not identified as "
980 "Protein/RNA/DNA.\n"
981 "This chain lacks identifiers, which makes it impossible to do strict\n"
982 "classification of the start/end residues. Here we need to guess this "
983 "residue\n"
984 "should not be part of the chain and instead introduce a break, but "
985 "that will\n"
986 "be catastrophic if they should in fact be linked. Please check your "
987 "structure,\n"
988 "and add %s to residuetypes.dat if this was not correct.\n\n",
989 *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
991 else
993 printf("\nWarning: No residues in chain starting at %s%d identified as "
994 "Protein/RNA/DNA.\n"
995 "This makes it impossible to link them into a molecule, which could "
996 "either be\n"
997 "correct or a catastrophic error. Please check your structure, and add "
998 "all\n"
999 "necessary residue names to residuetypes.dat if this was not "
1000 "correct.\n\n",
1001 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1004 if (startWarnings == 4)
1006 printf("Disabling further warnings about unidentified residues at start of "
1007 "chain.\n");
1009 startWarnings++;
1013 if (*r_start >= 0)
1015 /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1016 for (int i = *r_start; i < r1; i++)
1018 gmx::compat::optional<std::string> restype =
1019 rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
1020 if (!restype)
1022 continue;
1024 if (gmx::equalCaseInsensitive(*restype, *startrestype) && endWarnings == 0)
1026 *r_end = i;
1028 else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
1030 if (ionNotes < 5)
1032 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n",
1033 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1035 if (ionNotes == 4)
1037 printf("Disabling further notes about ions.\n");
1039 ionNotes++;
1041 else
1043 // Either no known residue type, or one not needing special handling.
1044 GMX_RELEASE_ASSERT(chainID == ' ', "Chain ID must be blank");
1045 // Otherwise the call to checkResidueTypeSanity() will
1046 // have caught the problem.
1047 if (endWarnings < 5)
1049 printf("\nWarning: Residue %s%d in chain has different type ('%s') from\n"
1050 "residue %s%d ('%s'). This chain lacks identifiers, which makes\n"
1051 "it impossible to do strict classification of the start/end residues. "
1052 "Here we\n"
1053 "need to guess this residue should not be part of the chain and "
1054 "instead\n"
1055 "introduce a break, but that will be catastrophic if they should in "
1056 "fact be\n"
1057 "linked. Please check your structure, and add %s to residuetypes.dat\n"
1058 "if this was not correct.\n\n",
1059 *pdba->resinfo[i].name, pdba->resinfo[i].nr, restype->c_str(),
1060 *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr,
1061 startrestype->c_str(), *pdba->resinfo[i].name);
1063 if (endWarnings == 4)
1065 printf("Disabling further warnings about unidentified residues at end of "
1066 "chain.\n");
1068 endWarnings++;
1073 if (*r_end >= 0)
1075 printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name,
1076 pdba->resinfo[*r_end].nr);
1080 /* enum for chain separation */
1081 enum ChainSepType
1083 enChainSep_id_or_ter,
1084 enChainSep_id_and_ter,
1085 enChainSep_ter,
1086 enChainSep_id,
1087 enChainSep_interactive
1089 static const char* ChainSepEnum[] = { "id_or_ter", "id_and_ter", "ter", "id", "interactive" };
1090 static const char* ChainSepInfoString[] = {
1091 "Splitting chemical chains based on TER records or chain id changing.\n",
1092 "Splitting chemical chains based on TER records and chain id changing.\n",
1093 "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1094 "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1095 "Splitting chemical chains interactively.\n"
1098 static void modify_chain_numbers(t_atoms* pdba, ChainSepType enumChainSep)
1100 int i;
1101 char old_prev_chainid;
1102 char old_this_chainid;
1103 int old_prev_chainnum;
1104 int old_this_chainnum;
1105 t_resinfo* ri;
1106 char select[STRLEN];
1107 int new_chainnum;
1108 int this_atomnum;
1109 int prev_atomnum;
1110 const char* prev_atomname;
1111 const char* this_atomname;
1112 const char* prev_resname;
1113 const char* this_resname;
1114 int prev_resnum;
1115 int this_resnum;
1116 char prev_chainid;
1117 char this_chainid;
1119 /* The default chain enumeration is based on TER records only */
1120 printf("%s", ChainSepInfoString[enumChainSep]);
1122 old_prev_chainid = '?';
1123 old_prev_chainnum = -1;
1124 new_chainnum = -1;
1126 this_atomname = nullptr;
1127 this_atomnum = -1;
1128 this_resname = nullptr;
1129 this_resnum = -1;
1130 this_chainid = '?';
1132 for (i = 0; i < pdba->nres; i++)
1134 ri = &pdba->resinfo[i];
1135 old_this_chainid = ri->chainid;
1136 old_this_chainnum = ri->chainnum;
1138 prev_atomname = this_atomname;
1139 prev_atomnum = this_atomnum;
1140 prev_resname = this_resname;
1141 prev_resnum = this_resnum;
1142 prev_chainid = this_chainid;
1144 this_atomname = *(pdba->atomname[i]);
1145 this_atomnum = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i + 1;
1146 this_resname = *ri->name;
1147 this_resnum = ri->nr;
1148 this_chainid = ri->chainid;
1150 switch (enumChainSep)
1152 case enChainSep_id_or_ter:
1153 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1155 new_chainnum++;
1157 break;
1159 case enChainSep_id_and_ter:
1160 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1162 new_chainnum++;
1164 break;
1166 case enChainSep_id:
1167 if (old_this_chainid != old_prev_chainid)
1169 new_chainnum++;
1171 break;
1173 case enChainSep_ter:
1174 if (old_this_chainnum != old_prev_chainnum)
1176 new_chainnum++;
1178 break;
1179 case enChainSep_interactive:
1180 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1182 if (i > 0)
1184 printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1186 "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1187 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1188 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1190 if (nullptr == fgets(select, STRLEN - 1, stdin))
1192 gmx_fatal(FARGS, "Error reading from stdin");
1195 if (i == 0 || select[0] == 'y')
1197 new_chainnum++;
1200 break;
1201 default: gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1203 old_prev_chainid = old_this_chainid;
1204 old_prev_chainnum = old_this_chainnum;
1206 ri->chainnum = new_chainnum;
1210 struct t_pdbchain
1212 char chainid = ' ';
1213 char chainnum = ' ';
1214 int start = -1;
1215 int natom = -1;
1216 bool bAllWat = false;
1217 int nterpairs = -1;
1218 std::vector<int> chainstart;
1221 struct t_chain
1223 char chainid = ' ';
1224 int chainnum = ' ';
1225 bool bAllWat = false;
1226 int nterpairs = -1;
1227 std::vector<int> chainstart;
1228 std::vector<MoleculePatchDatabase*> ntdb;
1229 std::vector<MoleculePatchDatabase*> ctdb;
1230 std::vector<int> r_start;
1231 std::vector<int> r_end;
1232 t_atoms* pdba;
1233 std::vector<gmx::RVec> x;
1236 // TODO make all enums into scoped enums
1237 /* enum for vsites */
1238 enum VSitesType
1240 enVSites_none,
1241 enVSites_hydrogens,
1242 enVSites_aromatics
1244 static const char* VSitesEnum[] = { "none", "hydrogens", "aromatics" };
1246 /* enum for water model */
1247 enum WaterType
1249 enWater_select,
1250 enWater_none,
1251 enWater_spc,
1252 enWater_spce,
1253 enWater_tip3p,
1254 enWater_tip4p,
1255 enWater_tip5p,
1256 enWater_tips3p
1258 static const char* WaterEnum[] = { "select", "none", "spc", "spce",
1259 "tip3p", "tip4p", "tip5p", "tips3p" };
1261 /* enum for merge */
1262 enum MergeType
1264 enMerge_no,
1265 enMerge_all,
1266 enMerge_interactive
1268 static const char* MergeEnum[] = { "no", "all", "interactive" };
1270 namespace gmx
1273 namespace
1276 class pdb2gmx : public ICommandLineOptionsModule
1278 public:
1279 pdb2gmx() :
1280 bVsites_(FALSE),
1281 bPrevWat_(FALSE),
1282 bVsiteAromatics_(FALSE),
1283 enumChainSep_(enChainSep_id_or_ter),
1284 enumVSites_(enVSites_none),
1285 enumWater_(enWater_select),
1286 enumMerge_(enMerge_no),
1287 itp_file_(nullptr),
1288 mHmult_(0)
1292 // From ICommandLineOptionsModule
1293 void init(CommandLineModuleSettings* /*settings*/) override {}
1295 void initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings) override;
1297 void optionsFinished() override;
1299 int run() override;
1301 private:
1302 bool bNewRTP_;
1303 bool bInter_;
1304 bool bCysMan_;
1305 bool bLysMan_;
1306 bool bAspMan_;
1307 bool bGluMan_;
1308 bool bHisMan_;
1309 bool bGlnMan_;
1310 bool bArgMan_;
1311 bool bTerMan_;
1312 bool bUnA_;
1313 bool bHeavyH_;
1314 bool bSort_;
1315 bool bAllowMissing_;
1316 bool bRemoveH_;
1317 bool bDeuterate_;
1318 bool bVerbose_;
1319 bool bChargeGroups_;
1320 bool bCmap_;
1321 bool bRenumRes_;
1322 bool bRTPresname_;
1323 bool bIndexSet_;
1324 bool bOutputSet_;
1325 bool bVsites_;
1326 bool bWat_;
1327 bool bPrevWat_;
1328 bool bITP_;
1329 bool bVsiteAromatics_;
1330 real angle_;
1331 real distance_;
1332 real posre_fc_;
1333 real long_bond_dist_;
1334 real short_bond_dist_;
1336 std::string indexOutputFile_;
1337 std::string outputFile_;
1338 std::string topologyFile_;
1339 std::string includeTopologyFile_;
1340 std::string outputConfFile_;
1341 std::string inputConfFile_;
1342 std::string outFile_;
1343 std::string ff_;
1345 ChainSepType enumChainSep_;
1346 VSitesType enumVSites_;
1347 WaterType enumWater_;
1348 MergeType enumMerge_;
1350 FILE* itp_file_;
1351 char forcefield_[STRLEN];
1352 char ffdir_[STRLEN];
1353 char* ffname_;
1354 char* watermodel_;
1355 std::vector<std::string> incls_;
1356 std::vector<t_mols> mols_;
1357 real mHmult_;
1360 void pdb2gmx::initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings)
1362 const char* desc[] = {
1363 "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1364 "some database files, adds hydrogens to the molecules and generates",
1365 "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1366 "and a topology in GROMACS format.",
1367 "These files can subsequently be processed to generate a run input file.",
1368 "[PAR]",
1369 "[THISMODULE] will search for force fields by looking for",
1370 "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1371 "of the current working directory and of the GROMACS library directory",
1372 "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1373 "variable.",
1374 "By default the forcefield selection is interactive,",
1375 "but you can use the [TT]-ff[tt] option to specify one of the short names",
1376 "in the list on the command line instead. In that case [THISMODULE] just looks",
1377 "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1378 "[PAR]",
1379 "After choosing a force field, all files will be read only from",
1380 "the corresponding force field directory.",
1381 "If you want to modify or add a residue types, you can copy the force",
1382 "field directory from the GROMACS library directory to your current",
1383 "working directory. If you want to add new protein residue types,",
1384 "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1385 "or copy the whole library directory to a local directory and set",
1386 "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1387 "Check Chapter 5 of the manual for more information about file formats.",
1388 "[PAR]",
1390 "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1391 "need not necessarily contain a protein structure. Every kind of",
1392 "molecule for which there is support in the database can be converted.",
1393 "If there is no support in the database, you can add it yourself.[PAR]",
1395 "The program has limited intelligence, it reads a number of database",
1396 "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1397 "if necessary this can be done manually. The program can prompt the",
1398 "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1399 "desired. For Lys the choice is between neutral (two protons on NZ) or",
1400 "protonated (three protons, default), for Asp and Glu unprotonated",
1401 "(default) or protonated, for His the proton can be either on ND1,",
1402 "on NE2 or on both. By default these selections are done automatically.",
1403 "For His, this is based on an optimal hydrogen bonding",
1404 "conformation. Hydrogen bonds are defined based on a simple geometric",
1405 "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1406 "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1407 "[TT]-dist[tt] respectively.[PAR]",
1409 "The protonation state of N- and C-termini can be chosen interactively",
1410 "with the [TT]-ter[tt] flag. Default termini are ionized (NH3+ and COO-),",
1411 "respectively. Some force fields support zwitterionic forms for chains of",
1412 "one residue, but for polypeptides these options should NOT be selected.",
1413 "The AMBER force fields have unique forms for the terminal residues,",
1414 "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1415 "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1416 "respectively to use these forms, making sure you preserve the format",
1417 "of the coordinate file. Alternatively, use named terminating residues",
1418 "(e.g. ACE, NME).[PAR]",
1420 "The separation of chains is not entirely trivial since the markup",
1421 "in user-generated PDB files frequently varies and sometimes it",
1422 "is desirable to merge entries across a TER record, for instance",
1423 "if you want a disulfide bridge or distance restraints between",
1424 "two protein chains or if you have a HEME group bound to a protein.",
1425 "In such cases multiple chains should be contained in a single",
1426 "[TT]moleculetype[tt] definition.",
1427 "To handle this, [THISMODULE] uses two separate options.",
1428 "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1429 "start, and termini added when applicable. This can be done based on the",
1430 "existence of TER records, when the chain id changes, or combinations of either",
1431 "or both of these. You can also do the selection fully interactively.",
1432 "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1433 "are merged into one moleculetype, after adding all the chemical termini (or not).",
1434 "This can be turned off (no merging), all non-water chains can be merged into a",
1435 "single molecule, or the selection can be done interactively.[PAR]",
1437 "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1438 "If any of the occupancies are not one, indicating that the atom is",
1439 "not resolved well in the structure, a warning message is issued.",
1440 "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1441 "all occupancy fields may be zero. Either way, it is up to the user",
1442 "to verify the correctness of the input data (read the article!).[PAR]",
1444 "During processing the atoms will be reordered according to GROMACS",
1445 "conventions. With [TT]-n[tt] an index file can be generated that",
1446 "contains one group reordered in the same way. This allows you to",
1447 "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1448 "one limitation: reordering is done after the hydrogens are stripped",
1449 "from the input and before new hydrogens are added. This means that",
1450 "you should not use [TT]-ignh[tt].[PAR]",
1452 "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1453 "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1454 "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1455 "[PAR]",
1457 "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1458 "motions. Angular and out-of-plane motions can be removed by changing",
1459 "hydrogens into virtual sites and fixing angles, which fixes their",
1460 "position relative to neighboring atoms. Additionally, all atoms in the",
1461 "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1462 "can be converted into virtual sites, eliminating the fast improper dihedral",
1463 "fluctuations in these rings (but this feature is deprecated).",
1464 "[BB]Note[bb] that in this case all other hydrogen",
1465 "atoms are also converted to virtual sites. The mass of all atoms that are",
1466 "converted into virtual sites, is added to the heavy atoms.[PAR]",
1467 "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1468 "done by increasing the hydrogen-mass by a factor of 4. This is also",
1469 "done for water hydrogens to slow down the rotational motion of water.",
1470 "The increase in mass of the hydrogens is subtracted from the bonded",
1471 "(heavy) atom so that the total mass of the system remains the same."
1474 settings->setHelpText(desc);
1476 options->addOption(BooleanOption("newrtp").store(&bNewRTP_).defaultValue(false).hidden().description(
1477 "Write the residue database in new format to [TT]new.rtp[tt]"));
1478 options->addOption(
1479 RealOption("lb").store(&long_bond_dist_).defaultValue(0.25).hidden().description("Long bond warning distance"));
1480 options->addOption(
1481 RealOption("sb").store(&short_bond_dist_).defaultValue(0.05).hidden().description("Short bond warning distance"));
1482 options->addOption(
1483 EnumOption<ChainSepType>("chainsep").enumValue(ChainSepEnum).store(&enumChainSep_).description("Condition in PDB files when a new chain should be started (adding termini)"));
1484 options->addOption(EnumOption<MergeType>("merge")
1485 .enumValue(MergeEnum)
1486 .store(&enumMerge_)
1487 .description("Merge multiple chains into a single [moleculetype]"));
1488 options->addOption(StringOption("ff").store(&ff_).defaultValue("select").description(
1489 "Force field, interactive by default. Use [TT]-h[tt] for information."));
1490 options->addOption(
1491 EnumOption<WaterType>("water").store(&enumWater_).enumValue(WaterEnum).description("Water model to use"));
1492 options->addOption(BooleanOption("inter").store(&bInter_).defaultValue(false).description(
1493 "Set the next 8 options to interactive"));
1494 options->addOption(BooleanOption("ss").store(&bCysMan_).defaultValue(false).description(
1495 "Interactive SS bridge selection"));
1496 options->addOption(BooleanOption("ter").store(&bTerMan_).defaultValue(false).description(
1497 "Interactive termini selection, instead of charged (default)"));
1498 options->addOption(BooleanOption("lys").store(&bLysMan_).defaultValue(false).description(
1499 "Interactive lysine selection, instead of charged"));
1500 options->addOption(BooleanOption("arg").store(&bArgMan_).defaultValue(false).description(
1501 "Interactive arginine selection, instead of charged"));
1502 options->addOption(BooleanOption("asp").store(&bAspMan_).defaultValue(false).description(
1503 "Interactive aspartic acid selection, instead of charged"));
1504 options->addOption(BooleanOption("glu").store(&bGluMan_).defaultValue(false).description(
1505 "Interactive glutamic acid selection, instead of charged"));
1506 options->addOption(BooleanOption("gln").store(&bGlnMan_).defaultValue(false).description(
1507 "Interactive glutamine selection, instead of charged"));
1508 options->addOption(BooleanOption("his").store(&bHisMan_).defaultValue(false).description(
1509 "Interactive histidine selection, instead of checking H-bonds"));
1510 options->addOption(RealOption("angle").store(&angle_).defaultValue(135.0).description(
1511 "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1512 options->addOption(
1513 RealOption("dist").store(&distance_).defaultValue(0.3).description("Maximum donor-acceptor distance for a H-bond (nm)"));
1514 options->addOption(BooleanOption("una").store(&bUnA_).defaultValue(false).description(
1515 "Select aromatic rings with united CH atoms on phenylalanine, tryptophane and "
1516 "tyrosine"));
1517 options->addOption(BooleanOption("sort").store(&bSort_).defaultValue(true).hidden().description(
1518 "Sort the residues according to database, turning this off is dangerous as charge "
1519 "groups might be broken in parts"));
1520 options->addOption(
1521 BooleanOption("ignh").store(&bRemoveH_).defaultValue(false).description("Ignore hydrogen atoms that are in the coordinate file"));
1522 options->addOption(
1523 BooleanOption("missing")
1524 .store(&bAllowMissing_)
1525 .defaultValue(false)
1526 .description(
1527 "Continue when atoms are missing and bonds cannot be made, dangerous"));
1528 options->addOption(
1529 BooleanOption("v").store(&bVerbose_).defaultValue(false).description("Be slightly more verbose in messages"));
1530 options->addOption(
1531 RealOption("posrefc").store(&posre_fc_).defaultValue(1000).description("Force constant for position restraints"));
1532 options->addOption(
1533 EnumOption<VSitesType>("vsite").store(&enumVSites_).enumValue(VSitesEnum).description("Convert atoms to virtual sites"));
1534 options->addOption(BooleanOption("heavyh").store(&bHeavyH_).defaultValue(false).description(
1535 "Make hydrogen atoms heavy"));
1536 options->addOption(
1537 BooleanOption("deuterate").store(&bDeuterate_).defaultValue(false).description("Change the mass of hydrogens to 2 amu"));
1538 options->addOption(BooleanOption("chargegrp")
1539 .store(&bChargeGroups_)
1540 .defaultValue(true)
1541 .description("Use charge groups in the [REF].rtp[ref] file"));
1542 options->addOption(BooleanOption("cmap").store(&bCmap_).defaultValue(true).description(
1543 "Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1544 options->addOption(BooleanOption("renum")
1545 .store(&bRenumRes_)
1546 .defaultValue(false)
1547 .description("Renumber the residues consecutively in the output"));
1548 options->addOption(BooleanOption("rtpres")
1549 .store(&bRTPresname_)
1550 .defaultValue(false)
1551 .description("Use [REF].rtp[ref] entry names as residue names"));
1552 options->addOption(FileNameOption("f")
1553 .legacyType(efSTX)
1554 .inputFile()
1555 .store(&inputConfFile_)
1556 .required()
1557 .defaultBasename("protein")
1558 .defaultType(efPDB)
1559 .description("Structure file"));
1560 options->addOption(FileNameOption("o")
1561 .legacyType(efSTO)
1562 .outputFile()
1563 .store(&outputConfFile_)
1564 .required()
1565 .defaultBasename("conf")
1566 .description("Structure file"));
1567 options->addOption(FileNameOption("p")
1568 .legacyType(efTOP)
1569 .outputFile()
1570 .store(&topologyFile_)
1571 .required()
1572 .defaultBasename("topol")
1573 .description("Topology file"));
1574 options->addOption(FileNameOption("i")
1575 .legacyType(efITP)
1576 .outputFile()
1577 .store(&includeTopologyFile_)
1578 .required()
1579 .defaultBasename("posre")
1580 .description("Include file for topology"));
1581 options->addOption(FileNameOption("n")
1582 .legacyType(efNDX)
1583 .outputFile()
1584 .store(&indexOutputFile_)
1585 .storeIsSet(&bIndexSet_)
1586 .defaultBasename("index")
1587 .description("Index file"));
1588 options->addOption(FileNameOption("q")
1589 .legacyType(efSTO)
1590 .outputFile()
1591 .store(&outFile_)
1592 .storeIsSet(&bOutputSet_)
1593 .defaultBasename("clean")
1594 .defaultType(efPDB)
1595 .description("Structure file"));
1598 void pdb2gmx::optionsFinished()
1600 if (inputConfFile_.empty())
1602 GMX_THROW(InconsistentInputError("You must supply an input file"));
1604 if (bInter_)
1606 /* if anything changes here, also change description of -inter */
1607 bCysMan_ = true;
1608 bTerMan_ = true;
1609 bLysMan_ = true;
1610 bArgMan_ = true;
1611 bAspMan_ = true;
1612 bGluMan_ = true;
1613 bGlnMan_ = true;
1614 bHisMan_ = true;
1617 if (bHeavyH_)
1619 mHmult_ = 4.0;
1621 else if (bDeuterate_)
1623 mHmult_ = 2.0;
1625 else
1627 mHmult_ = 1.0;
1630 /* Force field selection, interactive or direct */
1631 choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(), forcefield_,
1632 sizeof(forcefield_), ffdir_, sizeof(ffdir_));
1634 if (strlen(forcefield_) > 0)
1636 ffname_ = forcefield_;
1637 ffname_[0] = std::toupper(ffname_[0]);
1639 else
1641 gmx_fatal(FARGS, "Empty forcefield string");
1645 int pdb2gmx::run()
1647 char select[STRLEN];
1648 std::vector<DisulfideBond> ssbonds;
1650 int this_atomnum;
1651 int prev_atomnum;
1652 const char* prev_atomname;
1653 const char* this_atomname;
1654 const char* prev_resname;
1655 const char* this_resname;
1656 int prev_resnum;
1657 int this_resnum;
1658 char prev_chainid;
1659 char this_chainid;
1660 int prev_chainnumber;
1661 int this_chainnumber;
1662 int this_chainstart;
1663 int prev_chainstart;
1665 printf("\nUsing the %s force field in directory %s\n\n", ffname_, ffdir_);
1667 choose_watermodel(WaterEnum[enumWater_], ffdir_, &watermodel_);
1669 switch (enumVSites_)
1671 case enVSites_none:
1672 bVsites_ = false;
1673 bVsiteAromatics_ = false;
1674 break;
1675 case enVSites_hydrogens:
1676 bVsites_ = true;
1677 bVsiteAromatics_ = false;
1678 break;
1679 case enVSites_aromatics:
1680 bVsites_ = true;
1681 bVsiteAromatics_ = true;
1682 break;
1683 default:
1684 gmx_fatal(FARGS, "Internal inconsistency: VSitesEnum='%s'", VSitesEnum[enumVSites_]);
1685 } /* end switch */
1687 /* Open the symbol table */
1688 t_symtab symtab;
1689 open_symtab(&symtab);
1691 /* Residue type database */
1692 ResidueType rt;
1694 /* Read residue renaming database(s), if present */
1695 std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
1697 std::vector<RtpRename> rtprename;
1698 for (const auto& filename : rrn)
1700 printf("going to rename %s\n", filename.c_str());
1701 FILE* fp = fflib_open(filename);
1702 read_rtprename(filename.c_str(), fp, &rtprename);
1703 gmx_ffclose(fp);
1706 /* Add all alternative names from the residue renaming database to the list
1707 of recognized amino/nucleic acids. */
1708 for (const auto& rename : rtprename)
1710 /* Only add names if the 'standard' gromacs/iupac base name was found */
1711 if (auto restype = rt.optionalTypeOfNamedDatabaseResidue(rename.gmx))
1713 rt.addResidue(rename.main, *restype);
1714 rt.addResidue(rename.nter, *restype);
1715 rt.addResidue(rename.cter, *restype);
1716 rt.addResidue(rename.bter, *restype);
1720 matrix box;
1721 const char* watres;
1722 clear_mat(box);
1723 if (watermodel_ != nullptr && (strstr(watermodel_, "4p") || strstr(watermodel_, "4P")))
1725 watres = "HO4";
1727 else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") || strstr(watermodel_, "5P")))
1729 watres = "HO5";
1731 else
1733 watres = "HOH";
1736 AtomProperties aps;
1737 char* title = nullptr;
1738 PbcType pbcType;
1739 t_atoms pdba_all;
1740 rvec* pdbx;
1741 int natom = read_pdball(inputConfFile_.c_str(), bOutputSet_, outFile_.c_str(), &title, &pdba_all,
1742 &pdbx, &pbcType, box, bRemoveH_, &symtab, &rt, watres, &aps, bVerbose_);
1744 if (natom == 0)
1746 std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
1747 GMX_THROW(InconsistentInputError(message));
1750 printf("Analyzing pdb file\n");
1751 int nwaterchain = 0;
1753 modify_chain_numbers(&pdba_all, enumChainSep_);
1755 int nchainmerges = 0;
1757 this_atomname = nullptr;
1758 this_atomnum = -1;
1759 this_resname = nullptr;
1760 this_resnum = -1;
1761 this_chainid = '?';
1762 this_chainnumber = -1;
1763 this_chainstart = 0;
1764 /* Keep the compiler happy */
1765 prev_chainstart = 0;
1767 int numChains = 0;
1768 std::vector<t_pdbchain> pdb_ch;
1770 t_resinfo* ri;
1771 bool bMerged = false;
1772 for (int i = 0; (i < natom); i++)
1774 ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1776 /* TODO this should live in a helper object, and consolidate
1777 that with code in modify_chain_numbers */
1778 prev_atomname = this_atomname;
1779 prev_atomnum = this_atomnum;
1780 prev_resname = this_resname;
1781 prev_resnum = this_resnum;
1782 prev_chainid = this_chainid;
1783 prev_chainnumber = this_chainnumber;
1784 if (!bMerged)
1786 prev_chainstart = this_chainstart;
1789 this_atomname = *pdba_all.atomname[i];
1790 this_atomnum = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i + 1;
1791 this_resname = *ri->name;
1792 this_resnum = ri->nr;
1793 this_chainid = ri->chainid;
1794 this_chainnumber = ri->chainnum;
1796 bWat_ = gmx::equalCaseInsensitive(*ri->name, watres);
1798 if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
1800 GMX_RELEASE_ASSERT(
1801 pdba_all.pdbinfo,
1802 "Must have pdbinfo from reading a PDB file if chain number is changing");
1803 this_chainstart = pdba_all.atom[i].resind;
1804 bMerged = false;
1805 if (i > 0 && !bWat_)
1807 if (!strncmp(MergeEnum[enumMerge_], "int", 3))
1809 printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and "
1810 "chain starting with\n"
1811 "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype "
1812 "(keeping termini)? [n/y]\n",
1813 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1814 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1816 if (nullptr == fgets(select, STRLEN - 1, stdin))
1818 gmx_fatal(FARGS, "Error reading from stdin");
1820 bMerged = (select[0] == 'y');
1822 else if (!strncmp(MergeEnum[enumMerge_], "all", 3))
1824 bMerged = true;
1828 if (bMerged)
1830 pdb_ch[numChains - 1].chainstart[pdb_ch[numChains - 1].nterpairs] =
1831 pdba_all.atom[i].resind - prev_chainstart;
1832 pdb_ch[numChains - 1].nterpairs++;
1833 pdb_ch[numChains - 1].chainstart.resize(pdb_ch[numChains - 1].nterpairs + 1);
1834 nchainmerges++;
1836 else
1838 /* set natom for previous chain */
1839 if (numChains > 0)
1841 pdb_ch[numChains - 1].natom = i - pdb_ch[numChains - 1].start;
1843 if (bWat_)
1845 nwaterchain++;
1846 ri->chainid = ' ';
1848 /* check if chain identifier was used before */
1849 for (int j = 0; (j < numChains); j++)
1851 if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1853 printf("WARNING: Chain identifier '%c' is used in two non-sequential "
1854 "blocks.\n"
1855 "They will be treated as separate chains unless you reorder your "
1856 "file.\n",
1857 ri->chainid);
1860 t_pdbchain newChain;
1861 newChain.chainid = ri->chainid;
1862 newChain.chainnum = ri->chainnum;
1863 newChain.start = i;
1864 newChain.bAllWat = bWat_;
1865 if (bWat_)
1867 newChain.nterpairs = 0;
1869 else
1871 newChain.nterpairs = 1;
1873 newChain.chainstart.resize(newChain.nterpairs + 1);
1874 /* modified [numChains] to [0] below */
1875 newChain.chainstart[0] = 0;
1876 pdb_ch.push_back(newChain);
1877 numChains++;
1880 bPrevWat_ = bWat_;
1882 pdb_ch.back().natom = natom - pdb_ch.back().start;
1884 /* set all the water blocks at the end of the chain */
1885 std::vector<int> swap_index(numChains);
1886 int j = 0;
1887 for (int i = 0; i < numChains; i++)
1889 if (!pdb_ch[i].bAllWat)
1891 swap_index[j] = i;
1892 j++;
1895 for (int i = 0; i < numChains; i++)
1897 if (pdb_ch[i].bAllWat)
1899 swap_index[j] = i;
1900 j++;
1903 if (nwaterchain > 1)
1905 printf("Moved all the water blocks to the end\n");
1908 t_atoms* pdba;
1909 std::vector<t_chain> chains(numChains);
1910 /* copy pdb data and x for all chains */
1911 for (int i = 0; (i < numChains); i++)
1913 int si = swap_index[i];
1914 chains[i].chainid = pdb_ch[si].chainid;
1915 chains[i].chainnum = pdb_ch[si].chainnum;
1916 chains[i].bAllWat = pdb_ch[si].bAllWat;
1917 chains[i].nterpairs = pdb_ch[si].nterpairs;
1918 chains[i].chainstart = pdb_ch[si].chainstart;
1919 chains[i].ntdb.clear();
1920 chains[i].ctdb.clear();
1921 chains[i].r_start.resize(pdb_ch[si].nterpairs);
1922 chains[i].r_end.resize(pdb_ch[si].nterpairs);
1924 snew(chains[i].pdba, 1);
1925 init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
1926 for (j = 0; j < chains[i].pdba->nr; j++)
1928 chains[i].pdba->atom[j] = pdba_all.atom[pdb_ch[si].start + j];
1929 chains[i].pdba->atomname[j] = put_symtab(&symtab, *pdba_all.atomname[pdb_ch[si].start + j]);
1930 chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start + j];
1931 chains[i].x.emplace_back(pdbx[pdb_ch[si].start + j]);
1933 /* Re-index the residues assuming that the indices are continuous */
1934 int k = chains[i].pdba->atom[0].resind;
1935 int nres = chains[i].pdba->atom[chains[i].pdba->nr - 1].resind - k + 1;
1936 chains[i].pdba->nres = nres;
1937 for (int j = 0; j < chains[i].pdba->nr; j++)
1939 chains[i].pdba->atom[j].resind -= k;
1941 srenew(chains[i].pdba->resinfo, nres);
1942 for (int j = 0; j < nres; j++)
1944 chains[i].pdba->resinfo[j] = pdba_all.resinfo[k + j];
1945 chains[i].pdba->resinfo[j].name = put_symtab(&symtab, *pdba_all.resinfo[k + j].name);
1946 /* make all chain identifiers equal to that of the chain */
1947 chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1951 if (nchainmerges > 0)
1953 printf("\nMerged chains into joint molecule definitions at %d places.\n\n", nchainmerges);
1956 printf("There are %d chains and %d blocks of water and "
1957 "%d residues with %d atoms\n",
1958 numChains - nwaterchain, nwaterchain, pdba_all.nres, natom);
1960 printf("\n %5s %4s %6s\n", "chain", "#res", "#atoms");
1961 for (int i = 0; (i < numChains); i++)
1963 printf(" %d '%c' %5d %6d %s\n", i + 1, chains[i].chainid ? chains[i].chainid : '-',
1964 chains[i].pdba->nres, chains[i].pdba->nr, chains[i].bAllWat ? "(only water)" : "");
1966 printf("\n");
1968 check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_);
1970 /* Read atomtypes... */
1971 PreprocessingAtomTypes atype = read_atype(ffdir_, &symtab);
1973 /* read residue database */
1974 printf("Reading residue database... (%s)\n", forcefield_);
1975 std::vector<std::string> rtpf = fflib_search_file_end(ffdir_, ".rtp", true);
1976 std::vector<PreprocessResidue> rtpFFDB;
1977 for (const auto& filename : rtpf)
1979 readResidueDatabase(filename, &rtpFFDB, &atype, &symtab, false);
1981 if (bNewRTP_)
1983 /* Not correct with multiple rtp input files with different bonded types */
1984 FILE* fp = gmx_fio_fopen("new.rtp", "w");
1985 print_resall(fp, rtpFFDB, atype);
1986 gmx_fio_fclose(fp);
1989 /* read hydrogen database */
1990 std::vector<MoleculePatchDatabase> ah;
1991 read_h_db(ffdir_, &ah);
1993 /* Read Termini database... */
1994 std::vector<MoleculePatchDatabase> ntdb;
1995 std::vector<MoleculePatchDatabase> ctdb;
1996 std::vector<MoleculePatchDatabase*> tdblist;
1997 int nNtdb = read_ter_db(ffdir_, 'n', &ntdb, &atype);
1998 int nCtdb = read_ter_db(ffdir_, 'c', &ctdb, &atype);
2000 FILE* top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
2002 print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
2004 t_chain* cc;
2005 std::vector<gmx::RVec> x;
2006 /* new pdb datastructure for sorting. */
2007 t_atoms** sortAtoms = nullptr;
2008 t_atoms** localAtoms = nullptr;
2009 snew(sortAtoms, numChains);
2010 snew(localAtoms, numChains);
2011 for (int chain = 0; (chain < numChains); chain++)
2013 cc = &(chains[chain]);
2015 /* set pdba, natom and nres to the current chain */
2016 pdba = cc->pdba;
2017 x = cc->x;
2018 natom = cc->pdba->nr;
2019 int nres = cc->pdba->nres;
2021 if (cc->chainid && (cc->chainid != ' '))
2023 printf("Processing chain %d '%c' (%d atoms, %d residues)\n", chain + 1, cc->chainid,
2024 natom, nres);
2026 else
2028 printf("Processing chain %d (%d atoms, %d residues)\n", chain + 1, natom, nres);
2031 process_chain(pdba, x, bUnA_, bUnA_, bUnA_, bLysMan_, bAspMan_, bGluMan_, bHisMan_,
2032 bArgMan_, bGlnMan_, angle_, distance_, &symtab, rtprename);
2034 cc->chainstart[cc->nterpairs] = pdba->nres;
2035 j = 0;
2036 for (int i = 0; i < cc->nterpairs; i++)
2038 find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i + 1], &(cc->r_start[j]),
2039 &(cc->r_end[j]), &rt);
2041 if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
2043 j++;
2046 cc->nterpairs = j;
2047 if (cc->nterpairs == 0)
2049 printf("Problem with chain definition, or missing terminal residues.\n"
2050 "This chain does not appear to contain a recognized chain molecule.\n"
2051 "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
2054 /* Check for disulfides and other special bonds */
2055 ssbonds = makeDisulfideBonds(pdba, gmx::as_rvec_array(x.data()), bCysMan_, bVerbose_);
2057 if (!rtprename.empty())
2059 rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, rtprename, &symtab, bVerbose_);
2062 for (int i = 0; i < cc->nterpairs; i++)
2064 /* Set termini.
2065 * We first apply a filter so we only have the
2066 * termini that can be applied to the residue in question
2067 * (or a generic terminus if no-residue specific is available).
2069 /* First the N terminus */
2070 if (nNtdb > 0)
2072 tdblist = filter_ter(ntdb, *pdba->resinfo[cc->r_start[i]].name);
2073 if (tdblist.empty())
2075 printf("No suitable end (N or 5') terminus found in database - assuming this "
2076 "residue\n"
2077 "is already in a terminus-specific form and skipping terminus "
2078 "selection.\n");
2079 cc->ntdb.push_back(nullptr);
2081 else
2083 if (bTerMan_ && !tdblist.empty())
2085 sprintf(select, "Select start terminus type for %s-%d",
2086 *pdba->resinfo[cc->r_start[i]].name, pdba->resinfo[cc->r_start[i]].nr);
2087 cc->ntdb.push_back(choose_ter(tdblist, select));
2089 else
2091 cc->ntdb.push_back(tdblist[0]);
2094 printf("Start terminus %s-%d: %s\n", *pdba->resinfo[cc->r_start[i]].name,
2095 pdba->resinfo[cc->r_start[i]].nr, (cc->ntdb[i])->name.c_str());
2096 tdblist.clear();
2099 else
2101 cc->ntdb.push_back(nullptr);
2104 /* And the C terminus */
2105 if (nCtdb > 0)
2107 tdblist = filter_ter(ctdb, *pdba->resinfo[cc->r_end[i]].name);
2108 if (tdblist.empty())
2110 printf("No suitable end (C or 3') terminus found in database - assuming this "
2111 "residue\n"
2112 "is already in a terminus-specific form and skipping terminus "
2113 "selection.\n");
2114 cc->ctdb.push_back(nullptr);
2116 else
2118 if (bTerMan_ && !tdblist.empty())
2120 sprintf(select, "Select end terminus type for %s-%d",
2121 *pdba->resinfo[cc->r_end[i]].name, pdba->resinfo[cc->r_end[i]].nr);
2122 cc->ctdb.push_back(choose_ter(tdblist, select));
2124 else
2126 cc->ctdb.push_back(tdblist[0]);
2128 printf("End terminus %s-%d: %s\n", *pdba->resinfo[cc->r_end[i]].name,
2129 pdba->resinfo[cc->r_end[i]].nr, (cc->ctdb[i])->name.c_str());
2130 tdblist.clear();
2133 else
2135 cc->ctdb.push_back(nullptr);
2138 std::vector<MoleculePatchDatabase> hb_chain;
2139 /* lookup hackblocks and rtp for all residues */
2140 std::vector<PreprocessResidue> restp_chain;
2141 get_hackblocks_rtp(&hb_chain, &restp_chain, rtpFFDB, pdba->nres, pdba->resinfo, cc->nterpairs,
2142 &symtab, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing_);
2143 /* ideally, now we would not need the rtp itself anymore, but do
2144 everything using the hb and restp arrays. Unfortunately, that
2145 requires some re-thinking of code in gen_vsite.c, which I won't
2146 do now :( AF 26-7-99 */
2148 rename_atoms(nullptr, ffdir_, pdba, &symtab, restp_chain, false, &rt, false, bVerbose_);
2150 match_atomnames_with_rtp(restp_chain, hb_chain, pdba, &symtab, x, bVerbose_);
2152 if (bSort_)
2154 char** gnames;
2155 t_blocka* block = new_blocka();
2156 snew(gnames, 1);
2157 sort_pdbatoms(restp_chain, natom, &pdba, &sortAtoms[chain], &x, block, &gnames);
2158 remove_duplicate_atoms(pdba, x, bVerbose_);
2159 if (bIndexSet_)
2161 if (bRemoveH_)
2163 fprintf(stderr,
2164 "WARNING: with the -remh option the generated "
2165 "index file (%s) might be useless\n"
2166 "(the index file is generated before hydrogens are added)",
2167 indexOutputFile_.c_str());
2169 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2171 for (int i = 0; i < block->nr; i++)
2173 sfree(gnames[i]);
2175 sfree(gnames);
2176 done_blocka(block);
2177 sfree(block);
2179 else
2181 fprintf(stderr,
2182 "WARNING: "
2183 "without sorting no check for duplicate atoms can be done\n");
2186 /* Generate Hydrogen atoms (and termini) in the sequence */
2187 printf("Generating any missing hydrogen atoms and/or adding termini.\n");
2188 add_h(&pdba, &localAtoms[chain], &x, ah, &symtab, cc->nterpairs, cc->ntdb, cc->ctdb,
2189 cc->r_start, cc->r_end, bAllowMissing_);
2190 printf("Now there are %d residues with %d atoms\n", pdba->nres, pdba->nr);
2192 /* make up molecule name(s) */
2194 int k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2196 std::string restype = rt.typeOfNamedDatabaseResidue(*pdba->resinfo[k].name);
2198 std::string molname;
2199 std::string suffix;
2200 if (cc->bAllWat)
2202 molname = "Water";
2204 else
2206 this_chainid = cc->chainid;
2208 /* Add the chain id if we have one */
2209 if (this_chainid != ' ')
2211 suffix.append(formatString("_chain_%c", this_chainid));
2214 /* Check if there have been previous chains with the same id */
2215 int nid_used = 0;
2216 for (int k = 0; k < chain; k++)
2218 if (cc->chainid == chains[k].chainid)
2220 nid_used++;
2223 /* Add the number for this chain identifier if there are multiple copies */
2224 if (nid_used > 0)
2226 suffix.append(formatString("%d", nid_used + 1));
2229 if (suffix.length() > 0)
2231 molname.append(restype);
2232 molname.append(suffix);
2234 else
2236 molname = restype;
2239 std::string itp_fn = topologyFile_;
2241 std::string posre_fn = includeTopologyFile_;
2242 if ((numChains - nwaterchain > 1) && !cc->bAllWat)
2244 bITP_ = true;
2245 printf("Chain time...\n");
2246 // construct the itp file name
2247 itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2248 itp_fn.append("_");
2249 itp_fn.append(molname);
2250 itp_fn.append(".itp");
2251 // now do the same for posre
2252 posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2253 posre_fn.append("_");
2254 posre_fn.append(molname);
2255 posre_fn.append(".itp");
2256 if (posre_fn == itp_fn)
2258 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2260 incls_.emplace_back();
2261 incls_.back() = itp_fn;
2262 itp_file_ = gmx_fio_fopen(itp_fn.c_str(), "w");
2264 else
2266 bITP_ = false;
2269 mols_.emplace_back();
2270 if (cc->bAllWat)
2272 mols_.back().name = "SOL";
2273 mols_.back().nr = pdba->nres;
2275 else
2277 mols_.back().name = molname;
2278 mols_.back().nr = 1;
2281 if (bITP_)
2283 print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2286 FILE* top_file2;
2287 if (cc->bAllWat)
2289 top_file2 = nullptr;
2291 else if (bITP_)
2293 top_file2 = itp_file_;
2295 else
2297 top_file2 = top_file;
2300 pdb2top(top_file2, posre_fn.c_str(), molname.c_str(), pdba, &x, &atype, &symtab, rtpFFDB,
2301 restp_chain, hb_chain, bAllowMissing_, bVsites_, bVsiteAromatics_, ffdir_, mHmult_,
2302 ssbonds, long_bond_dist_, short_bond_dist_, bDeuterate_, bChargeGroups_, bCmap_,
2303 bRenumRes_, bRTPresname_);
2305 if (!cc->bAllWat)
2307 write_posres(posre_fn.c_str(), pdba, posre_fc_);
2310 if (bITP_)
2312 gmx_fio_fclose(itp_file_);
2315 /* pdba and natom have been reassigned somewhere so: */
2316 cc->pdba = pdba;
2317 cc->x = x;
2320 if (watermodel_ == nullptr)
2322 for (int chain = 0; chain < numChains; chain++)
2324 if (chains[chain].bAllWat)
2326 auto message = formatString(
2327 "You have chosen not to include a water model, "
2328 "but there is water in the input file. Select a "
2329 "water model or remove the water from your input file.");
2330 GMX_THROW(InconsistentInputError(message));
2334 else
2336 std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2337 if (!fflib_fexist(waterFile))
2339 auto message = formatString(
2340 "The topology file '%s' for the selected water "
2341 "model '%s' can not be found in the force field "
2342 "directory. Select a different water model.",
2343 waterFile.c_str(), watermodel_);
2344 GMX_THROW(InconsistentInputError(message));
2348 print_top_mols(top_file, title, ffdir_, watermodel_, incls_, mols_);
2349 gmx_fio_fclose(top_file);
2351 /* now merge all chains back together */
2352 natom = 0;
2353 int nres = 0;
2354 for (int i = 0; (i < numChains); i++)
2356 natom += chains[i].pdba->nr;
2357 nres += chains[i].pdba->nres;
2359 t_atoms* atoms;
2360 snew(atoms, 1);
2361 init_t_atoms(atoms, natom, false);
2362 for (int i = 0; i < atoms->nres; i++)
2364 sfree(atoms->resinfo[i].name);
2366 atoms->nres = nres;
2367 srenew(atoms->resinfo, nres);
2368 x.clear();
2369 int k = 0;
2370 int l = 0;
2371 for (int i = 0; (i < numChains); i++)
2373 if (numChains > 1)
2375 printf("Including chain %d in system: %d atoms %d residues\n", i + 1,
2376 chains[i].pdba->nr, chains[i].pdba->nres);
2378 for (int j = 0; (j < chains[i].pdba->nr); j++)
2380 atoms->atom[k] = chains[i].pdba->atom[j];
2381 atoms->atom[k].resind += l; /* l is processed nr of residues */
2382 atoms->atomname[k] = chains[i].pdba->atomname[j];
2383 atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2384 x.push_back(chains[i].x[j]);
2385 k++;
2387 for (int j = 0; (j < chains[i].pdba->nres); j++)
2389 atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2390 if (bRTPresname_)
2392 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2394 l++;
2396 done_atom(chains[i].pdba);
2399 if (numChains > 1)
2401 fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2402 print_sums(atoms, true);
2405 rvec box_space;
2406 fprintf(stderr, "\nWriting coordinate file...\n");
2407 clear_rvec(box_space);
2408 if (box[0][0] == 0)
2410 make_new_box(atoms->nr, gmx::as_rvec_array(x.data()), box, box_space, false);
2412 write_sto_conf(outputConfFile_.c_str(), title, atoms, gmx::as_rvec_array(x.data()), nullptr,
2413 pbcType, box);
2415 done_symtab(&symtab);
2416 done_atom(&pdba_all);
2417 done_atom(atoms);
2418 for (int chain = 0; chain < numChains; chain++)
2420 sfree(sortAtoms[chain]);
2421 sfree(localAtoms[chain]);
2423 sfree(sortAtoms);
2424 sfree(localAtoms);
2425 sfree(atoms);
2426 sfree(title);
2427 sfree(pdbx);
2428 printf("\t\t--------- PLEASE NOTE ------------\n");
2429 printf("You have successfully generated a topology from: %s.\n", inputConfFile_.c_str());
2430 if (watermodel_ != nullptr)
2432 printf("The %s force field and the %s water model are used.\n", ffname_, watermodel_);
2433 sfree(watermodel_);
2435 else
2437 printf("The %s force field is used.\n", ffname_);
2439 printf("\t\t--------- ETON ESAELP ------------\n");
2441 return 0;
2444 } // namespace
2446 const char pdb2gmxInfo::name[] = "pdb2gmx";
2447 const char pdb2gmxInfo::shortDescription[] =
2448 "Convert coordinate files to topology and FF-compliant coordinate files";
2449 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2451 return std::make_unique<pdb2gmx>();
2454 } // namespace gmx