src/gromacs/gmxpreprocess/pdb2gmx.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "pdb2gmx.h"
  40
  41 #include <cctype>
  42 #include <cstdlib>
  43 #include <cstring>
  44 #include <ctime>
  45
  46 #include <algorithm>
  47 #include <memory>
  48 #include <string>
  49 #include <vector>
  50
  51 #include "gromacs/commandline/cmdlineoptionsmodule.h"
  52 #include "gromacs/fileio/confio.h"
  53 #include "gromacs/fileio/filetypes.h"
  54 #include "gromacs/fileio/gmxfio.h"
  55 #include "gromacs/fileio/pdbio.h"
  56 #include "gromacs/gmxlib/conformation_utilities.h"
  57 #include "gromacs/gmxpreprocess/fflibutil.h"
  58 #include "gromacs/gmxpreprocess/genhydro.h"
  59 #include "gromacs/gmxpreprocess/gpp_atomtype.h"
  60 #include "gromacs/gmxpreprocess/grompp_impl.h"
  61 #include "gromacs/gmxpreprocess/h_db.h"
  62 #include "gromacs/gmxpreprocess/hizzie.h"
  63 #include "gromacs/gmxpreprocess/pdb2top.h"
  64 #include "gromacs/gmxpreprocess/pgutil.h"
  65 #include "gromacs/gmxpreprocess/specbond.h"
  66 #include "gromacs/gmxpreprocess/ter_db.h"
  67 #include "gromacs/gmxpreprocess/toputil.h"
  68 #include "gromacs/gmxpreprocess/xlate.h"
  69 #include "gromacs/math/vec.h"
  70 #include "gromacs/options/basicoptions.h"
  71 #include "gromacs/options/filenameoption.h"
  72 #include "gromacs/options/ioptionscontainer.h"
  73 #include "gromacs/topology/atomprop.h"
  74 #include "gromacs/topology/block.h"
  75 #include "gromacs/topology/index.h"
  76 #include "gromacs/topology/residuetypes.h"
  77 #include "gromacs/topology/symtab.h"
  78 #include "gromacs/topology/topology.h"
  79 #include "gromacs/utility/dir_separator.h"
  80 #include "gromacs/utility/exceptions.h"
  81 #include "gromacs/utility/fatalerror.h"
  82 #include "gromacs/utility/path.h"
  83 #include "gromacs/utility/smalloc.h"
  84 #include "gromacs/utility/strdb.h"
  85 #include "gromacs/utility/stringutil.h"
  86
  87 #include "hackblock.h"
  88 #include "resall.h"
  89
  90 struct RtpRename{
  91     RtpRename(const char *newGmx, const char *newMain,
  92               const char *newNter, const char *newCter,
  93               const char *newBter) :
  94         gmx(newGmx), main(newMain), nter(newNter), cter(newCter), bter(newBter)
  95     {}
  96     std::string gmx;
  97     std::string main;
  98     std::string nter;
  99     std::string cter;
 100     std::string bter;
 101 };
 102
 103 static const char *res2bb_notermini(const std::string             &name,
 104                                     gmx::ArrayRef<const RtpRename> rr)
 105 {
 106     /* NOTE: This function returns the main building block name,
 107      *       it does not take terminal renaming into account.
 108      */
 109     auto found = std::find_if(rr.begin(), rr.end(), [&name](const auto &rename)
 110                               { return gmx::equalCaseInsensitive(name, rename.gmx); });
 111     return found != rr.end() ? found->main.c_str() : name.c_str();
 112 }
 113
 114 static const char *select_res(int nr, int resnr,
 115                               const char *name[], const char *expl[],
 116                               const char *title,
 117                               gmx::ArrayRef<const RtpRename> rr)
 118 {
 119     printf("Which %s type do you want for residue %d\n", title, resnr+1);
 120     for (int sel = 0; (sel < nr); sel++)
 121     {
 122         printf("%d. %s (%s)\n",
 123                sel, expl[sel], res2bb_notermini(name[sel], rr));
 124     }
 125     printf("\nType a number:"); fflush(stdout);
 126
 127     int userSelection;
 128     if (scanf("%d", &userSelection) != 1)
 129     {
 130         gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr+1);
 131     }
 132
 133     return name[userSelection];
 134 }
 135
 136 static const char *get_asptp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 137 {
 138     enum {
 139         easp, easpH, easpNR
 140     };
 141     const char *lh[easpNR]   = { "ASP", "ASPH" };
 142     const char *expl[easpNR] = {
 143         "Not protonated (charge -1)",
 144         "Protonated (charge 0)"
 145     };
 146
 147     return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", rr);
 148 }
 149
 150 static const char *get_glutp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 151 {
 152     enum {
 153         eglu, egluH, egluNR
 154     };
 155     const char *lh[egluNR]   = { "GLU", "GLUH" };
 156     const char *expl[egluNR] = {
 157         "Not protonated (charge -1)",
 158         "Protonated (charge 0)"
 159     };
 160
 161     return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", rr);
 162 }
 163
 164 static const char *get_glntp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 165 {
 166     enum {
 167         egln, eglnH, eglnNR
 168     };
 169     const char *lh[eglnNR]   = { "GLN", "QLN" };
 170     const char *expl[eglnNR] = {
 171         "Not protonated (charge 0)",
 172         "Protonated (charge +1)"
 173     };
 174
 175     return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", rr);
 176 }
 177
 178 static const char *get_lystp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 179 {
 180     enum {
 181         elys, elysH, elysNR
 182     };
 183     const  char *lh[elysNR]   = { "LYSN", "LYS" };
 184     const char  *expl[elysNR] = {
 185         "Not protonated (charge 0)",
 186         "Protonated (charge +1)"
 187     };
 188
 189     return select_res(elysNR, resnr, lh, expl, "LYSINE", rr);
 190 }
 191
 192 static const char *get_argtp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 193 {
 194     enum {
 195         earg, eargH, eargNR
 196     };
 197     const  char *lh[eargNR]   = { "ARGN", "ARG" };
 198     const char  *expl[eargNR] = {
 199         "Not protonated (charge 0)",
 200         "Protonated (charge +1)"
 201     };
 202
 203     return select_res(eargNR, resnr, lh, expl, "ARGININE", rr);
 204 }
 205
 206 static const char *get_histp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 207 {
 208     const char *expl[ehisNR] = {
 209         "H on ND1 only",
 210         "H on NE2 only",
 211         "H on ND1 and NE2",
 212         "Coupled to Heme"
 213     };
 214
 215     return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", rr);
 216 }
 217
 218 static void read_rtprename(const char *fname, FILE *fp,
 219                            std::vector<RtpRename> *rtprename)
 220 {
 221     char         line[STRLEN], buf[STRLEN];
 222
 223     int          ncol = 0;
 224     while (get_a_line(fp, line, STRLEN))
 225     {
 226         /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
 227          * For other args, we read up to 6 chars (so we can detect if the length is > 5).
 228          * Note that the buffer length has been increased to 7 to allow this,
 229          * so we just need to make sure the strings have zero-length initially.
 230          */
 231         char gmx[STRLEN];
 232         char main[STRLEN];
 233         char nter[STRLEN];
 234         char cter[STRLEN];
 235         char bter[STRLEN];
 236         gmx[0] = '\0'; main[0] = '\0'; nter[0] = '\0'; cter[0] = '\0'; bter[0] = '\0';
 237         int  nc            = sscanf(line, "%6s %6s %6s %6s %6s %s",
 238                                     gmx, main, nter,
 239                                     cter, bter, buf);
 240         RtpRename newEntry(gmx, main, nter, cter, bter);
 241         if (ncol == 0)
 242         {
 243             if (nc != 2 && nc != 5)
 244             {
 245                 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d or %d", fname, ncol, 2, 5);
 246             }
 247             ncol = nc;
 248         }
 249         else if (nc != ncol)
 250         {
 251             gmx_fatal(FARGS, "A line in residue renaming database '%s' has %d columns, while previous lines have %d columns", fname, nc, ncol);
 252         }
 253
 254         if (nc == 2)
 255         {
 256             /* This file does not have special termini names, copy them from main */
 257             newEntry.nter = newEntry.main;
 258             newEntry.cter = newEntry.main;
 259             newEntry.bter = newEntry.main;
 260         }
 261         rtprename->push_back(newEntry);
 262     }
 263 }
 264
 265 static std::string search_resrename(gmx::ArrayRef<const RtpRename> rr,
 266                                     const char *name,
 267                                     bool bStart, bool bEnd,
 268                                     bool bCompareFFRTPname)
 269 {
 270     auto found = std::find_if(rr.begin(), rr.end(), [&name, &bCompareFFRTPname](const auto &rename)
 271                               { return ((!bCompareFFRTPname && (name == rename.gmx)) ||
 272                                         (bCompareFFRTPname && (name == rename.main))); });
 273
 274     std::string newName;
 275     /* If found in the database, rename this residue's rtp building block,
 276      * otherwise keep the old name.
 277      */
 278     if (found != rr.end())
 279     {
 280         if (bStart && bEnd)
 281         {
 282             newName  = found->bter;
 283         }
 284         else if (bStart)
 285         {
 286             newName = found->nter;
 287         }
 288         else if (bEnd)
 289         {
 290             newName = found->cter;
 291         }
 292         else
 293         {
 294             newName = found->main;
 295         }
 296
 297         if (newName[0] == '-')
 298         {
 299             gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name, bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
 300         }
 301     }
 302
 303     return newName;
 304 }
 305
 306 static void rename_resrtp(t_atoms                       *pdba,
 307                           int                            nterpairs,
 308                           gmx::ArrayRef<const int>       r_start,
 309                           gmx::ArrayRef<const int>       r_end,
 310                           gmx::ArrayRef<const RtpRename> rr,
 311                           t_symtab                      *symtab,
 312                           bool                           bVerbose)
 313 {
 314     bool bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
 315
 316     for (int r = 0; r < pdba->nres; r++)
 317     {
 318         bool bStart = false;
 319         bool bEnd   = false;
 320         for (int j = 0; j < nterpairs; j++)
 321         {
 322             if (r == r_start[j])
 323             {
 324                 bStart = true;
 325             }
 326         }
 327         for (int j = 0; j < nterpairs; j++)
 328         {
 329             if (r == r_end[j])
 330             {
 331                 bEnd = true;
 332             }
 333         }
 334
 335         std::string newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
 336
 337         if (bFFRTPTERRNM && newName.empty() && (bStart || bEnd))
 338         {
 339             /* This is a terminal residue, but the residue name,
 340              * currently stored in .rtp, is not a standard residue name,
 341              * but probably a force field specific rtp name.
 342              * Check if we need to rename it because it is terminal.
 343              */
 344             newName = search_resrename(rr,
 345                                        *pdba->resinfo[r].rtp, bStart, bEnd, true);
 346         }
 347
 348         if (!newName.empty() && newName != *pdba->resinfo[r].rtp)
 349         {
 350             if (bVerbose)
 351             {
 352                 printf("Changing rtp entry of residue %d %s to '%s'\n",
 353                        pdba->resinfo[r].nr, *pdba->resinfo[r].name, newName.c_str());
 354             }
 355             pdba->resinfo[r].rtp = put_symtab(symtab, newName.c_str());
 356         }
 357     }
 358 }
 359
 360 static void pdbres_to_gmxrtp(t_atoms *pdba)
 361 {
 362     int i;
 363
 364     for (i = 0; (i < pdba->nres); i++)
 365     {
 366         if (pdba->resinfo[i].rtp == nullptr)
 367         {
 368             pdba->resinfo[i].rtp = pdba->resinfo[i].name;
 369         }
 370     }
 371 }
 372
 373 static void rename_pdbres(t_atoms *pdba, const char *oldnm, const char *newnm,
 374                           bool bFullCompare, t_symtab *symtab)
 375 {
 376     char *resnm;
 377     int   i;
 378
 379     for (i = 0; (i < pdba->nres); i++)
 380     {
 381         resnm = *pdba->resinfo[i].name;
 382         if ((bFullCompare && (gmx::equalCaseInsensitive(resnm, oldnm))) ||
 383             (!bFullCompare && strstr(resnm, oldnm) != nullptr))
 384         {
 385             /* Rename the residue name (not the rtp name) */
 386             pdba->resinfo[i].name = put_symtab(symtab, newnm);
 387         }
 388     }
 389 }
 390
 391 static void rename_bb(t_atoms *pdba, const char *oldnm, const char *newnm,
 392                       bool bFullCompare, t_symtab *symtab)
 393 {
 394     char *bbnm;
 395     int   i;
 396
 397     for (i = 0; (i < pdba->nres); i++)
 398     {
 399         /* We have not set the rtp name yes, use the residue name */
 400         bbnm = *pdba->resinfo[i].name;
 401         if ((bFullCompare && (gmx::equalCaseInsensitive(bbnm, oldnm))) ||
 402             (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
 403         {
 404             /* Change the rtp builing block name */
 405             pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
 406         }
 407     }
 408 }
 409
 410 static void rename_bbint(t_atoms *pdba, const char *oldnm,
 411                          const char *gettp(int, gmx::ArrayRef<const RtpRename>),
 412                          bool bFullCompare,
 413                          t_symtab *symtab,
 414                          gmx::ArrayRef<const RtpRename> rr)
 415 {
 416     int         i;
 417     const char *ptr;
 418     char       *bbnm;
 419
 420     for (i = 0; i < pdba->nres; i++)
 421     {
 422         /* We have not set the rtp name yet, use the residue name */
 423         bbnm = *pdba->resinfo[i].name;
 424         if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) ||
 425             (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
 426         {
 427             ptr                  = gettp(i, rr);
 428             pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
 429         }
 430     }
 431 }
 432
 433 static void check_occupancy(t_atoms *atoms, const char *filename, bool bVerbose)
 434 {
 435     int i, ftp;
 436     int nzero   = 0;
 437     int nnotone = 0;
 438
 439     ftp = fn2ftp(filename);
 440     if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
 441     {
 442         fprintf(stderr, "No occupancies in %s\n", filename);
 443     }
 444     else
 445     {
 446         for (i = 0; (i < atoms->nr); i++)
 447         {
 448             if (atoms->pdbinfo[i].occup != 1)
 449             {
 450                 if (bVerbose)
 451                 {
 452                     fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
 453                             *atoms->resinfo[atoms->atom[i].resind].name,
 454                             atoms->resinfo[ atoms->atom[i].resind].nr,
 455                             *atoms->atomname[i],
 456                             atoms->pdbinfo[i].occup);
 457                 }
 458                 if (atoms->pdbinfo[i].occup == 0)
 459                 {
 460                     nzero++;
 461                 }
 462                 else
 463                 {
 464                     nnotone++;
 465                 }
 466             }
 467         }
 468         if (nzero == atoms->nr)
 469         {
 470             fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
 471         }
 472         else if ((nzero > 0) || (nnotone > 0))
 473         {
 474             fprintf(stderr,
 475                     "\n"
 476                     "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
 477                     "         occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
 478                     "\n",
 479                     nzero, nnotone, atoms->nr);
 480         }
 481         else
 482         {
 483             fprintf(stderr, "All occupancies are one\n");
 484         }
 485     }
 486 }
 487
 488 static void write_posres(const char *fn, t_atoms *pdba, real fc)
 489 {
 490     FILE *fp;
 491     int   i;
 492
 493     fp = gmx_fio_fopen(fn, "w");
 494     fprintf(fp,
 495             "; In this topology include file, you will find position restraint\n"
 496             "; entries for all the heavy atoms in your original pdb file.\n"
 497             "; This means that all the protons which were added by pdb2gmx are\n"
 498             "; not restrained.\n"
 499             "\n"
 500             "[ position_restraints ]\n"
 501             "; %4s%6s%8s%8s%8s\n", "atom", "type", "fx", "fy", "fz"
 502             );
 503     for (i = 0; (i < pdba->nr); i++)
 504     {
 505         if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
 506         {
 507             fprintf(fp, "%6d%6d  %g  %g  %g\n", i+1, 1, fc, fc, fc);
 508         }
 509     }
 510     gmx_fio_fclose(fp);
 511 }
 512
 513 static int read_pdball(const char *inf, bool bOutput, const char *outf, char **title,
 514                        t_atoms *atoms, rvec **x,
 515                        int *ePBC, matrix box, bool bRemoveH,
 516                        t_symtab *symtab, ResidueType *rt, const char *watres,
 517                        AtomProperties *aps, bool bVerbose)
 518 /* Read a pdb file. (containing proteins) */
 519 {
 520     int natom, new_natom, i;
 521
 522     /* READ IT */
 523     printf("Reading %s...\n", inf);
 524     readConfAndAtoms(inf, symtab, title, atoms, ePBC, x, nullptr, box);
 525     natom           = atoms->nr;
 526     if (atoms->pdbinfo == nullptr)
 527     {
 528         snew(atoms->pdbinfo, atoms->nr);
 529     }
 530     if (fn2ftp(inf) == efPDB)
 531     {
 532         get_pdb_atomnumber(atoms, aps);
 533     }
 534     if (bRemoveH)
 535     {
 536         new_natom = 0;
 537         for (i = 0; i < atoms->nr; i++)
 538         {
 539             if (!is_hydrogen(*atoms->atomname[i]))
 540             {
 541                 atoms->atom[new_natom]     = atoms->atom[i];
 542                 atoms->atomname[new_natom] = atoms->atomname[i];
 543                 atoms->pdbinfo[new_natom]  = atoms->pdbinfo[i];
 544                 copy_rvec((*x)[i], (*x)[new_natom]);
 545                 new_natom++;
 546             }
 547         }
 548         atoms->nr = new_natom;
 549         natom     = new_natom;
 550     }
 551
 552     printf("Read");
 553     if (title[0])
 554     {
 555         printf(" '%s',", *title);
 556     }
 557     printf(" %d atoms\n", natom);
 558
 559     /* Rename residues */
 560     rename_pdbres(atoms, "HOH", watres, false, symtab);
 561     rename_pdbres(atoms, "SOL", watres, false, symtab);
 562     rename_pdbres(atoms, "WAT", watres, false, symtab);
 563
 564     rename_atoms("xlateat.dat", nullptr,
 565                  atoms, symtab, {}, true,
 566                  rt, true, bVerbose);
 567
 568     if (natom == 0)
 569     {
 570         return 0;
 571     }
 572     if (bOutput)
 573     {
 574         write_sto_conf(outf, *title, atoms, *x, nullptr, *ePBC, box);
 575     }
 576
 577     return natom;
 578 }
 579
 580 static void process_chain(t_atoms *pdba, gmx::ArrayRef<gmx::RVec> x,
 581                           bool bTrpU, bool bPheU, bool bTyrU,
 582                           bool bLysMan, bool bAspMan, bool bGluMan,
 583                           bool bHisMan, bool bArgMan, bool bGlnMan,
 584                           real angle, real distance, t_symtab *symtab,
 585                           gmx::ArrayRef<const RtpRename> rr)
 586 {
 587     /* Rename aromatics, lys, asp and histidine */
 588     if (bTyrU)
 589     {
 590         rename_bb(pdba, "TYR", "TYRU", false, symtab);
 591     }
 592     if (bTrpU)
 593     {
 594         rename_bb(pdba, "TRP", "TRPU", false, symtab);
 595     }
 596     if (bPheU)
 597     {
 598         rename_bb(pdba, "PHE", "PHEU", false, symtab);
 599     }
 600     if (bLysMan)
 601     {
 602         rename_bbint(pdba, "LYS", get_lystp, false, symtab, rr);
 603     }
 604     if (bArgMan)
 605     {
 606         rename_bbint(pdba, "ARG", get_argtp, false, symtab, rr);
 607     }
 608     if (bGlnMan)
 609     {
 610         rename_bbint(pdba, "GLN", get_glntp, false, symtab, rr);
 611     }
 612     if (bAspMan)
 613     {
 614         rename_bbint(pdba, "ASP", get_asptp, false, symtab, rr);
 615     }
 616     else
 617     {
 618         rename_bb(pdba, "ASPH", "ASP", false, symtab);
 619     }
 620     if (bGluMan)
 621     {
 622         rename_bbint(pdba, "GLU", get_glutp, false, symtab, rr);
 623     }
 624     else
 625     {
 626         rename_bb(pdba, "GLUH", "GLU", false, symtab);
 627     }
 628
 629     if (!bHisMan)
 630     {
 631         set_histp(pdba, gmx::as_rvec_array(x.data()), symtab, angle, distance);
 632     }
 633     else
 634     {
 635         rename_bbint(pdba, "HIS", get_histp, true, symtab, rr);
 636     }
 637
 638     /* Initialize the rtp builing block names with the residue names
 639      * for the residues that have not been processed above.
 640      */
 641     pdbres_to_gmxrtp(pdba);
 642
 643     /* Now we have all rtp names set.
 644      * The rtp names will conform to Gromacs naming,
 645      * unless the input pdb file contained one or more force field specific
 646      * rtp names as residue names.
 647      */
 648 }
 649
 650 /* struct for sorting the atoms from the pdb file */
 651 typedef struct {
 652     int  resnr;  /* residue number               */
 653     int  j;      /* database order index         */
 654     int  index;  /* original atom number         */
 655     char anm1;   /* second letter of atom name   */
 656     char altloc; /* alternate location indicator */
 657 } t_pdbindex;
 658
 659 static bool pdbicomp(const t_pdbindex &a, const t_pdbindex &b)
 660 {
 661     int d = (a.resnr - b.resnr);
 662     if (d == 0)
 663     {
 664         d = (a.j - b.j);
 665         if (d == 0)
 666         {
 667             d = (a.anm1 - b.anm1);
 668             if (d == 0)
 669             {
 670                 d = (a.altloc - b.altloc);
 671             }
 672         }
 673     }
 674     return d < 0;
 675 }
 676
 677 static void sort_pdbatoms(gmx::ArrayRef<const PreprocessResidue> restp_chain,
 678                           int natoms,
 679                           t_atoms **pdbaptr,
 680                           t_atoms **newPdbAtoms,
 681                           std::vector<gmx::RVec> *x,
 682                           t_blocka *block, char ***gnames)
 683 {
 684     t_atoms               *pdba = *pdbaptr;
 685     std::vector<gmx::RVec> xnew;
 686     t_pdbindex            *pdbi;
 687     char                  *atomnm;
 688
 689     natoms = pdba->nr;
 690     snew(pdbi, natoms);
 691
 692     for (int i = 0; i < natoms; i++)
 693     {
 694         atomnm = *pdba->atomname[i];
 695         const PreprocessResidue *localPpResidue   = &restp_chain[pdba->atom[i].resind];
 696         auto                     found            = std::find_if(localPpResidue->atomname.begin(), localPpResidue->atomname.end(),
 697                                                                  [&atomnm](char** it){return gmx::equalCaseInsensitive(atomnm, *it); });
 698         if (found == localPpResidue->atomname.end())
 699         {
 700             char buf[STRLEN];
 701
 702             sprintf(buf,
 703                     "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
 704                     "while sorting atoms.\n%s", atomnm,
 705                     *pdba->resinfo[pdba->atom[i].resind].name,
 706                     pdba->resinfo[pdba->atom[i].resind].nr,
 707                     localPpResidue->resname.c_str(),
 708                     localPpResidue->natom(),
 709                     is_hydrogen(atomnm) ?
 710                     "\nFor a hydrogen, this can be a different protonation state, or it\n"
 711                     "might have had a different number in the PDB file and was rebuilt\n"
 712                     "(it might for instance have been H3, and we only expected H1 & H2).\n"
 713                     "Note that hydrogens might have been added to the entry for the N-terminus.\n"
 714                     "Remove this hydrogen or choose a different protonation state to solve it.\n"
 715                     "Option -ignh will ignore all hydrogens in the input." : ".");
 716             gmx_fatal(FARGS, "%s", buf);
 717         }
 718         /* make shadow array to be sorted into indexgroup */
 719         pdbi[i].resnr  = pdba->atom[i].resind;
 720         pdbi[i].j      = std::distance(localPpResidue->atomname.begin(), found);
 721         pdbi[i].index  = i;
 722         pdbi[i].anm1   = atomnm[1];
 723         pdbi[i].altloc = pdba->pdbinfo[i].altloc;
 724     }
 725     std::sort(pdbi, pdbi+natoms, pdbicomp);
 726
 727     /* pdba is sorted in pdbnew using the pdbi index */
 728     std::vector<int> a(natoms);
 729     srenew(*newPdbAtoms, 1);
 730     init_t_atoms((*newPdbAtoms), natoms, true);
 731     (*newPdbAtoms)->nr   = pdba->nr;
 732     (*newPdbAtoms)->nres = pdba->nres;
 733     srenew((*newPdbAtoms)->resinfo, pdba->nres);
 734     std::copy(pdba->resinfo, pdba->resinfo + pdba->nres, (*newPdbAtoms)->resinfo);
 735     for (int i = 0; i < natoms; i++)
 736     {
 737         (*newPdbAtoms)->atom[i]     = pdba->atom[pdbi[i].index];
 738         (*newPdbAtoms)->atomname[i] = pdba->atomname[pdbi[i].index];
 739         (*newPdbAtoms)->pdbinfo[i]  = pdba->pdbinfo[pdbi[i].index];
 740         xnew.push_back(x->at(pdbi[i].index));
 741         /* make indexgroup in block */
 742         a[i] = pdbi[i].index;
 743     }
 744     /* clean up */
 745     done_atom(pdba);
 746     sfree(pdba);
 747     /* copy the sorted pdbnew back to pdba */
 748     *pdbaptr = *newPdbAtoms;
 749     *x       = xnew;
 750     add_grp(block, gnames, a, "prot_sort");
 751     sfree(pdbi);
 752 }
 753
 754 static int remove_duplicate_atoms(t_atoms *pdba, gmx::ArrayRef<gmx::RVec> x, bool bVerbose)
 755 {
 756     int        i, j, oldnatoms, ndel;
 757     t_resinfo *ri;
 758
 759     printf("Checking for duplicate atoms....\n");
 760     oldnatoms    = pdba->nr;
 761     ndel         = 0;
 762     /* NOTE: pdba->nr is modified inside the loop */
 763     for (i = 1; (i < pdba->nr); i++)
 764     {
 765         /* compare 'i' and 'i-1', throw away 'i' if they are identical
 766            this is a 'while' because multiple alternate locations can be present */
 767         while ( (i < pdba->nr) &&
 768                 (pdba->atom[i-1].resind == pdba->atom[i].resind) &&
 769                 (strcmp(*pdba->atomname[i-1], *pdba->atomname[i]) == 0) )
 770         {
 771             ndel++;
 772             if (bVerbose)
 773             {
 774                 ri = &pdba->resinfo[pdba->atom[i].resind];
 775                 printf("deleting duplicate atom %4s  %s%4d%c",
 776                        *pdba->atomname[i], *ri->name, ri->nr, ri->ic);
 777                 if (ri->chainid && (ri->chainid != ' '))
 778                 {
 779                     printf(" ch %c", ri->chainid);
 780                 }
 781                 if (pdba->pdbinfo)
 782                 {
 783                     if (pdba->pdbinfo[i].atomnr)
 784                     {
 785                         printf("  pdb nr %4d", pdba->pdbinfo[i].atomnr);
 786                     }
 787                     if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
 788                     {
 789                         printf("  altloc %c", pdba->pdbinfo[i].altloc);
 790                     }
 791                 }
 792                 printf("\n");
 793             }
 794             pdba->nr--;
 795             /* We can not free, since it might be in the symtab */
 796             /* sfree(pdba->atomname[i]); */
 797             for (j = i; j < pdba->nr; j++)
 798             {
 799                 pdba->atom[j]     = pdba->atom[j+1];
 800                 pdba->atomname[j] = pdba->atomname[j+1];
 801                 if (pdba->pdbinfo)
 802                 {
 803                     pdba->pdbinfo[j]  = pdba->pdbinfo[j+1];
 804                 }
 805                 copy_rvec(x[j+1], x[j]);
 806             }
 807             srenew(pdba->atom,     pdba->nr);
 808             /* srenew(pdba->atomname, pdba->nr); */
 809             srenew(pdba->pdbinfo,  pdba->nr);
 810         }
 811     }
 812     if (pdba->nr != oldnatoms)
 813     {
 814         printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
 815     }
 816
 817     return pdba->nr;
 818 }
 819
 820 static void
 821 checkResidueTypeSanity(t_atoms     *pdba,
 822                        int          r0,
 823                        int          r1,
 824                        ResidueType *rt)
 825 {
 826     std::string startResidueString = gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
 827     std::string endResidueString   = gmx::formatString("%s%d", *pdba->resinfo[r1-1].name, pdba->resinfo[r1-1].nr);
 828
 829     // Check whether all residues in chain have the same chain ID.
 830     bool         allResiduesHaveSameChainID = true;
 831     char         chainID0                   = pdba->resinfo[r0].chainid;
 832     char         chainID;
 833     std::string  residueString;
 834
 835     for (int i = r0 + 1; i < r1; i++)
 836     {
 837         chainID = pdba->resinfo[i].chainid;
 838         if (chainID != chainID0)
 839         {
 840             allResiduesHaveSameChainID  = false;
 841             residueString               = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 842             break;
 843         }
 844     }
 845
 846     if (!allResiduesHaveSameChainID)
 847     {
 848         gmx_fatal(FARGS,
 849                   "The chain covering the range %s--%s does not have a consistent chain ID. "
 850                   "The first residue has ID '%c', while residue %s has ID '%c'.",
 851                   startResidueString.c_str(), endResidueString.c_str(),
 852                   chainID0, residueString.c_str(), chainID);
 853     }
 854
 855     // At this point all residues have the same ID. If they are also non-blank
 856     // we can be a bit more aggressive and require the types match too.
 857     if (chainID0 != ' ')
 858     {
 859         bool        allResiduesHaveSameType = true;
 860         std::string restype;
 861         std::string restype0 = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[r0].name);
 862
 863         for (int i = r0 + 1; i < r1; i++)
 864         {
 865             restype = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
 866             if (!gmx::equalCaseInsensitive(restype, restype0))
 867             {
 868                 allResiduesHaveSameType = false;
 869                 residueString           = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 870                 break;
 871             }
 872         }
 873
 874         if (!allResiduesHaveSameType)
 875         {
 876             gmx_fatal(FARGS,
 877                       "The residues in the chain %s--%s do not have a consistent type. "
 878                       "The first residue has type '%s', while residue %s is of type '%s'. "
 879                       "Either there is a mistake in your chain, or it includes nonstandard "
 880                       "residue names that have not yet been added to the residuetypes.dat "
 881                       "file in the GROMACS library directory. If there are other molecules "
 882                       "such as ligands, they should not have the same chain ID as the "
 883                       "adjacent protein chain since it's a separate molecule.",
 884                       startResidueString.c_str(), endResidueString.c_str(),
 885                       restype0.c_str(), residueString.c_str(), restype.c_str());
 886         }
 887     }
 888 }
 889
 890 static void find_nc_ter(t_atoms *pdba, int r0, int r1, int *r_start, int *r_end,
 891                         ResidueType *rt)
 892 {
 893     int         i;
 894     gmx::compat::optional<std::string> startrestype;
 895
 896     *r_start = -1;
 897     *r_end   = -1;
 898
 899     int startWarnings = 0;
 900     int endWarnings   = 0;
 901     int ionNotes      = 0;
 902
 903     // Check that all residues have the same chain identifier, and if it is
 904     // non-blank we also require the residue types to match.
 905     checkResidueTypeSanity(pdba, r0, r1, rt);
 906
 907     // If we return correctly from checkResidueTypeSanity(), the only
 908     // remaining cases where we can have non-matching residue types is if
 909     // the chain ID was blank, which could be the case e.g. for a structure
 910     // read from a GRO file or other file types without chain information.
 911     // In that case we need to be a bit more liberal and detect chains based
 912     // on the residue type.
 913
 914     // If we get here, the chain ID must be identical for all residues
 915     char chainID = pdba->resinfo[r0].chainid;
 916
 917     /* Find the starting terminus (typially N or 5') */
 918     for (i = r0; i < r1 && *r_start == -1; i++)
 919     {
 920         startrestype = rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
 921         if (!startrestype)
 922         {
 923             continue;
 924         }
 925         if (gmx::equalCaseInsensitive(*startrestype, "Protein") ||
 926             gmx::equalCaseInsensitive(*startrestype, "DNA") ||
 927             gmx::equalCaseInsensitive(*startrestype, "RNA") )
 928         {
 929             printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 930             *r_start = i;
 931         }
 932         else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
 933         {
 934             if (ionNotes < 5)
 935             {
 936                 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 937             }
 938             if (ionNotes == 4)
 939             {
 940                 printf("Disabling further notes about ions.\n");
 941             }
 942             ionNotes++;
 943         }
 944         else
 945         {
 946             // Either no known residue type, or one not needing special handling
 947             if (startWarnings < 5)
 948             {
 949                 if (chainID == ' ')
 950                 {
 951                     printf("\nWarning: Starting residue %s%d in chain not identified as Protein/RNA/DNA.\n"
 952                            "This chain lacks identifiers, which makes it impossible to do strict\n"
 953                            "classification of the start/end residues. Here we need to guess this residue\n"
 954                            "should not be part of the chain and instead introduce a break, but that will\n"
 955                            "be catastrophic if they should in fact be linked. Please check your structure,\n"
 956                            "and add %s to residuetypes.dat if this was not correct.\n\n",
 957                            *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
 958                 }
 959                 else
 960                 {
 961                     printf("\nWarning: No residues in chain starting at %s%d identified as Protein/RNA/DNA.\n"
 962                            "This makes it impossible to link them into a molecule, which could either be\n"
 963                            "correct or a catastrophic error. Please check your structure, and add all\n"
 964                            "necessary residue names to residuetypes.dat if this was not correct.\n\n",
 965                            *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 966                 }
 967             }
 968             if (startWarnings == 4)
 969             {
 970                 printf("Disabling further warnings about unidentified residues at start of chain.\n");
 971             }
 972             startWarnings++;
 973         }
 974     }
 975
 976     if (*r_start >= 0)
 977     {
 978         /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
 979         for (int i = *r_start; i < r1; i++)
 980         {
 981             gmx::compat::optional<std::string> restype =
 982                 rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
 983             if (!restype)
 984             {
 985                 continue;
 986             }
 987             if (gmx::equalCaseInsensitive(*restype, *startrestype) && endWarnings == 0)
 988             {
 989                 *r_end = i;
 990             }
 991             else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
 992             {
 993                 if (ionNotes < 5)
 994                 {
 995                     printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 996                 }
 997                 if (ionNotes == 4)
 998                 {
 999                     printf("Disabling further notes about ions.\n");
1000                 }
1001                 ionNotes++;
1002             }
1003             else
1004             {
1005                 // Either no known residue type, or one not needing special handling.
1006                 GMX_RELEASE_ASSERT(chainID == ' ', "Chain ID must be blank");
1007                 // Otherwise the call to checkResidueTypeSanity() will
1008                 // have caught the problem.
1009                 if (endWarnings < 5)
1010                 {
1011                     printf("\nWarning: Residue %s%d in chain has different type ('%s') from\n"
1012                            "residue %s%d ('%s'). This chain lacks identifiers, which makes\n"
1013                            "it impossible to do strict classification of the start/end residues. Here we\n"
1014                            "need to guess this residue should not be part of the chain and instead\n"
1015                            "introduce a break, but that will be catastrophic if they should in fact be\n"
1016                            "linked. Please check your structure, and add %s to residuetypes.dat\n"
1017                            "if this was not correct.\n\n",
1018                            *pdba->resinfo[i].name, pdba->resinfo[i].nr, restype->c_str(),
1019                            *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr, startrestype->c_str(), *pdba->resinfo[i].name);
1020                 }
1021                 if (endWarnings == 4)
1022                 {
1023                     printf("Disabling further warnings about unidentified residues at end of chain.\n");
1024                 }
1025                 endWarnings++;
1026             }
1027         }
1028     }
1029
1030     if (*r_end >= 0)
1031     {
1032         printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name, pdba->resinfo[*r_end].nr);
1033     }
1034 }
1035
1036 /* enum for chain separation */
1037 enum ChainSepType {
1038     enChainSep_id_or_ter, enChainSep_id_and_ter, enChainSep_ter,
1039     enChainSep_id, enChainSep_interactive
1040 };
1041 static const char *ChainSepEnum[] = {"id_or_ter", "id_and_ter", "ter", "id", "interactive"};
1042 static const char *ChainSepInfoString[] =
1043 {
1044     "Splitting chemical chains based on TER records or chain id changing.\n",
1045     "Splitting chemical chains based on TER records and chain id changing.\n",
1046     "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1047     "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1048     "Splitting chemical chains interactively.\n"
1049 };
1050
1051 static void
1052 modify_chain_numbers(t_atoms *       pdba,
1053                      ChainSepType    enumChainSep)
1054 {
1055     int           i;
1056     char          old_prev_chainid;
1057     char          old_this_chainid;
1058     int           old_prev_chainnum;
1059     int           old_this_chainnum;
1060     t_resinfo    *ri;
1061     char          select[STRLEN];
1062     int           new_chainnum;
1063     int           this_atomnum;
1064     int           prev_atomnum;
1065     const char *  prev_atomname;
1066     const char *  this_atomname;
1067     const char *  prev_resname;
1068     const char *  this_resname;
1069     int           prev_resnum;
1070     int           this_resnum;
1071     char          prev_chainid;
1072     char          this_chainid;
1073
1074     /* The default chain enumeration is based on TER records only */
1075     printf("%s", ChainSepInfoString[enumChainSep]);
1076
1077     old_prev_chainid  = '?';
1078     old_prev_chainnum = -1;
1079     new_chainnum      = -1;
1080
1081     this_atomname       = nullptr;
1082     this_atomnum        = -1;
1083     this_resname        = nullptr;
1084     this_resnum         = -1;
1085     this_chainid        = '?';
1086
1087     for (i = 0; i < pdba->nres; i++)
1088     {
1089         ri                 = &pdba->resinfo[i];
1090         old_this_chainid   = ri->chainid;
1091         old_this_chainnum  = ri->chainnum;
1092
1093         prev_atomname      = this_atomname;
1094         prev_atomnum       = this_atomnum;
1095         prev_resname       = this_resname;
1096         prev_resnum        = this_resnum;
1097         prev_chainid       = this_chainid;
1098
1099         this_atomname      = *(pdba->atomname[i]);
1100         this_atomnum       = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i+1;
1101         this_resname       = *ri->name;
1102         this_resnum        = ri->nr;
1103         this_chainid       = ri->chainid;
1104
1105         switch (enumChainSep)
1106         {
1107             case enChainSep_id_or_ter:
1108                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1109                 {
1110                     new_chainnum++;
1111                 }
1112                 break;
1113
1114             case enChainSep_id_and_ter:
1115                 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1116                 {
1117                     new_chainnum++;
1118                 }
1119                 break;
1120
1121             case enChainSep_id:
1122                 if (old_this_chainid != old_prev_chainid)
1123                 {
1124                     new_chainnum++;
1125                 }
1126                 break;
1127
1128             case enChainSep_ter:
1129                 if (old_this_chainnum != old_prev_chainnum)
1130                 {
1131                     new_chainnum++;
1132                 }
1133                 break;
1134             case enChainSep_interactive:
1135                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1136                 {
1137                     if (i > 0)
1138                     {
1139                         printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1140 \n"
1141                                "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1142                                prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1143                                this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1144
1145                         if (nullptr == fgets(select, STRLEN-1, stdin))
1146                         {
1147                             gmx_fatal(FARGS, "Error reading from stdin");
1148                         }
1149                     }
1150                     if (i == 0 || select[0] == 'y')
1151                     {
1152                         new_chainnum++;
1153                     }
1154                 }
1155                 break;
1156             default:
1157                 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1158         }
1159         old_prev_chainid  = old_this_chainid;
1160         old_prev_chainnum = old_this_chainnum;
1161
1162         ri->chainnum = new_chainnum;
1163     }
1164 }
1165
1166 struct t_pdbchain {
1167     char             chainid   = ' ';
1168     char             chainnum  = ' ';
1169     int              start     = -1;
1170     int              natom     = -1;
1171     bool             bAllWat   = false;
1172     int              nterpairs = -1;
1173     std::vector<int> chainstart;
1174 };
1175
1176 struct t_chain {
1177     char                                 chainid   = ' ';
1178     int                                  chainnum  = ' ';
1179     bool                                 bAllWat   = false;
1180     int                                  nterpairs = -1;
1181     std::vector<int>                     chainstart;
1182     std::vector<MoleculePatchDatabase *> ntdb;
1183     std::vector<MoleculePatchDatabase *> ctdb;
1184     std::vector<int>                     r_start;
1185     std::vector<int>                     r_end;
1186     t_atoms                             *pdba;
1187     std::vector<gmx::RVec>               x;
1188 };
1189
1190 // TODO make all enums into scoped enums
1191 /* enum for vsites */
1192 enum VSitesType {
1193     enVSites_none, enVSites_hydrogens, enVSites_aromatics
1194 };
1195 static const char *VSitesEnum[] = {"none", "hydrogens", "aromatics"};
1196
1197 /* enum for water model */
1198 enum WaterType {
1199     enWater_select, enWater_none, enWater_spc, enWater_spce,
1200     enWater_tip3p, enWater_tip4p, enWater_tip5p, enWater_tips3p
1201 };
1202 static const char *WaterEnum[] = {
1203     "select", "none", "spc", "spce",
1204     "tip3p", "tip4p", "tip5p", "tips3p"
1205 };
1206
1207 /* enum for merge */
1208 enum MergeType {
1209     enMerge_no, enMerge_all, enMerge_interactive
1210 };
1211 static const char *MergeEnum[] = {"no", "all", "interactive"};
1212
1213 namespace gmx
1214 {
1215
1216 namespace
1217 {
1218
1219 class pdb2gmx : public ICommandLineOptionsModule
1220 {
1221     public:
1222         pdb2gmx() :
1223             bVsites_(FALSE), bPrevWat_(FALSE), bVsiteAromatics_(FALSE),
1224             enumChainSep_(enChainSep_id_or_ter),
1225             enumVSites_(enVSites_none),
1226             enumWater_(enWater_select),
1227             enumMerge_(enMerge_no),
1228             itp_file_(nullptr),
1229             mHmult_(0)
1230         {
1231         }
1232
1233         // From ICommandLineOptionsModule
1234         void init(CommandLineModuleSettings * /*settings*/) override
1235         {
1236         }
1237
1238         void initOptions(IOptionsContainer                 *options,
1239                          ICommandLineOptionsModuleSettings *settings) override;
1240
1241         void optionsFinished() override;
1242
1243         int run() override;
1244
1245     private:
1246         bool                     bNewRTP_;
1247         bool                     bInter_;
1248         bool                     bCysMan_;
1249         bool                     bLysMan_;
1250         bool                     bAspMan_;
1251         bool                     bGluMan_;
1252         bool                     bHisMan_;
1253         bool                     bGlnMan_;
1254         bool                     bArgMan_;
1255         bool                     bTerMan_;
1256         bool                     bUnA_;
1257         bool                     bHeavyH_;
1258         bool                     bSort_;
1259         bool                     bAllowMissing_;
1260         bool                     bRemoveH_;
1261         bool                     bDeuterate_;
1262         bool                     bVerbose_;
1263         bool                     bChargeGroups_;
1264         bool                     bCmap_;
1265         bool                     bRenumRes_;
1266         bool                     bRTPresname_;
1267         bool                     bIndexSet_;
1268         bool                     bOutputSet_;
1269         bool                     bVsites_;
1270         bool                     bWat_;
1271         bool                     bPrevWat_;
1272         bool                     bITP_;
1273         bool                     bVsiteAromatics_;
1274         real                     angle_;
1275         real                     distance_;
1276         real                     posre_fc_;
1277         real                     long_bond_dist_;
1278         real                     short_bond_dist_;
1279
1280         std::string              indexOutputFile_;
1281         std::string              outputFile_;
1282         std::string              topologyFile_;
1283         std::string              includeTopologyFile_;
1284         std::string              outputConfFile_;
1285         std::string              inputConfFile_;
1286         std::string              outFile_;
1287         std::string              ff_;
1288
1289         ChainSepType             enumChainSep_;
1290         VSitesType               enumVSites_;
1291         WaterType                enumWater_;
1292         MergeType                enumMerge_;
1293
1294         FILE                    *itp_file_;
1295         char                     forcefield_[STRLEN];
1296         char                     ffdir_[STRLEN];
1297         char                    *ffname_;
1298         char                    *watermodel_;
1299         std::vector<std::string> incls_;
1300         std::vector<t_mols>      mols_;
1301         real                     mHmult_;
1302 };
1303
1304 void pdb2gmx::initOptions(IOptionsContainer                 *options,
1305                           ICommandLineOptionsModuleSettings *settings)
1306 {
1307     const char *desc[] = {
1308         "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1309         "some database files, adds hydrogens to the molecules and generates",
1310         "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1311         "and a topology in GROMACS format.",
1312         "These files can subsequently be processed to generate a run input file.",
1313         "[PAR]",
1314         "[THISMODULE] will search for force fields by looking for",
1315         "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1316         "of the current working directory and of the GROMACS library directory",
1317         "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1318         "variable.",
1319         "By default the forcefield selection is interactive,",
1320         "but you can use the [TT]-ff[tt] option to specify one of the short names",
1321         "in the list on the command line instead. In that case [THISMODULE] just looks",
1322         "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1323         "[PAR]",
1324         "After choosing a force field, all files will be read only from",
1325         "the corresponding force field directory.",
1326         "If you want to modify or add a residue types, you can copy the force",
1327         "field directory from the GROMACS library directory to your current",
1328         "working directory. If you want to add new protein residue types,",
1329         "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1330         "or copy the whole library directory to a local directory and set",
1331         "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1332         "Check Chapter 5 of the manual for more information about file formats.",
1333         "[PAR]",
1334
1335         "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1336         "need not necessarily contain a protein structure. Every kind of",
1337         "molecule for which there is support in the database can be converted.",
1338         "If there is no support in the database, you can add it yourself.[PAR]",
1339
1340         "The program has limited intelligence, it reads a number of database",
1341         "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1342         "if necessary this can be done manually. The program can prompt the",
1343         "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1344         "desired. For Lys the choice is between neutral (two protons on NZ) or",
1345         "protonated (three protons, default), for Asp and Glu unprotonated",
1346         "(default) or protonated, for His the proton can be either on ND1,",
1347         "on NE2 or on both. By default these selections are done automatically.",
1348         "For His, this is based on an optimal hydrogen bonding",
1349         "conformation. Hydrogen bonds are defined based on a simple geometric",
1350         "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1351         "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1352         "[TT]-dist[tt] respectively.[PAR]",
1353
1354         "The protonation state of N- and C-termini can be chosen interactively",
1355         "with the [TT]-ter[tt] flag.  Default termini are ionized (NH3+ and COO-),",
1356         "respectively.  Some force fields support zwitterionic forms for chains of",
1357         "one residue, but for polypeptides these options should NOT be selected.",
1358         "The AMBER force fields have unique forms for the terminal residues,",
1359         "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1360         "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1361         "respectively to use these forms, making sure you preserve the format",
1362         "of the coordinate file. Alternatively, use named terminating residues",
1363         "(e.g. ACE, NME).[PAR]",
1364
1365         "The separation of chains is not entirely trivial since the markup",
1366         "in user-generated PDB files frequently varies and sometimes it",
1367         "is desirable to merge entries across a TER record, for instance",
1368         "if you want a disulfide bridge or distance restraints between",
1369         "two protein chains or if you have a HEME group bound to a protein.",
1370         "In such cases multiple chains should be contained in a single",
1371         "[TT]moleculetype[tt] definition.",
1372         "To handle this, [THISMODULE] uses two separate options.",
1373         "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1374         "start, and termini added when applicable. This can be done based on the",
1375         "existence of TER records, when the chain id changes, or combinations of either",
1376         "or both of these. You can also do the selection fully interactively.",
1377         "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1378         "are merged into one moleculetype, after adding all the chemical termini (or not).",
1379         "This can be turned off (no merging), all non-water chains can be merged into a",
1380         "single molecule, or the selection can be done interactively.[PAR]",
1381
1382         "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1383         "If any of the occupancies are not one, indicating that the atom is",
1384         "not resolved well in the structure, a warning message is issued.",
1385         "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1386         "all occupancy fields may be zero. Either way, it is up to the user",
1387         "to verify the correctness of the input data (read the article!).[PAR]",
1388
1389         "During processing the atoms will be reordered according to GROMACS",
1390         "conventions. With [TT]-n[tt] an index file can be generated that",
1391         "contains one group reordered in the same way. This allows you to",
1392         "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1393         "one limitation: reordering is done after the hydrogens are stripped",
1394         "from the input and before new hydrogens are added. This means that",
1395         "you should not use [TT]-ignh[tt].[PAR]",
1396
1397         "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1398         "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1399         "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1400         "[PAR]",
1401
1402         "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1403         "motions. Angular and out-of-plane motions can be removed by changing",
1404         "hydrogens into virtual sites and fixing angles, which fixes their",
1405         "position relative to neighboring atoms. Additionally, all atoms in the",
1406         "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1407         "can be converted into virtual sites, eliminating the fast improper dihedral",
1408         "fluctuations in these rings (but this feature is deprecated).",
1409         "[BB]Note[bb] that in this case all other hydrogen",
1410         "atoms are also converted to virtual sites. The mass of all atoms that are",
1411         "converted into virtual sites, is added to the heavy atoms.[PAR]",
1412         "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1413         "done by increasing the hydrogen-mass by a factor of 4. This is also",
1414         "done for water hydrogens to slow down the rotational motion of water.",
1415         "The increase in mass of the hydrogens is subtracted from the bonded",
1416         "(heavy) atom so that the total mass of the system remains the same."
1417     };
1418
1419     settings->setHelpText(desc);
1420
1421     options->addOption(BooleanOption("newrtp")
1422                            .store(&bNewRTP_).defaultValue(false).hidden()
1423                            .description("Write the residue database in new format to [TT]new.rtp[tt]"));
1424     options->addOption(RealOption("lb")
1425                            .store(&long_bond_dist_).defaultValue(0.25).hidden()
1426                            .description("Long bond warning distance"));
1427     options->addOption(RealOption("sb")
1428                            .store(&short_bond_dist_).defaultValue(0.05).hidden()
1429                            .description("Short bond warning distance"));
1430     options->addOption(EnumOption<ChainSepType>("chainsep").enumValue(ChainSepEnum)
1431                            .store(&enumChainSep_)
1432                            .description("Condition in PDB files when a new chain should be started (adding termini)"));
1433     options->addOption(EnumOption<MergeType>("merge").enumValue(MergeEnum)
1434                            .store(&enumMerge_)
1435                            .description("Merge multiple chains into a single [moleculetype]"));
1436     options->addOption(StringOption("ff")
1437                            .store(&ff_).defaultValue("select")
1438                            .description("Force field, interactive by default. Use [TT]-h[tt] for information."));
1439     options->addOption(EnumOption<WaterType>("water")
1440                            .store(&enumWater_).enumValue(WaterEnum)
1441                            .description("Water model to use"));
1442     options->addOption(BooleanOption("inter")
1443                            .store(&bInter_).defaultValue(false)
1444                            .description("Set the next 8 options to interactive"));
1445     options->addOption(BooleanOption("ss")
1446                            .store(&bCysMan_).defaultValue(false)
1447                            .description("Interactive SS bridge selection"));
1448     options->addOption(BooleanOption("ter")
1449                            .store(&bTerMan_).defaultValue(false)
1450                            .description("Interactive termini selection, instead of charged (default)"));
1451     options->addOption(BooleanOption("lys")
1452                            .store(&bLysMan_).defaultValue(false)
1453                            .description("Interactive lysine selection, instead of charged"));
1454     options->addOption(BooleanOption("arg")
1455                            .store(&bArgMan_).defaultValue(false)
1456                            .description("Interactive arginine selection, instead of charged"));
1457     options->addOption(BooleanOption("asp")
1458                            .store(&bAspMan_).defaultValue(false)
1459                            .description("Interactive aspartic acid selection, instead of charged"));
1460     options->addOption(BooleanOption("glu")
1461                            .store(&bGluMan_).defaultValue(false)
1462                            .description("Interactive glutamic acid selection, instead of charged"));
1463     options->addOption(BooleanOption("gln")
1464                            .store(&bGlnMan_).defaultValue(false)
1465                            .description("Interactive glutamine selection, instead of charged"));
1466     options->addOption(BooleanOption("his")
1467                            .store(&bHisMan_).defaultValue(false)
1468                            .description("Interactive histidine selection, instead of checking H-bonds"));
1469     options->addOption(RealOption("angle")
1470                            .store(&angle_).defaultValue(135.0)
1471                            .description("Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1472     options->addOption(RealOption("dist")
1473                            .store(&distance_).defaultValue(0.3)
1474                            .description("Maximum donor-acceptor distance for a H-bond (nm)"));
1475     options->addOption(BooleanOption("una")
1476                            .store(&bUnA_).defaultValue(false)
1477                            .description("Select aromatic rings with united CH atoms on phenylalanine, tryptophane and tyrosine"));
1478     options->addOption(BooleanOption("sort")
1479                            .store(&bSort_).defaultValue(true).hidden()
1480                            .description("Sort the residues according to database, turning this off is dangerous as charge groups might be broken in parts"));
1481     options->addOption(BooleanOption("ignh")
1482                            .store(&bRemoveH_).defaultValue(false)
1483                            .description("Ignore hydrogen atoms that are in the coordinate file"));
1484     options->addOption(BooleanOption("missing")
1485                            .store(&bAllowMissing_).defaultValue(false)
1486                            .description("Continue when atoms are missing and bonds cannot be made, dangerous"));
1487     options->addOption(BooleanOption("v")
1488                            .store(&bVerbose_).defaultValue(false)
1489                            .description("Be slightly more verbose in messages"));
1490     options->addOption(RealOption("posrefc")
1491                            .store(&posre_fc_).defaultValue(1000)
1492                            .description("Force constant for position restraints"));
1493     options->addOption(EnumOption<VSitesType>("vsite")
1494                            .store(&enumVSites_).enumValue(VSitesEnum)
1495                            .description("Convert atoms to virtual sites"));
1496     options->addOption(BooleanOption("heavyh")
1497                            .store(&bHeavyH_).defaultValue(false)
1498                            .description("Make hydrogen atoms heavy"));
1499     options->addOption(BooleanOption("deuterate")
1500                            .store(&bDeuterate_).defaultValue(false)
1501                            .description("Change the mass of hydrogens to 2 amu"));
1502     options->addOption(BooleanOption("chargegrp")
1503                            .store(&bChargeGroups_).defaultValue(true)
1504                            .description("Use charge groups in the [REF].rtp[ref] file"));
1505     options->addOption(BooleanOption("cmap")
1506                            .store(&bCmap_).defaultValue(true)
1507                            .description("Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1508     options->addOption(BooleanOption("renum")
1509                            .store(&bRenumRes_).defaultValue(false)
1510                            .description("Renumber the residues consecutively in the output"));
1511     options->addOption(BooleanOption("rtpres")
1512                            .store(&bRTPresname_).defaultValue(false)
1513                            .description("Use [REF].rtp[ref] entry names as residue names"));
1514     options->addOption(FileNameOption("f")
1515                            .legacyType(efSTX).inputFile()
1516                            .store(&inputConfFile_).required()
1517                            .defaultBasename("protein").defaultType(efPDB)
1518                            .description("Structure file"));
1519     options->addOption(FileNameOption("o")
1520                            .legacyType(efSTO).outputFile()
1521                            .store(&outputConfFile_).required()
1522                            .defaultBasename("conf")
1523                            .description("Structure file"));
1524     options->addOption(FileNameOption("p")
1525                            .legacyType(efTOP).outputFile()
1526                            .store(&topologyFile_).required()
1527                            .defaultBasename("topol")
1528                            .description("Topology file"));
1529     options->addOption(FileNameOption("i")
1530                            .legacyType(efITP).outputFile()
1531                            .store(&includeTopologyFile_).required()
1532                            .defaultBasename("posre")
1533                            .description("Include file for topology"));
1534     options->addOption(FileNameOption("n")
1535                            .legacyType(efNDX).outputFile()
1536                            .store(&indexOutputFile_).storeIsSet(&bIndexSet_)
1537                            .defaultBasename("index")
1538                            .description("Index file"));
1539     options->addOption(FileNameOption("q")
1540                            .legacyType(efSTO).outputFile()
1541                            .store(&outFile_).storeIsSet(&bOutputSet_)
1542                            .defaultBasename("clean").defaultType(efPDB)
1543                            .description("Structure file"));
1544 }
1545
1546 void pdb2gmx::optionsFinished()
1547 {
1548     if (inputConfFile_.empty())
1549     {
1550         GMX_THROW(InconsistentInputError("You must supply an input file"));
1551     }
1552     if (bInter_)
1553     {
1554         /* if anything changes here, also change description of -inter */
1555         bCysMan_ = true;
1556         bTerMan_ = true;
1557         bLysMan_ = true;
1558         bArgMan_ = true;
1559         bAspMan_ = true;
1560         bGluMan_ = true;
1561         bGlnMan_ = true;
1562         bHisMan_ = true;
1563     }
1564
1565     if (bHeavyH_)
1566     {
1567         mHmult_ = 4.0;
1568     }
1569     else if (bDeuterate_)
1570     {
1571         mHmult_ = 2.0;
1572     }
1573     else
1574     {
1575         mHmult_ = 1.0;
1576     }
1577
1578     /* Force field selection, interactive or direct */
1579     choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(),
1580               forcefield_, sizeof(forcefield_),
1581               ffdir_, sizeof(ffdir_));
1582
1583     if (strlen(forcefield_) > 0)
1584     {
1585         ffname_    = forcefield_;
1586         ffname_[0] = std::toupper(ffname_[0]);
1587     }
1588     else
1589     {
1590         gmx_fatal(FARGS, "Empty forcefield string");
1591     }
1592 }
1593
1594 int pdb2gmx::run()
1595 {
1596     char                       select[STRLEN];
1597     std::vector<DisulfideBond> ssbonds;
1598
1599     int                        this_atomnum;
1600     int                        prev_atomnum;
1601     const char                *prev_atomname;
1602     const char                *this_atomname;
1603     const char                *prev_resname;
1604     const char                *this_resname;
1605     int                        prev_resnum;
1606     int                        this_resnum;
1607     char                       prev_chainid;
1608     char                       this_chainid;
1609     int                        prev_chainnumber;
1610     int                        this_chainnumber;
1611     int                        this_chainstart;
1612     int                        prev_chainstart;
1613
1614     printf("\nUsing the %s force field in directory %s\n\n",
1615            ffname_, ffdir_);
1616
1617     choose_watermodel(WaterEnum[enumWater_], ffdir_, &watermodel_);
1618
1619     switch (enumVSites_)
1620     {
1621         case enVSites_none:
1622             bVsites_         = false;
1623             bVsiteAromatics_ = false;
1624             break;
1625         case enVSites_hydrogens:
1626             bVsites_         = true;
1627             bVsiteAromatics_ = false;
1628             break;
1629         case enVSites_aromatics:
1630             bVsites_         = true;
1631             bVsiteAromatics_ = true;
1632             break;
1633         default:
1634             gmx_fatal(FARGS, "Internal inconsistency: VSitesEnum='%s'", VSitesEnum[enumVSites_]);
1635     }       /* end switch */
1636
1637     /* Open the symbol table */
1638     t_symtab symtab;
1639     open_symtab(&symtab);
1640
1641     /* Residue type database */
1642     ResidueType rt;
1643
1644     /* Read residue renaming database(s), if present */
1645     std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
1646
1647     std::vector<RtpRename>   rtprename;
1648     for (const auto &filename : rrn)
1649     {
1650         printf("going to rename %s\n", filename.c_str());
1651         FILE *fp = fflib_open(filename);
1652         read_rtprename(filename.c_str(), fp, &rtprename);
1653         gmx_ffclose(fp);
1654     }
1655
1656     /* Add all alternative names from the residue renaming database to the list
1657        of recognized amino/nucleic acids. */
1658     for (const auto &rename : rtprename)
1659     {
1660         /* Only add names if the 'standard' gromacs/iupac base name was found */
1661         if (auto restype = rt.optionalTypeOfNamedDatabaseResidue(rename.gmx))
1662         {
1663             rt.addResidue(rename.main, *restype);
1664             rt.addResidue(rename.nter, *restype);
1665             rt.addResidue(rename.cter, *restype);
1666             rt.addResidue(rename.bter, *restype);
1667         }
1668     }
1669
1670     matrix      box;
1671     const char *watres;
1672     clear_mat(box);
1673     if (watermodel_ != nullptr && (strstr(watermodel_, "4p") ||
1674                                    strstr(watermodel_, "4P")))
1675     {
1676         watres = "HO4";
1677     }
1678     else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") ||
1679                                         strstr(watermodel_, "5P")))
1680     {
1681         watres = "HO5";
1682     }
1683     else
1684     {
1685         watres = "HOH";
1686     }
1687
1688     AtomProperties aps;
1689     char          *title = nullptr;
1690     int            ePBC;
1691     t_atoms        pdba_all;
1692     rvec          *pdbx;
1693     int            natom = read_pdball(inputConfFile_.c_str(), bOutputSet_, outFile_.c_str(),
1694                                        &title, &pdba_all, &pdbx, &ePBC, box, bRemoveH_,
1695                                        &symtab, &rt, watres, &aps, bVerbose_);
1696
1697     if (natom == 0)
1698     {
1699         std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
1700         GMX_THROW(InconsistentInputError(message));
1701     }
1702
1703     printf("Analyzing pdb file\n");
1704     int nwaterchain = 0;
1705
1706     modify_chain_numbers(&pdba_all, enumChainSep_);
1707
1708     int nchainmerges        = 0;
1709
1710     this_atomname       = nullptr;
1711     this_atomnum        = -1;
1712     this_resname        = nullptr;
1713     this_resnum         = -1;
1714     this_chainid        = '?';
1715     this_chainnumber    = -1;
1716     this_chainstart     = 0;
1717     /* Keep the compiler happy */
1718     prev_chainstart     = 0;
1719
1720     int                     numChains = 0;
1721     std::vector<t_pdbchain> pdb_ch;
1722
1723     t_resinfo              *ri;
1724     bool                    bMerged = false;
1725     for (int i = 0; (i < natom); i++)
1726     {
1727         ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1728
1729         /* TODO this should live in a helper object, and consolidate
1730            that with code in modify_chain_numbers */
1731         prev_atomname      = this_atomname;
1732         prev_atomnum       = this_atomnum;
1733         prev_resname       = this_resname;
1734         prev_resnum        = this_resnum;
1735         prev_chainid       = this_chainid;
1736         prev_chainnumber   = this_chainnumber;
1737         if (!bMerged)
1738         {
1739             prev_chainstart    = this_chainstart;
1740         }
1741
1742         this_atomname      = *pdba_all.atomname[i];
1743         this_atomnum       = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i+1;
1744         this_resname       = *ri->name;
1745         this_resnum        = ri->nr;
1746         this_chainid       = ri->chainid;
1747         this_chainnumber   = ri->chainnum;
1748
1749         bWat_ = gmx::equalCaseInsensitive(*ri->name, watres);
1750
1751         if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
1752         {
1753             GMX_RELEASE_ASSERT(pdba_all.pdbinfo, "Must have pdbinfo from reading a PDB file if chain number is changing");
1754             this_chainstart = pdba_all.atom[i].resind;
1755             bMerged         = false;
1756             if (i > 0 && !bWat_)
1757             {
1758                 if (!strncmp(MergeEnum[enumMerge_], "int", 3))
1759                 {
1760                     printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and chain starting with\n"
1761                            "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype (keeping termini)? [n/y]\n",
1762                            prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1763                            this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1764
1765                     if (nullptr == fgets(select, STRLEN-1, stdin))
1766                     {
1767                         gmx_fatal(FARGS, "Error reading from stdin");
1768                     }
1769                     bMerged = (select[0] == 'y');
1770                 }
1771                 else if (!strncmp(MergeEnum[enumMerge_], "all", 3))
1772                 {
1773                     bMerged = true;
1774                 }
1775             }
1776
1777             if (bMerged)
1778             {
1779                 pdb_ch[numChains-1].chainstart[pdb_ch[numChains-1].nterpairs] =
1780                     pdba_all.atom[i].resind - prev_chainstart;
1781                 pdb_ch[numChains-1].nterpairs++;
1782                 pdb_ch[numChains-1].chainstart.resize(pdb_ch[numChains-1].nterpairs+1);
1783                 nchainmerges++;
1784             }
1785             else
1786             {
1787                 /* set natom for previous chain */
1788                 if (numChains > 0)
1789                 {
1790                     pdb_ch[numChains-1].natom = i-pdb_ch[numChains-1].start;
1791                 }
1792                 if (bWat_)
1793                 {
1794                     nwaterchain++;
1795                     ri->chainid = ' ';
1796                 }
1797                 /* check if chain identifier was used before */
1798                 for (int j = 0; (j < numChains); j++)
1799                 {
1800                     if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1801                     {
1802                         printf("WARNING: Chain identifier '%c' is used in two non-sequential blocks.\n"
1803                                "They will be treated as separate chains unless you reorder your file.\n",
1804                                ri->chainid);
1805                     }
1806                 }
1807                 t_pdbchain newChain;
1808                 newChain.chainid  = ri->chainid;
1809                 newChain.chainnum = ri->chainnum;
1810                 newChain.start    = i;
1811                 newChain.bAllWat  = bWat_;
1812                 if (bWat_)
1813                 {
1814                     newChain.nterpairs = 0;
1815                 }
1816                 else
1817                 {
1818                     newChain.nterpairs = 1;
1819                 }
1820                 newChain.chainstart.resize(newChain.nterpairs+1);
1821                 /* modified [numChains] to [0] below */
1822                 newChain.chainstart[0] = 0;
1823                 pdb_ch.push_back(newChain);
1824                 numChains++;
1825             }
1826         }
1827         bPrevWat_ = bWat_;
1828     }
1829     pdb_ch.back().natom = natom-pdb_ch.back().start;
1830
1831     /* set all the water blocks at the end of the chain */
1832     std::vector<int> swap_index(numChains);
1833     int              j = 0;
1834     for (int i = 0; i < numChains; i++)
1835     {
1836         if (!pdb_ch[i].bAllWat)
1837         {
1838             swap_index[j] = i;
1839             j++;
1840         }
1841     }
1842     for (int i = 0; i < numChains; i++)
1843     {
1844         if (pdb_ch[i].bAllWat)
1845         {
1846             swap_index[j] = i;
1847             j++;
1848         }
1849     }
1850     if (nwaterchain > 1)
1851     {
1852         printf("Moved all the water blocks to the end\n");
1853     }
1854
1855     t_atoms             *pdba;
1856     std::vector<t_chain> chains(numChains);
1857     /* copy pdb data and x for all chains */
1858     for (int i = 0; (i < numChains); i++)
1859     {
1860         int si                   = swap_index[i];
1861         chains[i].chainid    = pdb_ch[si].chainid;
1862         chains[i].chainnum   = pdb_ch[si].chainnum;
1863         chains[i].bAllWat    = pdb_ch[si].bAllWat;
1864         chains[i].nterpairs  = pdb_ch[si].nterpairs;
1865         chains[i].chainstart = pdb_ch[si].chainstart;
1866         chains[i].ntdb.clear();
1867         chains[i].ctdb.clear();
1868         chains[i].r_start.resize(pdb_ch[si].nterpairs);
1869         chains[i].r_end.resize(pdb_ch[si].nterpairs);
1870
1871         snew(chains[i].pdba, 1);
1872         init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
1873         for (j = 0; j < chains[i].pdba->nr; j++)
1874         {
1875             chains[i].pdba->atom[j]     = pdba_all.atom[pdb_ch[si].start+j];
1876             chains[i].pdba->atomname[j] =
1877                 put_symtab(&symtab, *pdba_all.atomname[pdb_ch[si].start+j]);
1878             chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start+j];
1879             chains[i].x.emplace_back(pdbx[pdb_ch[si].start+j]);
1880         }
1881         /* Re-index the residues assuming that the indices are continuous */
1882         int k                = chains[i].pdba->atom[0].resind;
1883         int nres             = chains[i].pdba->atom[chains[i].pdba->nr-1].resind - k + 1;
1884         chains[i].pdba->nres = nres;
1885         for (int j = 0; j < chains[i].pdba->nr; j++)
1886         {
1887             chains[i].pdba->atom[j].resind -= k;
1888         }
1889         srenew(chains[i].pdba->resinfo, nres);
1890         for (int j = 0; j < nres; j++)
1891         {
1892             chains[i].pdba->resinfo[j]      = pdba_all.resinfo[k+j];
1893             chains[i].pdba->resinfo[j].name = put_symtab(&symtab, *pdba_all.resinfo[k+j].name);
1894             /* make all chain identifiers equal to that of the chain */
1895             chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1896         }
1897     }
1898
1899     if (nchainmerges > 0)
1900     {
1901         printf("\nMerged chains into joint molecule definitions at %d places.\n\n",
1902                nchainmerges);
1903     }
1904
1905     printf("There are %d chains and %d blocks of water and "
1906            "%d residues with %d atoms\n",
1907            numChains-nwaterchain, nwaterchain,
1908            pdba_all.nres, natom);
1909
1910     printf("\n  %5s  %4s %6s\n", "chain", "#res", "#atoms");
1911     for (int i = 0; (i < numChains); i++)
1912     {
1913         printf("  %d '%c' %5d %6d  %s\n",
1914                i+1, chains[i].chainid ? chains[i].chainid : '-',
1915                chains[i].pdba->nres, chains[i].pdba->nr,
1916                chains[i].bAllWat ? "(only water)" : "");
1917     }
1918     printf("\n");
1919
1920     check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_);
1921
1922     /* Read atomtypes... */
1923     PreprocessingAtomTypes atype = read_atype(ffdir_, &symtab);
1924
1925     /* read residue database */
1926     printf("Reading residue database... (%s)\n", forcefield_);
1927     std::vector<std::string>       rtpf  = fflib_search_file_end(ffdir_, ".rtp", true);
1928     std::vector<PreprocessResidue> rtpFFDB;
1929     for (const auto &filename : rtpf)
1930     {
1931         readResidueDatabase(filename, &rtpFFDB, &atype, &symtab, false);
1932     }
1933     if (bNewRTP_)
1934     {
1935         /* Not correct with multiple rtp input files with different bonded types */
1936         FILE *fp = gmx_fio_fopen("new.rtp", "w");
1937         print_resall(fp, rtpFFDB, atype);
1938         gmx_fio_fclose(fp);
1939     }
1940
1941     /* read hydrogen database */
1942     std::vector<MoleculePatchDatabase> ah;
1943     read_h_db(ffdir_, &ah);
1944
1945     /* Read Termini database... */
1946     std::vector<MoleculePatchDatabase>                  ntdb;
1947     std::vector<MoleculePatchDatabase>                  ctdb;
1948     std::vector<MoleculePatchDatabase *>                tdblist;
1949     int                                                 nNtdb = read_ter_db(ffdir_, 'n', &ntdb, &atype);
1950     int                                                 nCtdb = read_ter_db(ffdir_, 'c', &ctdb, &atype);
1951
1952     FILE                                               *top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
1953
1954     print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
1955
1956     t_chain                *cc;
1957     std::vector<gmx::RVec>  x;
1958     /* new pdb datastructure for sorting. */
1959     t_atoms               **sortAtoms  = nullptr;
1960     t_atoms               **localAtoms = nullptr;
1961     snew(sortAtoms, numChains);
1962     snew(localAtoms, numChains);
1963     for (int chain = 0; (chain < numChains); chain++)
1964     {
1965         cc = &(chains[chain]);
1966
1967         /* set pdba, natom and nres to the current chain */
1968         pdba     = cc->pdba;
1969         x        = cc->x;
1970         natom    = cc->pdba->nr;
1971         int nres = cc->pdba->nres;
1972
1973         if (cc->chainid && ( cc->chainid != ' ' ) )
1974         {
1975             printf("Processing chain %d '%c' (%d atoms, %d residues)\n",
1976                    chain+1, cc->chainid, natom, nres);
1977         }
1978         else
1979         {
1980             printf("Processing chain %d (%d atoms, %d residues)\n",
1981                    chain+1, natom, nres);
1982         }
1983
1984         process_chain(pdba, x, bUnA_, bUnA_, bUnA_, bLysMan_, bAspMan_, bGluMan_,
1985                       bHisMan_, bArgMan_, bGlnMan_, angle_, distance_, &symtab,
1986                       rtprename);
1987
1988         cc->chainstart[cc->nterpairs] = pdba->nres;
1989         j = 0;
1990         for (int i = 0; i < cc->nterpairs; i++)
1991         {
1992             find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i+1],
1993                         &(cc->r_start[j]), &(cc->r_end[j]), &rt);
1994
1995             if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
1996             {
1997                 j++;
1998             }
1999         }
2000         cc->nterpairs = j;
2001         if (cc->nterpairs == 0)
2002         {
2003             printf("Problem with chain definition, or missing terminal residues.\n"
2004                    "This chain does not appear to contain a recognized chain molecule.\n"
2005                    "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
2006         }
2007
2008         /* Check for disulfides and other special bonds */
2009         ssbonds = makeDisulfideBonds(pdba, gmx::as_rvec_array(x.data()), bCysMan_, bVerbose_);
2010
2011         if (!rtprename.empty())
2012         {
2013             rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, rtprename,
2014                           &symtab, bVerbose_);
2015         }
2016
2017         for (int i = 0; i < cc->nterpairs; i++)
2018         {
2019             /* Set termini.
2020              * We first apply a filter so we only have the
2021              * termini that can be applied to the residue in question
2022              * (or a generic terminus if no-residue specific is available).
2023              */
2024             /* First the N terminus */
2025             if (nNtdb > 0)
2026             {
2027                 tdblist = filter_ter(ntdb,
2028                                      *pdba->resinfo[cc->r_start[i]].name);
2029                 if (tdblist.empty())
2030                 {
2031                     printf("No suitable end (N or 5') terminus found in database - assuming this residue\n"
2032                            "is already in a terminus-specific form and skipping terminus selection.\n");
2033                     cc->ntdb.push_back(nullptr);
2034                 }
2035                 else
2036                 {
2037                     if (bTerMan_ && !tdblist.empty())
2038                     {
2039                         sprintf(select, "Select start terminus type for %s-%d",
2040                                 *pdba->resinfo[cc->r_start[i]].name,
2041                                 pdba->resinfo[cc->r_start[i]].nr);
2042                         cc->ntdb.push_back(choose_ter(tdblist, select));
2043                     }
2044                     else
2045                     {
2046                         cc->ntdb.push_back(tdblist[0]);
2047                     }
2048
2049                     printf("Start terminus %s-%d: %s\n",
2050                            *pdba->resinfo[cc->r_start[i]].name,
2051                            pdba->resinfo[cc->r_start[i]].nr,
2052                            (cc->ntdb[i])->name.c_str());
2053                     tdblist.clear();
2054                 }
2055             }
2056             else
2057             {
2058                 cc->ntdb.push_back(nullptr);
2059             }
2060
2061             /* And the C terminus */
2062             if (nCtdb > 0)
2063             {
2064                 tdblist = filter_ter(ctdb,
2065                                      *pdba->resinfo[cc->r_end[i]].name);
2066                 if (tdblist.empty())
2067                 {
2068                     printf("No suitable end (C or 3') terminus found in database - assuming this residue\n"
2069                            "is already in a terminus-specific form and skipping terminus selection.\n");
2070                     cc->ctdb.push_back(nullptr);
2071                 }
2072                 else
2073                 {
2074                     if (bTerMan_ && !tdblist.empty())
2075                     {
2076                         sprintf(select, "Select end terminus type for %s-%d",
2077                                 *pdba->resinfo[cc->r_end[i]].name,
2078                                 pdba->resinfo[cc->r_end[i]].nr);
2079                         cc->ctdb.push_back(choose_ter(tdblist, select));
2080                     }
2081                     else
2082                     {
2083                         cc->ctdb.push_back(tdblist[0]);
2084                     }
2085                     printf("End terminus %s-%d: %s\n",
2086                            *pdba->resinfo[cc->r_end[i]].name,
2087                            pdba->resinfo[cc->r_end[i]].nr,
2088                            (cc->ctdb[i])->name.c_str());
2089                     tdblist.clear();
2090                 }
2091             }
2092             else
2093             {
2094                 cc->ctdb.push_back(nullptr);
2095             }
2096         }
2097         std::vector<MoleculePatchDatabase> hb_chain;
2098         /* lookup hackblocks and rtp for all residues */
2099         std::vector<PreprocessResidue>     restp_chain;
2100         get_hackblocks_rtp(&hb_chain, &restp_chain,
2101                            rtpFFDB, pdba->nres, pdba->resinfo,
2102                            cc->nterpairs, &symtab, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end,
2103                            bAllowMissing_);
2104         /* ideally, now we would not need the rtp itself anymore, but do
2105            everything using the hb and restp arrays. Unfortunately, that
2106            requires some re-thinking of code in gen_vsite.c, which I won't
2107            do now :( AF 26-7-99 */
2108
2109         rename_atoms(nullptr, ffdir_,
2110                      pdba, &symtab, restp_chain, false, &rt, false, bVerbose_);
2111
2112         match_atomnames_with_rtp(restp_chain, hb_chain, pdba, &symtab, x, bVerbose_);
2113
2114         if (bSort_)
2115         {
2116             char     **gnames;
2117             t_blocka  *block = new_blocka();
2118             snew(gnames, 1);
2119             sort_pdbatoms(restp_chain, natom, &pdba, &sortAtoms[chain], &x, block, &gnames);
2120             remove_duplicate_atoms(pdba, x, bVerbose_);
2121             if (bIndexSet_)
2122             {
2123                 if (bRemoveH_)
2124                 {
2125                     fprintf(stderr, "WARNING: with the -remh option the generated "
2126                             "index file (%s) might be useless\n"
2127                             "(the index file is generated before hydrogens are added)",
2128                             indexOutputFile_.c_str());
2129                 }
2130                 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2131             }
2132             for (int i = 0; i < block->nr; i++)
2133             {
2134                 sfree(gnames[i]);
2135             }
2136             sfree(gnames);
2137             done_blocka(block);
2138             sfree(block);
2139         }
2140         else
2141         {
2142             fprintf(stderr, "WARNING: "
2143                     "without sorting no check for duplicate atoms can be done\n");
2144         }
2145
2146         /* Generate Hydrogen atoms (and termini) in the sequence */
2147         printf("Generating any missing hydrogen atoms and/or adding termini.\n");
2148         add_h(&pdba, &localAtoms[chain], &x, ah, &symtab,
2149               cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing_);
2150         printf("Now there are %d residues with %d atoms\n",
2151                pdba->nres, pdba->nr);
2152
2153         /* make up molecule name(s) */
2154
2155         int         k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2156
2157         std::string restype = rt.typeOfNamedDatabaseResidue(*pdba->resinfo[k].name);
2158
2159         std::string molname;
2160         std::string suffix;
2161         if (cc->bAllWat)
2162         {
2163             molname = "Water";
2164         }
2165         else
2166         {
2167             this_chainid = cc->chainid;
2168
2169             /* Add the chain id if we have one */
2170             if (this_chainid != ' ')
2171             {
2172                 suffix.append(formatString("_chain_%c", this_chainid));
2173             }
2174
2175             /* Check if there have been previous chains with the same id */
2176             int nid_used = 0;
2177             for (int k = 0; k < chain; k++)
2178             {
2179                 if (cc->chainid == chains[k].chainid)
2180                 {
2181                     nid_used++;
2182                 }
2183             }
2184             /* Add the number for this chain identifier if there are multiple copies */
2185             if (nid_used > 0)
2186             {
2187                 suffix.append(formatString("%d", nid_used+1));
2188             }
2189
2190             if (suffix.length() > 0)
2191             {
2192                 molname.append(restype);
2193                 molname.append(suffix);
2194             }
2195             else
2196             {
2197                 molname = restype;
2198             }
2199         }
2200         std::string itp_fn   = topologyFile_;;
2201         std::string posre_fn = includeTopologyFile_;
2202         if ((numChains-nwaterchain > 1) && !cc->bAllWat)
2203         {
2204             bITP_  = true;
2205             printf("Chain time...\n");
2206             //construct the itp file name
2207             itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2208             itp_fn.append("_");
2209             itp_fn.append(molname);
2210             itp_fn.append(".itp");
2211             //now do the same for posre
2212             posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2213             posre_fn.append("_");
2214             posre_fn.append(molname);
2215             posre_fn.append(".itp");
2216             if (posre_fn == itp_fn)
2217             {
2218                 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2219             }
2220             incls_.emplace_back();
2221             incls_.back()    = itp_fn;
2222             itp_file_        = gmx_fio_fopen(itp_fn.c_str(), "w");
2223         }
2224         else
2225         {
2226             bITP_ = false;
2227         }
2228
2229         mols_.emplace_back();
2230         if (cc->bAllWat)
2231         {
2232             mols_.back().name = "SOL";
2233             mols_.back().nr   = pdba->nres;
2234         }
2235         else
2236         {
2237             mols_.back().name = molname;
2238             mols_.back().nr   = 1;
2239         }
2240
2241         if (bITP_)
2242         {
2243             print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2244         }
2245
2246         FILE *top_file2;
2247         if (cc->bAllWat)
2248         {
2249             top_file2 = nullptr;
2250         }
2251         else if (bITP_)
2252         {
2253             top_file2 = itp_file_;
2254         }
2255         else
2256         {
2257             top_file2 = top_file;
2258         }
2259
2260         pdb2top(top_file2, posre_fn.c_str(), molname.c_str(), pdba, &x, &atype, &symtab,
2261                 rtpFFDB,
2262                 restp_chain, hb_chain,
2263                 bAllowMissing_,
2264                 bVsites_, bVsiteAromatics_, ffdir_,
2265                 mHmult_, ssbonds,
2266                 long_bond_dist_, short_bond_dist_, bDeuterate_, bChargeGroups_, bCmap_,
2267                 bRenumRes_, bRTPresname_);
2268
2269         if (!cc->bAllWat)
2270         {
2271             write_posres(posre_fn.c_str(), pdba, posre_fc_);
2272         }
2273
2274         if (bITP_)
2275         {
2276             gmx_fio_fclose(itp_file_);
2277         }
2278
2279         /* pdba and natom have been reassigned somewhere so: */
2280         cc->pdba = pdba;
2281         cc->x    = x;
2282     }
2283
2284     if (watermodel_ == nullptr)
2285     {
2286         for (int chain = 0; chain < numChains; chain++)
2287         {
2288             if (chains[chain].bAllWat)
2289             {
2290                 auto message = formatString("You have chosen not to include a water model, "
2291                                             "but there is water in the input file. Select a "
2292                                             "water model or remove the water from your input file.");
2293                 GMX_THROW(InconsistentInputError(message));
2294             }
2295         }
2296     }
2297     else
2298     {
2299         std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2300         if (!fflib_fexist(waterFile))
2301         {
2302             auto message = formatString("The topology file '%s' for the selected water "
2303                                         "model '%s' can not be found in the force field "
2304                                         "directory. Select a different water model.",
2305                                         waterFile.c_str(), watermodel_);
2306             GMX_THROW(InconsistentInputError(message));
2307         }
2308     }
2309
2310     print_top_mols(top_file, title, ffdir_, watermodel_, incls_, mols_);
2311     gmx_fio_fclose(top_file);
2312
2313     /* now merge all chains back together */
2314     natom     = 0;
2315     int nres  = 0;
2316     for (int i = 0; (i < numChains); i++)
2317     {
2318         natom += chains[i].pdba->nr;
2319         nres  += chains[i].pdba->nres;
2320     }
2321     t_atoms *atoms;
2322     snew(atoms, 1);
2323     init_t_atoms(atoms, natom, false);
2324     for (int i = 0; i < atoms->nres; i++)
2325     {
2326         sfree(atoms->resinfo[i].name);
2327     }
2328     atoms->nres = nres;
2329     srenew(atoms->resinfo, nres);
2330     x.clear();
2331     int k = 0;
2332     int l = 0;
2333     for (int i = 0; (i < numChains); i++)
2334     {
2335         if (numChains > 1)
2336         {
2337             printf("Including chain %d in system: %d atoms %d residues\n",
2338                    i+1, chains[i].pdba->nr, chains[i].pdba->nres);
2339         }
2340         for (int j = 0; (j < chains[i].pdba->nr); j++)
2341         {
2342             atoms->atom[k]         = chains[i].pdba->atom[j];
2343             atoms->atom[k].resind += l; /* l is processed nr of residues */
2344             atoms->atomname[k]     = chains[i].pdba->atomname[j];
2345             atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2346             x.push_back(chains[i].x[j]);
2347             k++;
2348         }
2349         for (int j = 0; (j < chains[i].pdba->nres); j++)
2350         {
2351             atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2352             if (bRTPresname_)
2353             {
2354                 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2355             }
2356             l++;
2357         }
2358         done_atom(chains[i].pdba);
2359     }
2360
2361     if (numChains > 1)
2362     {
2363         fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2364         print_sums(atoms, true);
2365     }
2366
2367     rvec box_space;
2368     fprintf(stderr, "\nWriting coordinate file...\n");
2369     clear_rvec(box_space);
2370     if (box[0][0] == 0)
2371     {
2372         make_new_box(atoms->nr, gmx::as_rvec_array(x.data()), box, box_space, false);
2373     }
2374     write_sto_conf(outputConfFile_.c_str(), title, atoms, gmx::as_rvec_array(x.data()), nullptr, ePBC, box);
2375
2376     done_symtab(&symtab);
2377     done_atom(&pdba_all);
2378     done_atom(atoms);
2379     for (int chain = 0; chain < numChains; chain++)
2380     {
2381         sfree(sortAtoms[chain]);
2382         sfree(localAtoms[chain]);
2383     }
2384     sfree(sortAtoms);
2385     sfree(localAtoms);
2386     sfree(atoms);
2387     sfree(title);
2388     sfree(pdbx);
2389     printf("\t\t--------- PLEASE NOTE ------------\n");
2390     printf("You have successfully generated a topology from: %s.\n",
2391            inputConfFile_.c_str());
2392     if (watermodel_ != nullptr)
2393     {
2394         printf("The %s force field and the %s water model are used.\n",
2395                ffname_, watermodel_);
2396         sfree(watermodel_);
2397     }
2398     else
2399     {
2400         printf("The %s force field is used.\n",
2401                ffname_);
2402     }
2403     printf("\t\t--------- ETON ESAELP ------------\n");
2404
2405     return 0;
2406 }
2407
2408 }   // namespace
2409
2410 const char pdb2gmxInfo::name[]             = "pdb2gmx";
2411 const char pdb2gmxInfo::shortDescription[] =
2412     "Convert coordinate files to topology and FF-compliant coordinate files";
2413 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2414 {
2415     return std::make_unique<pdb2gmx>();
2416 }
2417
2418 } // namespace gmx