src/gromacs/gmxpreprocess/pdb2gmx.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "pdb2gmx.h"
  40
  41 #include <ctype.h>
  42 #include <stdlib.h>
  43 #include <string.h>
  44 #include <time.h>
  45
  46 #include "gromacs/commandline/pargs.h"
  47 #include "gromacs/fileio/confio.h"
  48 #include "gromacs/fileio/gmxfio.h"
  49 #include "gromacs/fileio/pdbio.h"
  50 #include "gromacs/gmxlib/conformation-utilities.h"
  51 #include "gromacs/gmxpreprocess/fflibutil.h"
  52 #include "gromacs/gmxpreprocess/genhydro.h"
  53 #include "gromacs/gmxpreprocess/h_db.h"
  54 #include "gromacs/gmxpreprocess/hizzie.h"
  55 #include "gromacs/gmxpreprocess/pdb2top.h"
  56 #include "gromacs/gmxpreprocess/pgutil.h"
  57 #include "gromacs/gmxpreprocess/resall.h"
  58 #include "gromacs/gmxpreprocess/specbond.h"
  59 #include "gromacs/gmxpreprocess/ter_db.h"
  60 #include "gromacs/gmxpreprocess/toputil.h"
  61 #include "gromacs/gmxpreprocess/xlate.h"
  62 #include "gromacs/math/vec.h"
  63 #include "gromacs/topology/atomprop.h"
  64 #include "gromacs/topology/block.h"
  65 #include "gromacs/topology/index.h"
  66 #include "gromacs/topology/residuetypes.h"
  67 #include "gromacs/topology/symtab.h"
  68 #include "gromacs/topology/topology.h"
  69 #include "gromacs/utility/arraysize.h"
  70 #include "gromacs/utility/cstringutil.h"
  71 #include "gromacs/utility/dir_separator.h"
  72 #include "gromacs/utility/fatalerror.h"
  73 #include "gromacs/utility/gmxassert.h"
  74 #include "gromacs/utility/smalloc.h"
  75 #include "gromacs/utility/strdb.h"
  76
  77 #define RTP_MAXCHAR 5
  78 typedef struct {
  79     char gmx[RTP_MAXCHAR+2];
  80     char main[RTP_MAXCHAR+2];
  81     char nter[RTP_MAXCHAR+2];
  82     char cter[RTP_MAXCHAR+2];
  83     char bter[RTP_MAXCHAR+2];
  84 } rtprename_t;
  85
  86
  87 static const char *res2bb_notermini(const char *name,
  88                                     int nrr, const rtprename_t *rr)
  89 {
  90     /* NOTE: This function returns the main building block name,
  91      *       it does not take terminal renaming into account.
  92      */
  93     int i;
  94
  95     i = 0;
  96     while (i < nrr && gmx_strcasecmp(name, rr[i].gmx) != 0)
  97     {
  98         i++;
  99     }
 100
 101     return (i < nrr ? rr[i].main : name);
 102 }
 103
 104 static const char *select_res(int nr, int resnr,
 105                               const char *name[], const char *expl[],
 106                               const char *title,
 107                               int nrr, const rtprename_t *rr)
 108 {
 109     int sel = 0;
 110
 111     printf("Which %s type do you want for residue %d\n", title, resnr+1);
 112     for (sel = 0; (sel < nr); sel++)
 113     {
 114         printf("%d. %s (%s)\n",
 115                sel, expl[sel], res2bb_notermini(name[sel], nrr, rr));
 116     }
 117     printf("\nType a number:"); fflush(stdout);
 118
 119     if (scanf("%d", &sel) != 1)
 120     {
 121         gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr+1);
 122     }
 123
 124     return name[sel];
 125 }
 126
 127 static const char *get_asptp(int resnr, int nrr, const rtprename_t *rr)
 128 {
 129     enum {
 130         easp, easpH, easpNR
 131     };
 132     const char *lh[easpNR]   = { "ASP", "ASPH" };
 133     const char *expl[easpNR] = {
 134         "Not protonated (charge -1)",
 135         "Protonated (charge 0)"
 136     };
 137
 138     return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", nrr, rr);
 139 }
 140
 141 static const char *get_glutp(int resnr, int nrr, const rtprename_t *rr)
 142 {
 143     enum {
 144         eglu, egluH, egluNR
 145     };
 146     const char *lh[egluNR]   = { "GLU", "GLUH" };
 147     const char *expl[egluNR] = {
 148         "Not protonated (charge -1)",
 149         "Protonated (charge 0)"
 150     };
 151
 152     return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", nrr, rr);
 153 }
 154
 155 static const char *get_glntp(int resnr, int nrr, const rtprename_t *rr)
 156 {
 157     enum {
 158         egln, eglnH, eglnNR
 159     };
 160     const char *lh[eglnNR]   = { "GLN", "QLN" };
 161     const char *expl[eglnNR] = {
 162         "Not protonated (charge 0)",
 163         "Protonated (charge +1)"
 164     };
 165
 166     return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", nrr, rr);
 167 }
 168
 169 static const char *get_lystp(int resnr, int nrr, const rtprename_t *rr)
 170 {
 171     enum {
 172         elys, elysH, elysNR
 173     };
 174     const  char *lh[elysNR]   = { "LYSN", "LYS" };
 175     const char  *expl[elysNR] = {
 176         "Not protonated (charge 0)",
 177         "Protonated (charge +1)"
 178     };
 179
 180     return select_res(elysNR, resnr, lh, expl, "LYSINE", nrr, rr);
 181 }
 182
 183 static const char *get_argtp(int resnr, int nrr, const rtprename_t *rr)
 184 {
 185     enum {
 186         earg, eargH, eargNR
 187     };
 188     const  char *lh[eargNR]   = { "ARGN", "ARG" };
 189     const char  *expl[eargNR] = {
 190         "Not protonated (charge 0)",
 191         "Protonated (charge +1)"
 192     };
 193
 194     return select_res(eargNR, resnr, lh, expl, "ARGININE", nrr, rr);
 195 }
 196
 197 static const char *get_histp(int resnr, int nrr, const rtprename_t *rr)
 198 {
 199     const char *expl[ehisNR] = {
 200         "H on ND1 only",
 201         "H on NE2 only",
 202         "H on ND1 and NE2",
 203         "Coupled to Heme"
 204     };
 205
 206     return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", nrr, rr);
 207 }
 208
 209 static void read_rtprename(const char *fname, FILE *fp,
 210                            int *nrtprename, rtprename_t **rtprename)
 211 {
 212     char         line[STRLEN], buf[STRLEN];
 213     int          n;
 214     rtprename_t *rr;
 215     int          ncol, nc;
 216
 217     n  = *nrtprename;
 218     rr = *rtprename;
 219
 220     ncol = 0;
 221     while (get_a_line(fp, line, STRLEN))
 222     {
 223         srenew(rr, n+1);
 224         /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
 225          * For other args, we read up to 6 chars (so we can detect if the length is > 5).
 226          * Note that the buffer length has been increased to 7 to allow this,
 227          * so we just need to make sure the strings have zero-length initially.
 228          */
 229         rr[n].gmx[0]  = '\0';
 230         rr[n].main[0] = '\0';
 231         rr[n].nter[0] = '\0';
 232         rr[n].cter[0] = '\0';
 233         rr[n].bter[0] = '\0';
 234         nc            = sscanf(line, "%6s %6s %6s %6s %6s %s",
 235                                rr[n].gmx, rr[n].main, rr[n].nter, rr[n].cter, rr[n].bter, buf);
 236         if (strlen(rr[n].gmx) > RTP_MAXCHAR || strlen(rr[n].main) > RTP_MAXCHAR ||
 237             strlen(rr[n].nter) > RTP_MAXCHAR || strlen(rr[n].cter) > RTP_MAXCHAR || strlen(rr[n].bter) > RTP_MAXCHAR)
 238         {
 239             gmx_fatal(FARGS, "Residue renaming database '%s' has strings longer than %d chars in first 5 columns:\n%s",
 240                       fname, RTP_MAXCHAR, line);
 241         }
 242
 243         if (ncol == 0)
 244         {
 245             if (nc != 2 && nc != 5)
 246             {
 247                 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d, %d or %d", fname, ncol, 2, 5);
 248             }
 249             ncol = nc;
 250         }
 251         else if (nc != ncol)
 252         {
 253             gmx_fatal(FARGS, "A line in residue renaming database '%s' has %d columns, while previous lines have %d columns", fname, nc, ncol);
 254         }
 255
 256         if (nc == 2)
 257         {
 258             /* This file does not have special termini names, copy them from main */
 259             strcpy(rr[n].nter, rr[n].main);
 260             strcpy(rr[n].cter, rr[n].main);
 261             strcpy(rr[n].bter, rr[n].main);
 262         }
 263
 264         n++;
 265     }
 266
 267     *nrtprename = n;
 268     *rtprename  = rr;
 269 }
 270
 271 static char *search_resrename(int nrr, rtprename_t *rr,
 272                               const char *name,
 273                               gmx_bool bStart, gmx_bool bEnd,
 274                               gmx_bool bCompareFFRTPname)
 275 {
 276     char *nn;
 277     int   i;
 278
 279     nn = nullptr;
 280
 281     i = 0;
 282     while (i < nrr && ((!bCompareFFRTPname && strcmp(name, rr[i].gmx)  != 0) ||
 283                        ( bCompareFFRTPname && strcmp(name, rr[i].main) != 0)))
 284     {
 285         i++;
 286     }
 287
 288     /* If found in the database, rename this residue's rtp building block,
 289      * otherwise keep the old name.
 290      */
 291     if (i < nrr)
 292     {
 293         if (bStart && bEnd)
 294         {
 295             nn = rr[i].bter;
 296         }
 297         else if (bStart)
 298         {
 299             nn = rr[i].nter;
 300         }
 301         else if (bEnd)
 302         {
 303             nn = rr[i].cter;
 304         }
 305         else
 306         {
 307             nn = rr[i].main;
 308         }
 309
 310         if (nn[0] == '-')
 311         {
 312             gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name, bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
 313         }
 314     }
 315
 316     return nn;
 317 }
 318
 319
 320 static void rename_resrtp(t_atoms *pdba, int nterpairs, int *r_start, int *r_end,
 321                           int nrr, rtprename_t *rr, t_symtab *symtab,
 322                           gmx_bool bVerbose)
 323 {
 324     int      r, j;
 325     gmx_bool bStart, bEnd;
 326     char    *nn;
 327     gmx_bool bFFRTPTERRNM;
 328
 329     bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
 330
 331     for (r = 0; r < pdba->nres; r++)
 332     {
 333         bStart = FALSE;
 334         bEnd   = FALSE;
 335         for (j = 0; j < nterpairs; j++)
 336         {
 337             if (r == r_start[j])
 338             {
 339                 bStart = TRUE;
 340             }
 341         }
 342         for (j = 0; j < nterpairs; j++)
 343         {
 344             if (r == r_end[j])
 345             {
 346                 bEnd = TRUE;
 347             }
 348         }
 349
 350         nn = search_resrename(nrr, rr, *pdba->resinfo[r].rtp, bStart, bEnd, FALSE);
 351
 352         if (bFFRTPTERRNM && nn == nullptr && (bStart || bEnd))
 353         {
 354             /* This is a terminal residue, but the residue name,
 355              * currently stored in .rtp, is not a standard residue name,
 356              * but probably a force field specific rtp name.
 357              * Check if we need to rename it because it is terminal.
 358              */
 359             nn = search_resrename(nrr, rr,
 360                                   *pdba->resinfo[r].rtp, bStart, bEnd, TRUE);
 361         }
 362
 363         if (nn != nullptr && strcmp(*pdba->resinfo[r].rtp, nn) != 0)
 364         {
 365             if (bVerbose)
 366             {
 367                 printf("Changing rtp entry of residue %d %s to '%s'\n",
 368                        pdba->resinfo[r].nr, *pdba->resinfo[r].name, nn);
 369             }
 370             pdba->resinfo[r].rtp = put_symtab(symtab, nn);
 371         }
 372     }
 373 }
 374
 375 static void pdbres_to_gmxrtp(t_atoms *pdba)
 376 {
 377     int i;
 378
 379     for (i = 0; (i < pdba->nres); i++)
 380     {
 381         if (pdba->resinfo[i].rtp == nullptr)
 382         {
 383             pdba->resinfo[i].rtp = pdba->resinfo[i].name;
 384         }
 385     }
 386 }
 387
 388 static void rename_pdbres(t_atoms *pdba, const char *oldnm, const char *newnm,
 389                           gmx_bool bFullCompare, t_symtab *symtab)
 390 {
 391     char *resnm;
 392     int   i;
 393
 394     for (i = 0; (i < pdba->nres); i++)
 395     {
 396         resnm = *pdba->resinfo[i].name;
 397         if ((bFullCompare && (gmx_strcasecmp(resnm, oldnm) == 0)) ||
 398             (!bFullCompare && strstr(resnm, oldnm) != nullptr))
 399         {
 400             /* Rename the residue name (not the rtp name) */
 401             pdba->resinfo[i].name = put_symtab(symtab, newnm);
 402         }
 403     }
 404 }
 405
 406 static void rename_bb(t_atoms *pdba, const char *oldnm, const char *newnm,
 407                       gmx_bool bFullCompare, t_symtab *symtab)
 408 {
 409     char *bbnm;
 410     int   i;
 411
 412     for (i = 0; (i < pdba->nres); i++)
 413     {
 414         /* We have not set the rtp name yes, use the residue name */
 415         bbnm = *pdba->resinfo[i].name;
 416         if ((bFullCompare && (gmx_strcasecmp(bbnm, oldnm) == 0)) ||
 417             (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
 418         {
 419             /* Change the rtp builing block name */
 420             pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
 421         }
 422     }
 423 }
 424
 425 static void rename_bbint(t_atoms *pdba, const char *oldnm,
 426                          const char *gettp(int, int, const rtprename_t *),
 427                          gmx_bool bFullCompare,
 428                          t_symtab *symtab,
 429                          int nrr, const rtprename_t *rr)
 430 {
 431     int         i;
 432     const char *ptr;
 433     char       *bbnm;
 434
 435     for (i = 0; i < pdba->nres; i++)
 436     {
 437         /* We have not set the rtp name yes, use the residue name */
 438         bbnm = *pdba->resinfo[i].name;
 439         if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) ||
 440             (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
 441         {
 442             ptr                  = gettp(i, nrr, rr);
 443             pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
 444         }
 445     }
 446 }
 447
 448 static void check_occupancy(t_atoms *atoms, const char *filename, gmx_bool bVerbose)
 449 {
 450     int i, ftp;
 451     int nzero   = 0;
 452     int nnotone = 0;
 453
 454     ftp = fn2ftp(filename);
 455     if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
 456     {
 457         fprintf(stderr, "No occupancies in %s\n", filename);
 458     }
 459     else
 460     {
 461         for (i = 0; (i < atoms->nr); i++)
 462         {
 463             if (atoms->pdbinfo[i].occup != 1)
 464             {
 465                 if (bVerbose)
 466                 {
 467                     fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
 468                             *atoms->resinfo[atoms->atom[i].resind].name,
 469                             atoms->resinfo[ atoms->atom[i].resind].nr,
 470                             *atoms->atomname[i],
 471                             atoms->pdbinfo[i].occup);
 472                 }
 473                 if (atoms->pdbinfo[i].occup == 0)
 474                 {
 475                     nzero++;
 476                 }
 477                 else
 478                 {
 479                     nnotone++;
 480                 }
 481             }
 482         }
 483         if (nzero == atoms->nr)
 484         {
 485             fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
 486         }
 487         else if ((nzero > 0) || (nnotone > 0))
 488         {
 489             fprintf(stderr,
 490                     "\n"
 491                     "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
 492                     "         occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
 493                     "\n",
 494                     nzero, nnotone, atoms->nr);
 495         }
 496         else
 497         {
 498             fprintf(stderr, "All occupancies are one\n");
 499         }
 500     }
 501 }
 502
 503 void write_posres(char *fn, t_atoms *pdba, real fc)
 504 {
 505     FILE *fp;
 506     int   i;
 507
 508     fp = gmx_fio_fopen(fn, "w");
 509     fprintf(fp,
 510             "; In this topology include file, you will find position restraint\n"
 511             "; entries for all the heavy atoms in your original pdb file.\n"
 512             "; This means that all the protons which were added by pdb2gmx are\n"
 513             "; not restrained.\n"
 514             "\n"
 515             "[ position_restraints ]\n"
 516             "; %4s%6s%8s%8s%8s\n", "atom", "type", "fx", "fy", "fz"
 517             );
 518     for (i = 0; (i < pdba->nr); i++)
 519     {
 520         if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
 521         {
 522             fprintf(fp, "%6d%6d  %g  %g  %g\n", i+1, 1, fc, fc, fc);
 523         }
 524     }
 525     gmx_fio_fclose(fp);
 526 }
 527
 528 static int read_pdball(const char *inf, const char *outf, char *title,
 529                        t_atoms *atoms, rvec **x,
 530                        int *ePBC, matrix box, gmx_bool bRemoveH,
 531                        t_symtab *symtab, gmx_residuetype_t *rt, const char *watres,
 532                        gmx_atomprop_t aps, gmx_bool bVerbose)
 533 /* Read a pdb file. (containing proteins) */
 534 {
 535     int  natom, new_natom, i;
 536
 537     /* READ IT */
 538     printf("Reading %s...\n", inf);
 539     t_topology *top;
 540     snew(top, 1);
 541     read_tps_conf(inf, top, ePBC, x, nullptr, box, FALSE);
 542     strncpy(title, *top->name, STRLEN);
 543     title[STRLEN-1] = '\0';
 544     *atoms          = top->atoms;
 545     sfree(top);
 546     natom = atoms->nr;
 547     if (atoms->pdbinfo == nullptr)
 548     {
 549         snew(atoms->pdbinfo, atoms->nr);
 550     }
 551     if (fn2ftp(inf) == efPDB)
 552     {
 553         get_pdb_atomnumber(atoms, aps);
 554     }
 555     if (bRemoveH)
 556     {
 557         new_natom = 0;
 558         for (i = 0; i < atoms->nr; i++)
 559         {
 560             if (!is_hydrogen(*atoms->atomname[i]))
 561             {
 562                 atoms->atom[new_natom]     = atoms->atom[i];
 563                 atoms->atomname[new_natom] = atoms->atomname[i];
 564                 atoms->pdbinfo[new_natom]  = atoms->pdbinfo[i];
 565                 copy_rvec((*x)[i], (*x)[new_natom]);
 566                 new_natom++;
 567             }
 568         }
 569         atoms->nr = new_natom;
 570         natom     = new_natom;
 571     }
 572
 573     printf("Read");
 574     if (title[0])
 575     {
 576         printf(" '%s',", title);
 577     }
 578     printf(" %d atoms\n", natom);
 579
 580     /* Rename residues */
 581     rename_pdbres(atoms, "HOH", watres, FALSE, symtab);
 582     rename_pdbres(atoms, "SOL", watres, FALSE, symtab);
 583     rename_pdbres(atoms, "WAT", watres, FALSE, symtab);
 584
 585     rename_atoms("xlateat.dat", nullptr,
 586                  atoms, symtab, nullptr, TRUE, rt, TRUE, bVerbose);
 587
 588     if (natom == 0)
 589     {
 590         return 0;
 591     }
 592
 593     if (outf)
 594     {
 595         write_sto_conf(outf, title, atoms, *x, nullptr, *ePBC, box);
 596     }
 597
 598     return natom;
 599 }
 600
 601 void process_chain(t_atoms *pdba, rvec *x,
 602                    gmx_bool bTrpU, gmx_bool bPheU, gmx_bool bTyrU,
 603                    gmx_bool bLysMan, gmx_bool bAspMan, gmx_bool bGluMan,
 604                    gmx_bool bHisMan, gmx_bool bArgMan, gmx_bool bGlnMan,
 605                    real angle, real distance, t_symtab *symtab,
 606                    int nrr, const rtprename_t *rr)
 607 {
 608     /* Rename aromatics, lys, asp and histidine */
 609     if (bTyrU)
 610     {
 611         rename_bb(pdba, "TYR", "TYRU", FALSE, symtab);
 612     }
 613     if (bTrpU)
 614     {
 615         rename_bb(pdba, "TRP", "TRPU", FALSE, symtab);
 616     }
 617     if (bPheU)
 618     {
 619         rename_bb(pdba, "PHE", "PHEU", FALSE, symtab);
 620     }
 621     if (bLysMan)
 622     {
 623         rename_bbint(pdba, "LYS", get_lystp, FALSE, symtab, nrr, rr);
 624     }
 625     if (bArgMan)
 626     {
 627         rename_bbint(pdba, "ARG", get_argtp, FALSE, symtab, nrr, rr);
 628     }
 629     if (bGlnMan)
 630     {
 631         rename_bbint(pdba, "GLN", get_glntp, FALSE, symtab, nrr, rr);
 632     }
 633     if (bAspMan)
 634     {
 635         rename_bbint(pdba, "ASP", get_asptp, FALSE, symtab, nrr, rr);
 636     }
 637     else
 638     {
 639         rename_bb(pdba, "ASPH", "ASP", FALSE, symtab);
 640     }
 641     if (bGluMan)
 642     {
 643         rename_bbint(pdba, "GLU", get_glutp, FALSE, symtab, nrr, rr);
 644     }
 645     else
 646     {
 647         rename_bb(pdba, "GLUH", "GLU", FALSE, symtab);
 648     }
 649
 650     if (!bHisMan)
 651     {
 652         set_histp(pdba, x, angle, distance);
 653     }
 654     else
 655     {
 656         rename_bbint(pdba, "HIS", get_histp, TRUE, symtab, nrr, rr);
 657     }
 658
 659     /* Initialize the rtp builing block names with the residue names
 660      * for the residues that have not been processed above.
 661      */
 662     pdbres_to_gmxrtp(pdba);
 663
 664     /* Now we have all rtp names set.
 665      * The rtp names will conform to Gromacs naming,
 666      * unless the input pdb file contained one or more force field specific
 667      * rtp names as residue names.
 668      */
 669 }
 670
 671 /* struct for sorting the atoms from the pdb file */
 672 typedef struct {
 673     int  resnr;  /* residue number               */
 674     int  j;      /* database order index         */
 675     int  index;  /* original atom number         */
 676     char anm1;   /* second letter of atom name   */
 677     char altloc; /* alternate location indicator */
 678 } t_pdbindex;
 679
 680 int pdbicomp(const void *a, const void *b)
 681 {
 682     t_pdbindex *pa, *pb;
 683     int         d;
 684
 685     pa = (t_pdbindex *)a;
 686     pb = (t_pdbindex *)b;
 687
 688     d = (pa->resnr - pb->resnr);
 689     if (d == 0)
 690     {
 691         d = (pa->j - pb->j);
 692         if (d == 0)
 693         {
 694             d = (pa->anm1 - pb->anm1);
 695             if (d == 0)
 696             {
 697                 d = (pa->altloc - pb->altloc);
 698             }
 699         }
 700     }
 701
 702     return d;
 703 }
 704
 705 static void sort_pdbatoms(t_restp restp[],
 706                           int natoms, t_atoms **pdbaptr, rvec **x,
 707                           t_blocka *block, char ***gnames)
 708 {
 709     t_atoms     *pdba, *pdbnew;
 710     rvec       **xnew;
 711     int          i, j;
 712     t_restp     *rptr;
 713     t_pdbindex  *pdbi;
 714     int         *a;
 715     char        *atomnm;
 716
 717     pdba   = *pdbaptr;
 718     natoms = pdba->nr;
 719     pdbnew = nullptr;
 720     snew(xnew, 1);
 721     snew(pdbi, natoms);
 722
 723     for (i = 0; i < natoms; i++)
 724     {
 725         atomnm = *pdba->atomname[i];
 726         rptr   = &restp[pdba->atom[i].resind];
 727         for (j = 0; (j < rptr->natom); j++)
 728         {
 729             if (gmx_strcasecmp(atomnm, *(rptr->atomname[j])) == 0)
 730             {
 731                 break;
 732             }
 733         }
 734         if (j == rptr->natom)
 735         {
 736             char buf[STRLEN];
 737
 738             sprintf(buf,
 739                     "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
 740                     "while sorting atoms.\n%s", atomnm,
 741                     *pdba->resinfo[pdba->atom[i].resind].name,
 742                     pdba->resinfo[pdba->atom[i].resind].nr,
 743                     rptr->resname,
 744                     rptr->natom,
 745                     is_hydrogen(atomnm) ?
 746                     "\nFor a hydrogen, this can be a different protonation state, or it\n"
 747                     "might have had a different number in the PDB file and was rebuilt\n"
 748                     "(it might for instance have been H3, and we only expected H1 & H2).\n"
 749                     "Note that hydrogens might have been added to the entry for the N-terminus.\n"
 750                     "Remove this hydrogen or choose a different protonation state to solve it.\n"
 751                     "Option -ignh will ignore all hydrogens in the input." : ".");
 752             gmx_fatal(FARGS, buf);
 753         }
 754         /* make shadow array to be sorted into indexgroup */
 755         pdbi[i].resnr  = pdba->atom[i].resind;
 756         pdbi[i].j      = j;
 757         pdbi[i].index  = i;
 758         pdbi[i].anm1   = atomnm[1];
 759         pdbi[i].altloc = pdba->pdbinfo[i].altloc;
 760     }
 761     qsort(pdbi, natoms, (size_t)sizeof(pdbi[0]), pdbicomp);
 762
 763     /* pdba is sorted in pdbnew using the pdbi index */
 764     snew(a, natoms);
 765     snew(pdbnew, 1);
 766     init_t_atoms(pdbnew, natoms, TRUE);
 767     snew(*xnew, natoms);
 768     pdbnew->nr   = pdba->nr;
 769     pdbnew->nres = pdba->nres;
 770     sfree(pdbnew->resinfo);
 771     pdbnew->resinfo = pdba->resinfo;
 772     for (i = 0; i < natoms; i++)
 773     {
 774         pdbnew->atom[i]     = pdba->atom[pdbi[i].index];
 775         pdbnew->atomname[i] = pdba->atomname[pdbi[i].index];
 776         pdbnew->pdbinfo[i]  = pdba->pdbinfo[pdbi[i].index];
 777         copy_rvec((*x)[pdbi[i].index], (*xnew)[i]);
 778         /* make indexgroup in block */
 779         a[i] = pdbi[i].index;
 780     }
 781     /* clean up */
 782     sfree(pdba->atomname);
 783     sfree(pdba->atom);
 784     sfree(pdba->pdbinfo);
 785     sfree(pdba);
 786     sfree(*x);
 787     /* copy the sorted pdbnew back to pdba */
 788     *pdbaptr = pdbnew;
 789     *x       = *xnew;
 790     add_grp(block, gnames, natoms, a, "prot_sort");
 791     sfree(xnew);
 792     sfree(a);
 793     sfree(pdbi);
 794 }
 795
 796 static int remove_duplicate_atoms(t_atoms *pdba, rvec x[], gmx_bool bVerbose)
 797 {
 798     int        i, j, oldnatoms, ndel;
 799     t_resinfo *ri;
 800
 801     printf("Checking for duplicate atoms....\n");
 802     oldnatoms    = pdba->nr;
 803     ndel         = 0;
 804     /* NOTE: pdba->nr is modified inside the loop */
 805     for (i = 1; (i < pdba->nr); i++)
 806     {
 807         /* compare 'i' and 'i-1', throw away 'i' if they are identical
 808            this is a 'while' because multiple alternate locations can be present */
 809         while ( (i < pdba->nr) &&
 810                 (pdba->atom[i-1].resind == pdba->atom[i].resind) &&
 811                 (strcmp(*pdba->atomname[i-1], *pdba->atomname[i]) == 0) )
 812         {
 813             ndel++;
 814             if (bVerbose)
 815             {
 816                 ri = &pdba->resinfo[pdba->atom[i].resind];
 817                 printf("deleting duplicate atom %4s  %s%4d%c",
 818                        *pdba->atomname[i], *ri->name, ri->nr, ri->ic);
 819                 if (ri->chainid && (ri->chainid != ' '))
 820                 {
 821                     printf(" ch %c", ri->chainid);
 822                 }
 823                 if (pdba->pdbinfo)
 824                 {
 825                     if (pdba->pdbinfo[i].atomnr)
 826                     {
 827                         printf("  pdb nr %4d", pdba->pdbinfo[i].atomnr);
 828                     }
 829                     if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
 830                     {
 831                         printf("  altloc %c", pdba->pdbinfo[i].altloc);
 832                     }
 833                 }
 834                 printf("\n");
 835             }
 836             pdba->nr--;
 837             /* We can not free, since it might be in the symtab */
 838             /* sfree(pdba->atomname[i]); */
 839             for (j = i; j < pdba->nr; j++)
 840             {
 841                 pdba->atom[j]     = pdba->atom[j+1];
 842                 pdba->atomname[j] = pdba->atomname[j+1];
 843                 if (pdba->pdbinfo)
 844                 {
 845                     pdba->pdbinfo[j]  = pdba->pdbinfo[j+1];
 846                 }
 847                 copy_rvec(x[j+1], x[j]);
 848             }
 849             srenew(pdba->atom,     pdba->nr);
 850             /* srenew(pdba->atomname, pdba->nr); */
 851             srenew(pdba->pdbinfo,  pdba->nr);
 852         }
 853     }
 854     if (pdba->nr != oldnatoms)
 855     {
 856         printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
 857     }
 858
 859     return pdba->nr;
 860 }
 861
 862 void find_nc_ter(t_atoms *pdba, int r0, int r1, int *r_start, int *r_end,
 863                  gmx_residuetype_t *rt)
 864 {
 865     int         i;
 866     const char *p_startrestype;
 867     const char *p_restype;
 868     int         nstartwarn, nendwarn;
 869
 870     *r_start = -1;
 871     *r_end   = -1;
 872
 873     nstartwarn = 0;
 874     nendwarn   = 0;
 875
 876     /* Find the starting terminus (typially N or 5') */
 877     for (i = r0; i < r1 && *r_start == -1; i++)
 878     {
 879         gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_startrestype);
 880         if (!gmx_strcasecmp(p_startrestype, "Protein") || !gmx_strcasecmp(p_startrestype, "DNA") || !gmx_strcasecmp(p_startrestype, "RNA") )
 881         {
 882             printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 883             *r_start = i;
 884         }
 885         else
 886         {
 887             if (nstartwarn < 5)
 888             {
 889                 printf("Warning: Starting residue %s%d in chain not identified as Protein/RNA/DNA.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 890             }
 891             if (nstartwarn == 5)
 892             {
 893                 printf("More than 5 unidentified residues at start of chain - disabling further warnings.\n");
 894             }
 895             nstartwarn++;
 896         }
 897     }
 898
 899     if (*r_start >= 0)
 900     {
 901         /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
 902         for (i = *r_start; i < r1; i++)
 903         {
 904             gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_restype);
 905             if (!gmx_strcasecmp(p_restype, p_startrestype) && nendwarn == 0)
 906             {
 907                 *r_end = i;
 908             }
 909             else
 910             {
 911                 if (nendwarn < 5)
 912                 {
 913                     printf("Warning: Residue %s%d in chain has different type (%s) from starting residue %s%d (%s).\n",
 914                            *pdba->resinfo[i].name, pdba->resinfo[i].nr, p_restype,
 915                            *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr, p_startrestype);
 916                 }
 917                 if (nendwarn == 5)
 918                 {
 919                     printf("More than 5 unidentified residues at end of chain - disabling further warnings.\n");
 920                 }
 921                 nendwarn++;
 922             }
 923         }
 924     }
 925
 926     if (*r_end >= 0)
 927     {
 928         printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name, pdba->resinfo[*r_end].nr);
 929     }
 930 }
 931
 932
 933 enum SplittingType
 934 {
 935     SPLIT_ID_OR_TER,
 936     SPLIT_ID_AND_TER,
 937     SPLIT_ID_ONLY,
 938     SPLIT_TER_ONLY,
 939     SPLIT_INTERACTIVE
 940 };
 941
 942 static SplittingType getSplittingType(const char *chainsep)
 943 {
 944     SplittingType splitting = SPLIT_TER_ONLY; /* keep compiler happy */
 945
 946     /* Be a bit flexible to catch typos */
 947     if (!strncmp(chainsep, "id_o", 4))
 948     {
 949         /* For later interactive splitting we tentatively assign new chain numbers at either changing id or ter records */
 950         splitting = SPLIT_ID_OR_TER;
 951         printf("Splitting chemical chains based on TER records or chain id changing.\n");
 952     }
 953     else if (!strncmp(chainsep, "int", 3))
 954     {
 955         /* For later interactive splitting we tentatively assign new chain numbers at either changing id or ter records */
 956         splitting = SPLIT_INTERACTIVE;
 957         printf("Splitting chemical chains interactively.\n");
 958     }
 959     else if (!strncmp(chainsep, "id_a", 4))
 960     {
 961         splitting = SPLIT_ID_AND_TER;
 962         printf("Splitting chemical chains based on TER records and chain id changing.\n");
 963     }
 964     else if (strlen(chainsep) == 2 && !strncmp(chainsep, "id", 4))
 965     {
 966         splitting = SPLIT_ID_ONLY;
 967         printf("Splitting chemical chains based on changing chain id only (ignoring TER records).\n");
 968     }
 969     else if (chainsep[0] == 't')
 970     {
 971         splitting = SPLIT_TER_ONLY;
 972         printf("Splitting chemical chains based on TER records only (ignoring chain id).\n");
 973     }
 974     else
 975     {
 976         gmx_fatal(FARGS, "Unidentified setting for chain separation: %s\n", chainsep);
 977     }
 978     return splitting;
 979 }
 980
 981 static void
 982 modify_chain_numbers(t_atoms *       pdba,
 983                      const char *    chainsep)
 984 {
 985     int           i;
 986     char          old_prev_chainid;
 987     char          old_this_chainid;
 988     int           old_prev_chainnum;
 989     int           old_this_chainnum;
 990     t_resinfo    *ri;
 991     char          select[STRLEN];
 992     int           new_chainnum;
 993     int           this_atomnum;
 994     int           prev_atomnum;
 995     const char *  prev_atomname;
 996     const char *  this_atomname;
 997     const char *  prev_resname;
 998     const char *  this_resname;
 999     int           prev_resnum;
1000     int           this_resnum;
1001     char          prev_chainid;
1002     char          this_chainid;
1003
1004     SplittingType splitting = getSplittingType(chainsep);
1005
1006     /* The default chain enumeration is based on TER records only, which is reflected in chainnum below */
1007
1008     old_prev_chainid  = '?';
1009     old_prev_chainnum = -1;
1010     new_chainnum      = -1;
1011
1012     this_atomname       = nullptr;
1013     this_atomnum        = -1;
1014     this_resname        = nullptr;
1015     this_resnum         = -1;
1016     this_chainid        = '?';
1017
1018     for (i = 0; i < pdba->nres; i++)
1019     {
1020         ri                 = &pdba->resinfo[i];
1021         old_this_chainid   = ri->chainid;
1022         old_this_chainnum  = ri->chainnum;
1023
1024         prev_atomname      = this_atomname;
1025         prev_atomnum       = this_atomnum;
1026         prev_resname       = this_resname;
1027         prev_resnum        = this_resnum;
1028         prev_chainid       = this_chainid;
1029
1030         this_atomname      = *(pdba->atomname[i]);
1031         this_atomnum       = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i+1;
1032         this_resname       = *ri->name;
1033         this_resnum        = ri->nr;
1034         this_chainid       = ri->chainid;
1035
1036         switch (splitting)
1037         {
1038             case SPLIT_ID_OR_TER:
1039                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1040                 {
1041                     new_chainnum++;
1042                 }
1043                 break;
1044
1045             case SPLIT_ID_AND_TER:
1046                 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1047                 {
1048                     new_chainnum++;
1049                 }
1050                 break;
1051
1052             case SPLIT_ID_ONLY:
1053                 if (old_this_chainid != old_prev_chainid)
1054                 {
1055                     new_chainnum++;
1056                 }
1057                 break;
1058
1059             case SPLIT_TER_ONLY:
1060                 if (old_this_chainnum != old_prev_chainnum)
1061                 {
1062                     new_chainnum++;
1063                 }
1064                 break;
1065             case SPLIT_INTERACTIVE:
1066                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1067                 {
1068                     if (i > 0)
1069                     {
1070                         printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\n"
1071                                "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1072                                prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1073                                this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1074
1075                         if (nullptr == fgets(select, STRLEN-1, stdin))
1076                         {
1077                             gmx_fatal(FARGS, "Error reading from stdin");
1078                         }
1079                     }
1080                     if (i == 0 || select[0] == 'y')
1081                     {
1082                         new_chainnum++;
1083                     }
1084                 }
1085                 break;
1086             default:
1087                 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1088                 break;
1089         }
1090         old_prev_chainid  = old_this_chainid;
1091         old_prev_chainnum = old_this_chainnum;
1092
1093         ri->chainnum = new_chainnum;
1094     }
1095 }
1096
1097
1098 typedef struct {
1099     char     chainid;
1100     char     chainnum;
1101     int      start;
1102     int      natom;
1103     gmx_bool bAllWat;
1104     int      nterpairs;
1105     int     *chainstart;
1106 } t_pdbchain;
1107
1108 typedef struct {
1109     char          chainid;
1110     int           chainnum;
1111     gmx_bool      bAllWat;
1112     int           nterpairs;
1113     int          *chainstart;
1114     t_hackblock **ntdb;
1115     t_hackblock **ctdb;
1116     int          *r_start;
1117     int          *r_end;
1118     t_atoms      *pdba;
1119     rvec         *x;
1120 } t_chain;
1121
1122 int gmx_pdb2gmx(int argc, char *argv[])
1123 {
1124     const char *desc[] = {
1125         "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1126         "some database files, adds hydrogens to the molecules and generates",
1127         "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1128         "and a topology in GROMACS format.",
1129         "These files can subsequently be processed to generate a run input file.",
1130         "[PAR]",
1131         "[THISMODULE] will search for force fields by looking for",
1132         "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1133         "of the current working directory and of the GROMACS library directory",
1134         "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1135         "variable.",
1136         "By default the forcefield selection is interactive,",
1137         "but you can use the [TT]-ff[tt] option to specify one of the short names",
1138         "in the list on the command line instead. In that case [THISMODULE] just looks",
1139         "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1140         "[PAR]",
1141         "After choosing a force field, all files will be read only from",
1142         "the corresponding force field directory.",
1143         "If you want to modify or add a residue types, you can copy the force",
1144         "field directory from the GROMACS library directory to your current",
1145         "working directory. If you want to add new protein residue types,",
1146         "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1147         "or copy the whole library directory to a local directory and set",
1148         "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1149         "Check Chapter 5 of the manual for more information about file formats.",
1150         "[PAR]",
1151
1152         "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1153         "need not necessarily contain a protein structure. Every kind of",
1154         "molecule for which there is support in the database can be converted.",
1155         "If there is no support in the database, you can add it yourself.[PAR]",
1156
1157         "The program has limited intelligence, it reads a number of database",
1158         "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1159         "if necessary this can be done manually. The program can prompt the",
1160         "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1161         "desired. For Lys the choice is between neutral (two protons on NZ) or",
1162         "protonated (three protons, default), for Asp and Glu unprotonated",
1163         "(default) or protonated, for His the proton can be either on ND1,",
1164         "on NE2 or on both. By default these selections are done automatically.",
1165         "For His, this is based on an optimal hydrogen bonding",
1166         "conformation. Hydrogen bonds are defined based on a simple geometric",
1167         "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1168         "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1169         "[TT]-dist[tt] respectively.[PAR]",
1170
1171         "The protonation state of N- and C-termini can be chosen interactively",
1172         "with the [TT]-ter[tt] flag.  Default termini are ionized (NH3+ and COO-),",
1173         "respectively.  Some force fields support zwitterionic forms for chains of",
1174         "one residue, but for polypeptides these options should NOT be selected.",
1175         "The AMBER force fields have unique forms for the terminal residues,",
1176         "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1177         "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1178         "respectively to use these forms, making sure you preserve the format",
1179         "of the coordinate file. Alternatively, use named terminating residues",
1180         "(e.g. ACE, NME).[PAR]",
1181
1182         "The separation of chains is not entirely trivial since the markup",
1183         "in user-generated PDB files frequently varies and sometimes it",
1184         "is desirable to merge entries across a TER record, for instance",
1185         "if you want a disulfide bridge or distance restraints between",
1186         "two protein chains or if you have a HEME group bound to a protein.",
1187         "In such cases multiple chains should be contained in a single",
1188         "[TT]moleculetype[tt] definition.",
1189         "To handle this, [THISMODULE] uses two separate options.",
1190         "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1191         "start, and termini added when applicable. This can be done based on the",
1192         "existence of TER records, when the chain id changes, or combinations of either",
1193         "or both of these. You can also do the selection fully interactively.",
1194         "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1195         "are merged into one moleculetype, after adding all the chemical termini (or not).",
1196         "This can be turned off (no merging), all non-water chains can be merged into a",
1197         "single molecule, or the selection can be done interactively.[PAR]",
1198
1199         "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1200         "If any of the occupancies are not one, indicating that the atom is",
1201         "not resolved well in the structure, a warning message is issued.",
1202         "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1203         "all occupancy fields may be zero. Either way, it is up to the user",
1204         "to verify the correctness of the input data (read the article!).[PAR]",
1205
1206         "During processing the atoms will be reordered according to GROMACS",
1207         "conventions. With [TT]-n[tt] an index file can be generated that",
1208         "contains one group reordered in the same way. This allows you to",
1209         "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1210         "one limitation: reordering is done after the hydrogens are stripped",
1211         "from the input and before new hydrogens are added. This means that",
1212         "you should not use [TT]-ignh[tt].[PAR]",
1213
1214         "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1215         "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1216         "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1217         "[PAR]",
1218
1219         "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1220         "motions. Angular and out-of-plane motions can be removed by changing",
1221         "hydrogens into virtual sites and fixing angles, which fixes their",
1222         "position relative to neighboring atoms. Additionally, all atoms in the",
1223         "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1224         "can be converted into virtual sites, eliminating the fast improper dihedral",
1225         "fluctuations in these rings. [BB]Note[bb] that in this case all other hydrogen",
1226         "atoms are also converted to virtual sites. The mass of all atoms that are",
1227         "converted into virtual sites, is added to the heavy atoms.[PAR]",
1228         "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1229         "done by increasing the hydrogen-mass by a factor of 4. This is also",
1230         "done for water hydrogens to slow down the rotational motion of water.",
1231         "The increase in mass of the hydrogens is subtracted from the bonded",
1232         "(heavy) atom so that the total mass of the system remains the same."
1233     };
1234
1235
1236     FILE             *fp, *top_file, *top_file2, *itp_file = nullptr;
1237     int               natom, nres;
1238     t_atoms           pdba_all, *pdba;
1239     t_atoms          *atoms;
1240     t_resinfo        *ri;
1241     t_blocka         *block;
1242     int               chain, nch, maxch, nwaterchain;
1243     t_pdbchain       *pdb_ch;
1244     t_chain          *chains, *cc;
1245     char              select[STRLEN];
1246     int               nincl, nmol;
1247     char            **incls;
1248     t_mols           *mols;
1249     char            **gnames;
1250     int               ePBC;
1251     matrix            box;
1252     rvec              box_space;
1253     int               i, j, k, l, nrtp;
1254     int              *swap_index, si;
1255     t_restp          *restp;
1256     t_hackblock      *ah;
1257     t_symtab          symtab;
1258     gpp_atomtype_t    atype;
1259     gmx_residuetype_t*rt;
1260     const char       *top_fn;
1261     char              fn[256], itp_fn[STRLEN], posre_fn[STRLEN], buf_fn[STRLEN];
1262     char              molname[STRLEN], title[STRLEN];
1263     char             *c, forcefield[STRLEN], ffdir[STRLEN];
1264     char              ffname[STRLEN], suffix[STRLEN], buf[STRLEN];
1265     char             *watermodel;
1266     const char       *watres;
1267     int               nrtpf;
1268     char            **rtpf;
1269     int               nrrn;
1270     char            **rrn;
1271     int               nrtprename;
1272     rtprename_t      *rtprename = nullptr;
1273     int               nah, nNtdb, nCtdb, ntdblist;
1274     t_hackblock      *ntdb, *ctdb, **tdblist;
1275     int               nssbonds;
1276     t_ssbond         *ssbonds;
1277     rvec             *pdbx, *x;
1278     gmx_bool          bVsites = FALSE, bWat, bPrevWat = FALSE, bITP, bVsiteAromatics = FALSE;
1279     real              mHmult  = 0;
1280     t_hackblock      *hb_chain;
1281     t_restp          *restp_chain;
1282     gmx_output_env_t *oenv;
1283     const char       *p_restype;
1284     int               rc;
1285     int               this_atomnum;
1286     int               prev_atomnum;
1287     const char     *  prev_atomname;
1288     const char     *  this_atomname;
1289     const char     *  prev_resname;
1290     const char     *  this_resname;
1291     int               prev_resnum;
1292     int               this_resnum;
1293     char              prev_chainid;
1294     char              this_chainid;
1295     int               prev_chainnumber;
1296     int               this_chainnumber;
1297     int               nid_used;
1298     int               this_chainstart;
1299     int               prev_chainstart;
1300     gmx_bool          bMerged;
1301     int               nchainmerges;
1302
1303     gmx_atomprop_t    aps;
1304
1305     t_filenm          fnm[] = {
1306         { efSTX, "-f", "eiwit.pdb", ffREAD  },
1307         { efSTO, "-o", "conf",      ffWRITE },
1308         { efTOP, nullptr, nullptr,        ffWRITE },
1309         { efITP, "-i", "posre",     ffWRITE },
1310         { efNDX, "-n", "clean",     ffOPTWR },
1311         { efSTO, "-q", "clean.pdb", ffOPTWR }
1312     };
1313 #define NFILE asize(fnm)
1314
1315
1316     /* Command line arguments must be static */
1317     static gmx_bool    bNewRTP        = FALSE;
1318     static gmx_bool    bInter         = FALSE, bCysMan = FALSE;
1319     static gmx_bool    bLysMan        = FALSE, bAspMan = FALSE, bGluMan = FALSE, bHisMan = FALSE;
1320     static gmx_bool    bGlnMan        = FALSE, bArgMan = FALSE;
1321     static gmx_bool    bTerMan        = FALSE, bUnA = FALSE, bHeavyH;
1322     static gmx_bool    bSort          = TRUE, bAllowMissing = FALSE, bRemoveH = FALSE;
1323     static gmx_bool    bDeuterate     = FALSE, bVerbose = FALSE, bChargeGroups = TRUE, bCmap = TRUE;
1324     static gmx_bool    bRenumRes      = FALSE, bRTPresname = FALSE;
1325     static real        angle          = 135.0, distance = 0.3, posre_fc = 1000;
1326     static real        long_bond_dist = 0.25, short_bond_dist = 0.05;
1327     static const char *vsitestr[]     = { nullptr, "none", "hydrogens", "aromatics", nullptr };
1328     static const char *watstr[]       = { nullptr, "select", "none", "spc", "spce", "tip3p", "tip4p", "tip5p", nullptr };
1329     static const char *chainsep[]     = { nullptr, "id_or_ter", "id_and_ter", "ter", "id", "interactive", nullptr };
1330     static const char *merge[]        = {nullptr, "no", "all", "interactive", nullptr };
1331     static const char *ff             = "select";
1332
1333     t_pargs            pa[] = {
1334         { "-newrtp", FALSE, etBOOL, {&bNewRTP},
1335           "HIDDENWrite the residue database in new format to [TT]new.rtp[tt]"},
1336         { "-lb",     FALSE, etREAL, {&long_bond_dist},
1337           "HIDDENLong bond warning distance" },
1338         { "-sb",     FALSE, etREAL, {&short_bond_dist},
1339           "HIDDENShort bond warning distance" },
1340         { "-chainsep", FALSE, etENUM, {chainsep},
1341           "Condition in PDB files when a new chain should be started (adding termini)" },
1342         { "-merge",  FALSE, etENUM, {&merge},
1343           "Merge multiple chains into a single [moleculetype]" },
1344         { "-ff",     FALSE, etSTR,  {&ff},
1345           "Force field, interactive by default. Use [TT]-h[tt] for information." },
1346         { "-water",  FALSE, etENUM, {watstr},
1347           "Water model to use" },
1348         { "-inter",  FALSE, etBOOL, {&bInter},
1349           "Set the next 8 options to interactive"},
1350         { "-ss",     FALSE, etBOOL, {&bCysMan},
1351           "Interactive SS bridge selection" },
1352         { "-ter",    FALSE, etBOOL, {&bTerMan},
1353           "Interactive termini selection, instead of charged (default)" },
1354         { "-lys",    FALSE, etBOOL, {&bLysMan},
1355           "Interactive lysine selection, instead of charged" },
1356         { "-arg",    FALSE, etBOOL, {&bArgMan},
1357           "Interactive arginine selection, instead of charged" },
1358         { "-asp",    FALSE, etBOOL, {&bAspMan},
1359           "Interactive aspartic acid selection, instead of charged" },
1360         { "-glu",    FALSE, etBOOL, {&bGluMan},
1361           "Interactive glutamic acid selection, instead of charged" },
1362         { "-gln",    FALSE, etBOOL, {&bGlnMan},
1363           "Interactive glutamine selection, instead of neutral" },
1364         { "-his",    FALSE, etBOOL, {&bHisMan},
1365           "Interactive histidine selection, instead of checking H-bonds" },
1366         { "-angle",  FALSE, etREAL, {&angle},
1367           "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)" },
1368         { "-dist",   FALSE, etREAL, {&distance},
1369           "Maximum donor-acceptor distance for a H-bond (nm)" },
1370         { "-una",    FALSE, etBOOL, {&bUnA},
1371           "Select aromatic rings with united CH atoms on phenylalanine, "
1372           "tryptophane and tyrosine" },
1373         { "-sort",   FALSE, etBOOL, {&bSort},
1374           "HIDDENSort the residues according to database, turning this off is dangerous as charge groups might be broken in parts" },
1375         { "-ignh",   FALSE, etBOOL, {&bRemoveH},
1376           "Ignore hydrogen atoms that are in the coordinate file" },
1377         { "-missing", FALSE, etBOOL, {&bAllowMissing},
1378           "Continue when atoms are missing, dangerous" },
1379         { "-v",      FALSE, etBOOL, {&bVerbose},
1380           "Be slightly more verbose in messages" },
1381         { "-posrefc", FALSE, etREAL, {&posre_fc},
1382           "Force constant for position restraints" },
1383         { "-vsite",  FALSE, etENUM, {vsitestr},
1384           "Convert atoms to virtual sites" },
1385         { "-heavyh", FALSE, etBOOL, {&bHeavyH},
1386           "Make hydrogen atoms heavy" },
1387         { "-deuterate", FALSE, etBOOL, {&bDeuterate},
1388           "Change the mass of hydrogens to 2 amu" },
1389         { "-chargegrp", TRUE, etBOOL, {&bChargeGroups},
1390           "Use charge groups in the [REF].rtp[ref] file"  },
1391         { "-cmap", TRUE, etBOOL, {&bCmap},
1392           "Use cmap torsions (if enabled in the [REF].rtp[ref] file)"  },
1393         { "-renum", TRUE, etBOOL, {&bRenumRes},
1394           "Renumber the residues consecutively in the output"  },
1395         { "-rtpres", TRUE, etBOOL, {&bRTPresname},
1396           "Use [REF].rtp[ref] entry names as residue names"  }
1397     };
1398 #define NPARGS asize(pa)
1399
1400     if (!parse_common_args(&argc, argv, 0, NFILE, fnm, asize(pa), pa, asize(desc), desc,
1401                            0, nullptr, &oenv))
1402     {
1403         return 0;
1404     }
1405
1406     /* Force field selection, interactive or direct */
1407     choose_ff(strcmp(ff, "select") == 0 ? nullptr : ff,
1408               forcefield, sizeof(forcefield),
1409               ffdir, sizeof(ffdir));
1410
1411     if (strlen(forcefield) > 0)
1412     {
1413         strcpy(ffname, forcefield);
1414         ffname[0] = toupper(ffname[0]);
1415     }
1416     else
1417     {
1418         gmx_fatal(FARGS, "Empty forcefield string");
1419     }
1420
1421     printf("\nUsing the %s force field in directory %s\n\n",
1422            ffname, ffdir);
1423
1424     choose_watermodel(watstr[0], ffdir, &watermodel);
1425
1426     if (bInter)
1427     {
1428         /* if anything changes here, also change description of -inter */
1429         bCysMan = TRUE;
1430         bTerMan = TRUE;
1431         bLysMan = TRUE;
1432         bArgMan = TRUE;
1433         bAspMan = TRUE;
1434         bGluMan = TRUE;
1435         bGlnMan = TRUE;
1436         bHisMan = TRUE;
1437     }
1438
1439     if (bHeavyH)
1440     {
1441         mHmult = 4.0;
1442     }
1443     else if (bDeuterate)
1444     {
1445         mHmult = 2.0;
1446     }
1447     else
1448     {
1449         mHmult = 1.0;
1450     }
1451
1452     /* parse_common_args ensures vsitestr has been selected, but
1453        clang-static-analyzer needs clues to know that */
1454     GMX_ASSERT(vsitestr[0], "-vsite default wasn't processed correctly");
1455     switch (vsitestr[0][0])
1456     {
1457         case 'n': /* none */
1458             bVsites         = FALSE;
1459             bVsiteAromatics = FALSE;
1460             break;
1461         case 'h': /* hydrogens */
1462             bVsites         = TRUE;
1463             bVsiteAromatics = FALSE;
1464             break;
1465         case 'a': /* aromatics */
1466             bVsites         = TRUE;
1467             bVsiteAromatics = TRUE;
1468             break;
1469         default:
1470             gmx_fatal(FARGS, "DEATH HORROR in $s (%d): vsitestr[0]='%s'",
1471                       __FILE__, __LINE__, vsitestr[0]);
1472     } /* end switch */
1473
1474     /* Open the symbol table */
1475     open_symtab(&symtab);
1476
1477     /* Residue type database */
1478     gmx_residuetype_init(&rt);
1479
1480     /* Read residue renaming database(s), if present */
1481     nrrn = fflib_search_file_end(ffdir, ".r2b", FALSE, &rrn);
1482
1483     nrtprename = 0;
1484     rtprename  = nullptr;
1485     for (i = 0; i < nrrn; i++)
1486     {
1487         fp = fflib_open(rrn[i]);
1488         read_rtprename(rrn[i], fp, &nrtprename, &rtprename);
1489         gmx_ffclose(fp);
1490         sfree(rrn[i]);
1491     }
1492     sfree(rrn);
1493
1494     /* Add all alternative names from the residue renaming database to the list of recognized amino/nucleic acids. */
1495     for (i = 0; i < nrtprename; i++)
1496     {
1497         rc = gmx_residuetype_get_type(rt, rtprename[i].gmx, &p_restype);
1498
1499         /* Only add names if the 'standard' gromacs/iupac base name was found */
1500         if (rc == 0)
1501         {
1502             gmx_residuetype_add(rt, rtprename[i].main, p_restype);
1503             gmx_residuetype_add(rt, rtprename[i].nter, p_restype);
1504             gmx_residuetype_add(rt, rtprename[i].cter, p_restype);
1505             gmx_residuetype_add(rt, rtprename[i].bter, p_restype);
1506         }
1507     }
1508
1509     clear_mat(box);
1510     if (watermodel != nullptr && (strstr(watermodel, "4p") ||
1511                                   strstr(watermodel, "4P")))
1512     {
1513         watres = "HO4";
1514     }
1515     else if (watermodel != nullptr && (strstr(watermodel, "5p") ||
1516                                        strstr(watermodel, "5P")))
1517     {
1518         watres = "HO5";
1519     }
1520     else
1521     {
1522         watres = "HOH";
1523     }
1524
1525     aps   = gmx_atomprop_init();
1526     natom = read_pdball(opt2fn("-f", NFILE, fnm), opt2fn_null("-q", NFILE, fnm), title,
1527                         &pdba_all, &pdbx, &ePBC, box, bRemoveH, &symtab, rt, watres,
1528                         aps, bVerbose);
1529
1530     if (natom == 0)
1531     {
1532         gmx_fatal(FARGS, "No atoms found in pdb file %s\n", opt2fn("-f", NFILE, fnm));
1533     }
1534
1535     printf("Analyzing pdb file\n");
1536     nwaterchain = 0;
1537
1538     modify_chain_numbers(&pdba_all, chainsep[0]);
1539
1540     nchainmerges        = 0;
1541
1542     this_atomname       = nullptr;
1543     this_atomnum        = -1;
1544     this_resname        = nullptr;
1545     this_resnum         = -1;
1546     this_chainid        = '?';
1547     this_chainnumber    = -1;
1548     this_chainstart     = 0;
1549     /* Keep the compiler happy */
1550     prev_chainstart     = 0;
1551
1552     nch   = 0;
1553     maxch = 16;
1554     snew(pdb_ch, maxch);
1555
1556     bMerged = FALSE;
1557     for (i = 0; (i < natom); i++)
1558     {
1559         ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1560
1561         /* TODO this should live in a helper object, and consolidate
1562            that with code in modify_chain_numbers */
1563         prev_atomname      = this_atomname;
1564         prev_atomnum       = this_atomnum;
1565         prev_resname       = this_resname;
1566         prev_resnum        = this_resnum;
1567         prev_chainid       = this_chainid;
1568         prev_chainnumber   = this_chainnumber;
1569         if (!bMerged)
1570         {
1571             prev_chainstart    = this_chainstart;
1572         }
1573
1574         this_atomname      = *pdba_all.atomname[i];
1575         this_atomnum       = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i+1;
1576         this_resname       = *ri->name;
1577         this_resnum        = ri->nr;
1578         this_chainid       = ri->chainid;
1579         this_chainnumber   = ri->chainnum;
1580
1581         bWat = gmx_strcasecmp(*ri->name, watres) == 0;
1582         if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat != bPrevWat))
1583         {
1584             this_chainstart = pdba_all.atom[i].resind;
1585
1586             bMerged = FALSE;
1587             if (i > 0 && !bWat)
1588             {
1589                 if (!strncmp(merge[0], "int", 3))
1590                 {
1591                     printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and chain starting with\n"
1592                            "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype (keeping termini)? [n/y]\n",
1593                            prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1594                            this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1595
1596                     if (nullptr == fgets(select, STRLEN-1, stdin))
1597                     {
1598                         gmx_fatal(FARGS, "Error reading from stdin");
1599                     }
1600                     bMerged = (select[0] == 'y');
1601                 }
1602                 else if (!strncmp(merge[0], "all", 3))
1603                 {
1604                     bMerged = TRUE;
1605                 }
1606             }
1607
1608             if (bMerged)
1609             {
1610                 pdb_ch[nch-1].chainstart[pdb_ch[nch-1].nterpairs] =
1611                     pdba_all.atom[i].resind - prev_chainstart;
1612                 pdb_ch[nch-1].nterpairs++;
1613                 srenew(pdb_ch[nch-1].chainstart, pdb_ch[nch-1].nterpairs+1);
1614                 nchainmerges++;
1615             }
1616             else
1617             {
1618                 /* set natom for previous chain */
1619                 if (nch > 0)
1620                 {
1621                     pdb_ch[nch-1].natom = i-pdb_ch[nch-1].start;
1622                 }
1623                 if (bWat)
1624                 {
1625                     nwaterchain++;
1626                     ri->chainid = ' ';
1627                 }
1628                 /* check if chain identifier was used before */
1629                 for (j = 0; (j < nch); j++)
1630                 {
1631                     if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1632                     {
1633                         printf("WARNING: Chain identifier '%c' is used in two non-sequential blocks.\n"
1634                                "They will be treated as separate chains unless you reorder your file.\n",
1635                                ri->chainid);
1636                     }
1637                 }
1638                 // TODO This is too convoluted. Use a std::vector
1639                 if (nch == maxch)
1640                 {
1641                     maxch += 16;
1642                     srenew(pdb_ch, maxch);
1643                 }
1644                 pdb_ch[nch].chainid  = ri->chainid;
1645                 pdb_ch[nch].chainnum = ri->chainnum;
1646                 pdb_ch[nch].start    = i;
1647                 pdb_ch[nch].bAllWat  = bWat;
1648                 if (bWat)
1649                 {
1650                     pdb_ch[nch].nterpairs = 0;
1651                 }
1652                 else
1653                 {
1654                     pdb_ch[nch].nterpairs = 1;
1655                 }
1656                 snew(pdb_ch[nch].chainstart, pdb_ch[nch].nterpairs+1);
1657                 /* modified [nch] to [0] below */
1658                 pdb_ch[nch].chainstart[0] = 0;
1659                 nch++;
1660             }
1661         }
1662         bPrevWat = bWat;
1663     }
1664     pdb_ch[nch-1].natom = natom-pdb_ch[nch-1].start;
1665
1666     /* set all the water blocks at the end of the chain */
1667     snew(swap_index, nch);
1668     j = 0;
1669     for (i = 0; i < nch; i++)
1670     {
1671         if (!pdb_ch[i].bAllWat)
1672         {
1673             swap_index[j] = i;
1674             j++;
1675         }
1676     }
1677     for (i = 0; i < nch; i++)
1678     {
1679         if (pdb_ch[i].bAllWat)
1680         {
1681             swap_index[j] = i;
1682             j++;
1683         }
1684     }
1685     if (nwaterchain > 1)
1686     {
1687         printf("Moved all the water blocks to the end\n");
1688     }
1689
1690     snew(chains, nch);
1691     /* copy pdb data and x for all chains */
1692     for (i = 0; (i < nch); i++)
1693     {
1694         si                   = swap_index[i];
1695         chains[i].chainid    = pdb_ch[si].chainid;
1696         chains[i].chainnum   = pdb_ch[si].chainnum;
1697         chains[i].bAllWat    = pdb_ch[si].bAllWat;
1698         chains[i].nterpairs  = pdb_ch[si].nterpairs;
1699         chains[i].chainstart = pdb_ch[si].chainstart;
1700         snew(chains[i].ntdb, pdb_ch[si].nterpairs);
1701         snew(chains[i].ctdb, pdb_ch[si].nterpairs);
1702         snew(chains[i].r_start, pdb_ch[si].nterpairs);
1703         snew(chains[i].r_end, pdb_ch[si].nterpairs);
1704
1705         snew(chains[i].pdba, 1);
1706         init_t_atoms(chains[i].pdba, pdb_ch[si].natom, TRUE);
1707         snew(chains[i].x, chains[i].pdba->nr);
1708         for (j = 0; j < chains[i].pdba->nr; j++)
1709         {
1710             chains[i].pdba->atom[j] = pdba_all.atom[pdb_ch[si].start+j];
1711             snew(chains[i].pdba->atomname[j], 1);
1712             *chains[i].pdba->atomname[j] =
1713                 gmx_strdup(*pdba_all.atomname[pdb_ch[si].start+j]);
1714             chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start+j];
1715             copy_rvec(pdbx[pdb_ch[si].start+j], chains[i].x[j]);
1716         }
1717         /* Re-index the residues assuming that the indices are continuous */
1718         k                    = chains[i].pdba->atom[0].resind;
1719         nres                 = chains[i].pdba->atom[chains[i].pdba->nr-1].resind - k + 1;
1720         chains[i].pdba->nres = nres;
1721         for (j = 0; j < chains[i].pdba->nr; j++)
1722         {
1723             chains[i].pdba->atom[j].resind -= k;
1724         }
1725         srenew(chains[i].pdba->resinfo, nres);
1726         for (j = 0; j < nres; j++)
1727         {
1728             chains[i].pdba->resinfo[j] = pdba_all.resinfo[k+j];
1729             snew(chains[i].pdba->resinfo[j].name, 1);
1730             *chains[i].pdba->resinfo[j].name = gmx_strdup(*pdba_all.resinfo[k+j].name);
1731             /* make all chain identifiers equal to that of the chain */
1732             chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1733         }
1734     }
1735
1736     if (nchainmerges > 0)
1737     {
1738         printf("\nMerged chains into joint molecule definitions at %d places.\n\n",
1739                nchainmerges);
1740     }
1741
1742     printf("There are %d chains and %d blocks of water and "
1743            "%d residues with %d atoms\n",
1744            nch-nwaterchain, nwaterchain,
1745            pdba_all.nres, natom);
1746
1747     printf("\n  %5s  %4s %6s\n", "chain", "#res", "#atoms");
1748     for (i = 0; (i < nch); i++)
1749     {
1750         printf("  %d '%c' %5d %6d  %s\n",
1751                i+1, chains[i].chainid ? chains[i].chainid : '-',
1752                chains[i].pdba->nres, chains[i].pdba->nr,
1753                chains[i].bAllWat ? "(only water)" : "");
1754     }
1755     printf("\n");
1756
1757     check_occupancy(&pdba_all, opt2fn("-f", NFILE, fnm), bVerbose);
1758
1759     /* Read atomtypes... */
1760     atype = read_atype(ffdir, &symtab);
1761
1762     /* read residue database */
1763     printf("Reading residue database... (%s)\n", forcefield);
1764     nrtpf = fflib_search_file_end(ffdir, ".rtp", TRUE, &rtpf);
1765     nrtp  = 0;
1766     restp = nullptr;
1767     for (i = 0; i < nrtpf; i++)
1768     {
1769         read_resall(rtpf[i], &nrtp, &restp, atype, &symtab, FALSE);
1770         sfree(rtpf[i]);
1771     }
1772     sfree(rtpf);
1773     if (bNewRTP)
1774     {
1775         /* Not correct with multiple rtp input files with different bonded types */
1776         fp = gmx_fio_fopen("new.rtp", "w");
1777         print_resall(fp, nrtp, restp, atype);
1778         gmx_fio_fclose(fp);
1779     }
1780
1781     /* read hydrogen database */
1782     nah = read_h_db(ffdir, &ah);
1783
1784     /* Read Termini database... */
1785     nNtdb = read_ter_db(ffdir, 'n', &ntdb, atype);
1786     nCtdb = read_ter_db(ffdir, 'c', &ctdb, atype);
1787
1788     top_fn   = ftp2fn(efTOP, NFILE, fnm);
1789     top_file = gmx_fio_fopen(top_fn, "w");
1790
1791     print_top_header(top_file, top_fn, FALSE, ffdir, mHmult);
1792
1793     nincl = 0;
1794     nmol  = 0;
1795     incls = nullptr;
1796     mols  = nullptr;
1797     for (chain = 0; (chain < nch); chain++)
1798     {
1799         cc = &(chains[chain]);
1800
1801         /* set pdba, natom and nres to the current chain */
1802         pdba  = cc->pdba;
1803         x     = cc->x;
1804         natom = cc->pdba->nr;
1805         nres  = cc->pdba->nres;
1806
1807         if (cc->chainid && ( cc->chainid != ' ' ) )
1808         {
1809             printf("Processing chain %d '%c' (%d atoms, %d residues)\n",
1810                    chain+1, cc->chainid, natom, nres);
1811         }
1812         else
1813         {
1814             printf("Processing chain %d (%d atoms, %d residues)\n",
1815                    chain+1, natom, nres);
1816         }
1817
1818         process_chain(pdba, x, bUnA, bUnA, bUnA, bLysMan, bAspMan, bGluMan,
1819                       bHisMan, bArgMan, bGlnMan, angle, distance, &symtab,
1820                       nrtprename, rtprename);
1821
1822         cc->chainstart[cc->nterpairs] = pdba->nres;
1823         j = 0;
1824         for (i = 0; i < cc->nterpairs; i++)
1825         {
1826             find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i+1],
1827                         &(cc->r_start[j]), &(cc->r_end[j]), rt);
1828
1829             if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
1830             {
1831                 j++;
1832             }
1833         }
1834         cc->nterpairs = j;
1835         if (cc->nterpairs == 0)
1836         {
1837             printf("Problem with chain definition, or missing terminal residues.\n"
1838                    "This chain does not appear to contain a recognized chain molecule.\n"
1839                    "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
1840         }
1841
1842         /* Check for disulfides and other special bonds */
1843         nssbonds = mk_specbonds(pdba, x, bCysMan, &ssbonds, bVerbose);
1844
1845         if (nrtprename > 0)
1846         {
1847             rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, nrtprename, rtprename,
1848                           &symtab, bVerbose);
1849         }
1850
1851         if (debug)
1852         {
1853             if (nch == 1)
1854             {
1855                 sprintf(fn, "chain.pdb");
1856             }
1857             else
1858             {
1859                 sprintf(fn, "chain_%c%d.pdb", cc->chainid, cc->chainnum);
1860             }
1861             write_sto_conf(fn, title, pdba, x, nullptr, ePBC, box);
1862         }
1863
1864
1865         for (i = 0; i < cc->nterpairs; i++)
1866         {
1867
1868             /* Set termini.
1869              * We first apply a filter so we only have the
1870              * termini that can be applied to the residue in question
1871              * (or a generic terminus if no-residue specific is available).
1872              */
1873             /* First the N terminus */
1874             if (nNtdb > 0)
1875             {
1876                 tdblist = filter_ter(nrtp, restp, nNtdb, ntdb,
1877                                      *pdba->resinfo[cc->r_start[i]].name,
1878                                      *pdba->resinfo[cc->r_start[i]].rtp,
1879                                      &ntdblist);
1880                 if (ntdblist == 0)
1881                 {
1882                     printf("No suitable end (N or 5') terminus found in database - assuming this residue\n"
1883                            "is already in a terminus-specific form and skipping terminus selection.\n");
1884                     cc->ntdb[i] = nullptr;
1885                 }
1886                 else
1887                 {
1888                     if (bTerMan && ntdblist > 1)
1889                     {
1890                         sprintf(select, "Select start terminus type for %s-%d",
1891                                 *pdba->resinfo[cc->r_start[i]].name,
1892                                 pdba->resinfo[cc->r_start[i]].nr);
1893                         cc->ntdb[i] = choose_ter(ntdblist, tdblist, select);
1894                     }
1895                     else
1896                     {
1897                         cc->ntdb[i] = tdblist[0];
1898                     }
1899
1900                     printf("Start terminus %s-%d: %s\n",
1901                            *pdba->resinfo[cc->r_start[i]].name,
1902                            pdba->resinfo[cc->r_start[i]].nr,
1903                            (cc->ntdb[i])->name);
1904                     sfree(tdblist);
1905                 }
1906             }
1907             else
1908             {
1909                 cc->ntdb[i] = nullptr;
1910             }
1911
1912             /* And the C terminus */
1913             if (nCtdb > 0)
1914             {
1915                 tdblist = filter_ter(nrtp, restp, nCtdb, ctdb,
1916                                      *pdba->resinfo[cc->r_end[i]].name,
1917                                      *pdba->resinfo[cc->r_end[i]].rtp,
1918                                      &ntdblist);
1919                 if (ntdblist == 0)
1920                 {
1921                     printf("No suitable end (C or 3') terminus found in database - assuming this residue\n"
1922                            "is already in a terminus-specific form and skipping terminus selection.\n");
1923                     cc->ctdb[i] = nullptr;
1924                 }
1925                 else
1926                 {
1927                     if (bTerMan && ntdblist > 1)
1928                     {
1929                         sprintf(select, "Select end terminus type for %s-%d",
1930                                 *pdba->resinfo[cc->r_end[i]].name,
1931                                 pdba->resinfo[cc->r_end[i]].nr);
1932                         cc->ctdb[i] = choose_ter(ntdblist, tdblist, select);
1933                     }
1934                     else
1935                     {
1936                         cc->ctdb[i] = tdblist[0];
1937                     }
1938                     printf("End terminus %s-%d: %s\n",
1939                            *pdba->resinfo[cc->r_end[i]].name,
1940                            pdba->resinfo[cc->r_end[i]].nr,
1941                            (cc->ctdb[i])->name);
1942                     sfree(tdblist);
1943                 }
1944             }
1945             else
1946             {
1947                 cc->ctdb[i] = nullptr;
1948             }
1949         }
1950         /* lookup hackblocks and rtp for all residues */
1951         get_hackblocks_rtp(&hb_chain, &restp_chain,
1952                            nrtp, restp, pdba->nres, pdba->resinfo,
1953                            cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end);
1954         /* ideally, now we would not need the rtp itself anymore, but do
1955            everything using the hb and restp arrays. Unfortunately, that
1956            requires some re-thinking of code in gen_vsite.c, which I won't
1957            do now :( AF 26-7-99 */
1958
1959         rename_atoms(nullptr, ffdir,
1960                      pdba, &symtab, restp_chain, FALSE, rt, FALSE, bVerbose);
1961
1962         match_atomnames_with_rtp(restp_chain, hb_chain, pdba, x, bVerbose);
1963
1964         if (bSort)
1965         {
1966             block = new_blocka();
1967             snew(gnames, 1);
1968             sort_pdbatoms(restp_chain, natom, &pdba, &x, block, &gnames);
1969             remove_duplicate_atoms(pdba, x, bVerbose);
1970             if (ftp2bSet(efNDX, NFILE, fnm))
1971             {
1972                 if (bRemoveH)
1973                 {
1974                     fprintf(stderr, "WARNING: with the -remh option the generated "
1975                             "index file (%s) might be useless\n"
1976                             "(the index file is generated before hydrogens are added)",
1977                             ftp2fn(efNDX, NFILE, fnm));
1978                 }
1979                 write_index(ftp2fn(efNDX, NFILE, fnm), block, gnames, FALSE, 0);
1980             }
1981             for (i = 0; i < block->nr; i++)
1982             {
1983                 sfree(gnames[i]);
1984             }
1985             sfree(gnames);
1986             done_blocka(block);
1987         }
1988         else
1989         {
1990             fprintf(stderr, "WARNING: "
1991                     "without sorting no check for duplicate atoms can be done\n");
1992         }
1993
1994         /* Generate Hydrogen atoms (and termini) in the sequence */
1995         printf("Generating any missing hydrogen atoms and/or adding termini.\n");
1996         natom = add_h(&pdba, &x, nah, ah,
1997                       cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing,
1998                       nullptr, nullptr, TRUE, FALSE);
1999         printf("Now there are %d residues with %d atoms\n",
2000                pdba->nres, pdba->nr);
2001         if (debug)
2002         {
2003             write_pdbfile(debug, title, pdba, x, ePBC, box, ' ', 0, nullptr, TRUE);
2004         }
2005
2006         if (debug)
2007         {
2008             for (i = 0; (i < natom); i++)
2009             {
2010                 fprintf(debug, "Res %s%d atom %d %s\n",
2011                         *(pdba->resinfo[pdba->atom[i].resind].name),
2012                         pdba->resinfo[pdba->atom[i].resind].nr, i+1, *pdba->atomname[i]);
2013             }
2014         }
2015
2016         strcpy(posre_fn, ftp2fn(efITP, NFILE, fnm));
2017
2018         /* make up molecule name(s) */
2019
2020         k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2021
2022         gmx_residuetype_get_type(rt, *pdba->resinfo[k].name, &p_restype);
2023
2024         suffix[0] = '\0';
2025
2026         if (cc->bAllWat)
2027         {
2028             sprintf(molname, "Water");
2029         }
2030         else
2031         {
2032             this_chainid = cc->chainid;
2033
2034             /* Add the chain id if we have one */
2035             if (this_chainid != ' ')
2036             {
2037                 sprintf(buf, "_chain_%c", this_chainid);
2038                 strcat(suffix, buf);
2039             }
2040
2041             /* Check if there have been previous chains with the same id */
2042             nid_used = 0;
2043             for (k = 0; k < chain; k++)
2044             {
2045                 if (cc->chainid == chains[k].chainid)
2046                 {
2047                     nid_used++;
2048                 }
2049             }
2050             /* Add the number for this chain identifier if there are multiple copies */
2051             if (nid_used > 0)
2052             {
2053
2054                 sprintf(buf, "%d", nid_used+1);
2055                 strcat(suffix, buf);
2056             }
2057
2058             if (strlen(suffix) > 0)
2059             {
2060                 sprintf(molname, "%s%s", p_restype, suffix);
2061             }
2062             else
2063             {
2064                 strcpy(molname, p_restype);
2065             }
2066         }
2067
2068         if ((nch-nwaterchain > 1) && !cc->bAllWat)
2069         {
2070             bITP = TRUE;
2071             strcpy(itp_fn, top_fn);
2072             printf("Chain time...\n");
2073             c = strrchr(itp_fn, '.');
2074             sprintf(c, "_%s.itp", molname);
2075             c = strrchr(posre_fn, '.');
2076             sprintf(c, "_%s.itp", molname);
2077             if (strcmp(itp_fn, posre_fn) == 0)
2078             {
2079                 strcpy(buf_fn, posre_fn);
2080                 c  = strrchr(buf_fn, '.');
2081                 *c = '\0';
2082                 sprintf(posre_fn, "%s_pr.itp", buf_fn);
2083             }
2084
2085             nincl++;
2086             srenew(incls, nincl);
2087             incls[nincl-1] = gmx_strdup(itp_fn);
2088             itp_file       = gmx_fio_fopen(itp_fn, "w");
2089         }
2090         else
2091         {
2092             bITP = FALSE;
2093         }
2094
2095         srenew(mols, nmol+1);
2096         if (cc->bAllWat)
2097         {
2098             mols[nmol].name = gmx_strdup("SOL");
2099             mols[nmol].nr   = pdba->nres;
2100         }
2101         else
2102         {
2103             mols[nmol].name = gmx_strdup(molname);
2104             mols[nmol].nr   = 1;
2105         }
2106         nmol++;
2107
2108         if (bITP)
2109         {
2110             print_top_comment(itp_file, itp_fn, ffdir, TRUE);
2111         }
2112
2113         if (cc->bAllWat)
2114         {
2115             top_file2 = nullptr;
2116         }
2117         else
2118         if (bITP)
2119         {
2120             top_file2 = itp_file;
2121         }
2122         else
2123         {
2124             top_file2 = top_file;
2125         }
2126
2127         pdb2top(top_file2, posre_fn, molname, pdba, &x, atype, &symtab,
2128                 nrtp, restp,
2129                 restp_chain, hb_chain,
2130                 bAllowMissing,
2131                 bVsites, bVsiteAromatics, ffdir,
2132                 mHmult, nssbonds, ssbonds,
2133                 long_bond_dist, short_bond_dist, bDeuterate, bChargeGroups, bCmap,
2134                 bRenumRes, bRTPresname);
2135
2136         if (!cc->bAllWat)
2137         {
2138             write_posres(posre_fn, pdba, posre_fc);
2139         }
2140
2141         if (bITP)
2142         {
2143             gmx_fio_fclose(itp_file);
2144         }
2145
2146         /* pdba and natom have been reassigned somewhere so: */
2147         cc->pdba = pdba;
2148         cc->x    = x;
2149
2150         if (debug)
2151         {
2152             if (cc->chainid == ' ')
2153             {
2154                 sprintf(fn, "chain.pdb");
2155             }
2156             else
2157             {
2158                 sprintf(fn, "chain_%c.pdb", cc->chainid);
2159             }
2160             write_sto_conf(fn, "", pdba, x, nullptr, ePBC, box);
2161         }
2162     }
2163
2164     if (watermodel == nullptr)
2165     {
2166         for (chain = 0; chain < nch; chain++)
2167         {
2168             if (chains[chain].bAllWat)
2169             {
2170                 gmx_fatal(FARGS, "You have chosen not to include a water model, but there is water in the input file. Select a water model or remove the water from your input file.");
2171             }
2172         }
2173     }
2174     else
2175     {
2176         sprintf(buf_fn, "%s%c%s.itp", ffdir, DIR_SEPARATOR, watermodel);
2177         if (!fflib_fexist(buf_fn))
2178         {
2179             gmx_fatal(FARGS, "The topology file '%s' for the selected water model '%s' can not be found in the force field directory. Select a different water model.",
2180                       buf_fn, watermodel);
2181         }
2182     }
2183
2184     print_top_mols(top_file, title, ffdir, watermodel, nincl, incls, nmol, mols);
2185     gmx_fio_fclose(top_file);
2186
2187     gmx_residuetype_destroy(rt);
2188
2189     /* now merge all chains back together */
2190     natom = 0;
2191     nres  = 0;
2192     for (i = 0; (i < nch); i++)
2193     {
2194         natom += chains[i].pdba->nr;
2195         nres  += chains[i].pdba->nres;
2196     }
2197     snew(atoms, 1);
2198     init_t_atoms(atoms, natom, FALSE);
2199     for (i = 0; i < atoms->nres; i++)
2200     {
2201         sfree(atoms->resinfo[i].name);
2202     }
2203     sfree(atoms->resinfo);
2204     atoms->nres = nres;
2205     snew(atoms->resinfo, nres);
2206     snew(x, natom);
2207     k = 0;
2208     l = 0;
2209     for (i = 0; (i < nch); i++)
2210     {
2211         if (nch > 1)
2212         {
2213             printf("Including chain %d in system: %d atoms %d residues\n",
2214                    i+1, chains[i].pdba->nr, chains[i].pdba->nres);
2215         }
2216         for (j = 0; (j < chains[i].pdba->nr); j++)
2217         {
2218             atoms->atom[k]         = chains[i].pdba->atom[j];
2219             atoms->atom[k].resind += l; /* l is processed nr of residues */
2220             atoms->atomname[k]     = chains[i].pdba->atomname[j];
2221             atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2222             copy_rvec(chains[i].x[j], x[k]);
2223             k++;
2224         }
2225         for (j = 0; (j < chains[i].pdba->nres); j++)
2226         {
2227             atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2228             if (bRTPresname)
2229             {
2230                 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2231             }
2232             l++;
2233         }
2234     }
2235
2236     if (nch > 1)
2237     {
2238         fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2239         print_sums(atoms, TRUE);
2240     }
2241
2242     fprintf(stderr, "\nWriting coordinate file...\n");
2243     clear_rvec(box_space);
2244     if (box[0][0] == 0)
2245     {
2246         make_new_box(atoms->nr, x, box, box_space, FALSE);
2247     }
2248     write_sto_conf(ftp2fn(efSTO, NFILE, fnm), title, atoms, x, nullptr, ePBC, box);
2249
2250     printf("\t\t--------- PLEASE NOTE ------------\n");
2251     printf("You have successfully generated a topology from: %s.\n",
2252            opt2fn("-f", NFILE, fnm));
2253     if (watermodel != nullptr)
2254     {
2255         printf("The %s force field and the %s water model are used.\n",
2256                ffname, watermodel);
2257     }
2258     else
2259     {
2260         printf("The %s force field is used.\n",
2261                ffname);
2262     }
2263     printf("\t\t--------- ETON ESAELP ------------\n");
2264
2265     return 0;
2266 }