added Verlet scheme and NxN non-bonded functionality
[gromacs.git] / src / kernel / pme_switch.c
blobbdae8b7d1ccd50b3f9b2642d916415a40aa8993c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
3 *
4 * This source code is part of
5 *
6 * G R O M A C S
7 *
8 * GROningen MAchine for Chemical Simulations
9 *
10 * VERSION 4.6.0
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2011, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
33 * And Hey:
34 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
36 #ifdef HAVE_CONFIG_H
37 #include <config.h>
38 #endif
40 #include "smalloc.h"
41 #include "network.h"
42 #include "calcgrid.h"
43 #include "pme.h"
44 #include "vec.h"
45 #include "domdec.h"
46 #include "nbnxn_cuda_data_mgmt.h"
47 #include "force.h"
48 #include "pme_switch.h"
50 typedef struct {
51 real rcut;
52 real rlist;
53 real spacing;
54 ivec grid;
55 real grid_eff;
56 real coeff;
57 gmx_pme_t pmedata;
59 int count;
60 double cycles;
61 } pme_setup_t;
63 /* In the initial scan, step by grids that are at least a factor 0.8 coarser */
64 #define PMES_GRID_SCALE_FAC 0.8
65 /* In the initial scan, try to skip grids with uneven x/y/z spacing,
66 * checking if the "efficiency" is more than 5% worse than the previous grid.
68 #define PMES_GRID_EFF_FAC 1.05
69 /* Rerun up till 12% slower setups than the fastest up till now */
70 #define PMES_SLOW_FAC 1.12
71 /* If setups get more than 2% faster, do another round to avoid
72 * choosing a slower setup due to acceleration or fluctuations.
74 #define PMES_ACCEL_TOL 1.02
76 typedef struct pme_switch {
77 int nstage; /* the current maximum number of stages */
79 real cut_spacing; /* the minimum cutoff / PME grid spacing ratio */
80 real rbuf; /* the pairlist buffer size */
81 matrix box_start; /* the initial simulation box */
82 int n; /* the count of setup as well as the allocation size */
83 pme_setup_t *setup; /* the PME+cutoff setups */
84 int cur; /* the current setup */
85 int fastest; /* fastest setup up till now */
86 int start; /* start of setup range to consider in stage>0 */
87 int end; /* end of setup range to consider in stage>0 */
89 int stage; /* the current stage */
90 } t_pme_switch;
92 void switch_pme_init(pme_switch_t *pmes_p,
93 const t_inputrec *ir,matrix box,
94 const interaction_const_t *ic,
95 gmx_pme_t pmedata)
97 pme_switch_t pmes;
98 real spm,sp;
99 int d;
101 snew(pmes,1);
103 /* Any number of stages >= 2 is supported */
104 pmes->nstage = 2;
106 pmes->rbuf = ic->rlist - ic->rcoulomb;
108 copy_mat(box,pmes->box_start);
109 if (ir->ePBC==epbcXY && ir->nwall==2)
111 svmul(ir->wall_ewald_zfac,pmes->box_start[ZZ],pmes->box_start[ZZ]);
114 pmes->n = 1;
115 snew(pmes->setup,pmes->n);
117 pmes->cur = 0;
118 pmes->setup[0].rcut = ic->rcoulomb;
119 pmes->setup[0].rlist = ic->rlist;
120 pmes->setup[0].grid[XX] = ir->nkx;
121 pmes->setup[0].grid[YY] = ir->nky;
122 pmes->setup[0].grid[ZZ] = ir->nkz;
123 pmes->setup[0].coeff = ic->ewaldcoeff;
125 pmes->setup[0].pmedata = pmedata;
127 spm = 0;
128 for(d=0; d<DIM; d++)
130 sp = norm(pmes->box_start[d])/pmes->setup[0].grid[d];
131 if (sp > spm)
133 spm = sp;
136 pmes->setup[0].spacing = spm;
138 if (ir->fourier_spacing > 0)
140 pmes->cut_spacing = ir->rcoulomb/ir->fourier_spacing;
142 else
144 pmes->cut_spacing = ir->rcoulomb/pmes->setup[0].spacing;
147 pmes->stage = 0;
149 pmes->fastest = 0;
150 pmes->start = 0;
152 *pmes_p = pmes;
155 static gmx_bool switch_pme_increase_cutoff(pme_switch_t pmes,int pme_order)
157 pme_setup_t *set;
158 real fac,sp;
159 int d;
161 /* Try to add a new setup with next larger cut-off to the list */
162 pmes->n++;
163 srenew(pmes->setup,pmes->n);
164 set = &pmes->setup[pmes->n-1];
165 set->pmedata = NULL;
167 fac = 1;
170 fac *= 1.01;
171 clear_ivec(set->grid);
172 sp = calc_grid(NULL,pmes->box_start,
173 fac*pmes->setup[pmes->cur].spacing,
174 &set->grid[XX],
175 &set->grid[YY],
176 &set->grid[ZZ]);
178 /* In parallel we can't have grids smaller than 2*pme_order,
179 * and we would anyhow not gain much speed at these grid sizes.
181 for(d=0; d<DIM; d++)
183 if (set->grid[d] <= 2*pme_order)
185 pmes->n--;
187 return FALSE;
191 while (sp <= 1.001*pmes->setup[pmes->cur].spacing);
193 set->rcut = pmes->cut_spacing*sp;
194 set->rlist = set->rcut + pmes->rbuf;
195 set->spacing = sp;
196 /* The grid efficiency is the size wrt a grid with uniform x/y/z spacing */
197 set->grid_eff = 1;
198 for(d=0; d<DIM; d++)
200 set->grid_eff *= (set->grid[d]*sp)/norm(pmes->box_start[d]);
202 /* The Ewald coefficient is inversly proportional to the cut-off */
203 set->coeff = pmes->setup[0].coeff*pmes->setup[0].rcut/set->rcut;
205 set->count = 0;
206 set->cycles = 0;
208 if (debug)
210 fprintf(debug,"PME switch grid %d %d %d, cutoff %f\n",
211 set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut);
214 return TRUE;
217 static void print_grid(FILE *fp_err,FILE *fp_log,
218 const char *pre,
219 const char *desc,
220 const pme_setup_t *set,
221 double cycles)
223 char buf[STRLEN],buft[STRLEN];
225 if (cycles >= 0)
227 sprintf(buft,": %.1f M-cycles",cycles*1e-6);
229 else
231 buft[0] = '\0';
233 sprintf(buf,"%-11s%10s pme grid %d %d %d, cutoff %.3f%s",
234 pre,
235 desc,set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut,
236 buft);
237 if (fp_err != NULL)
239 fprintf(fp_err,"%s\n",buf);
241 if (fp_log != NULL)
243 fprintf(fp_log,"%s\n",buf);
247 static void switch_to_stage1(pme_switch_t pmes)
249 pmes->start = 0;
250 while (pmes->start+1 < pmes->n &&
251 (pmes->setup[pmes->start].count == 0 ||
252 pmes->setup[pmes->start].cycles >
253 pmes->setup[pmes->fastest].cycles*PMES_SLOW_FAC))
255 pmes->start++;
257 while (pmes->start > 0 && pmes->setup[pmes->start-1].cycles == 0)
259 pmes->start--;
262 pmes->end = pmes->n;
263 if (pmes->setup[pmes->end-1].count > 0 &&
264 pmes->setup[pmes->end-1].cycles >
265 pmes->setup[pmes->fastest].cycles*PMES_SLOW_FAC)
267 pmes->end--;
270 pmes->stage = 1;
272 /* Start add start, 1 will be added immediately after returning */
273 pmes->cur = pmes->start - 1;
276 gmx_bool switch_pme(pme_switch_t pmes,
277 t_commrec *cr,
278 FILE *fp_err,
279 FILE *fp_log,
280 t_inputrec *ir,
281 t_state *state,
282 double cycles,
283 interaction_const_t *ic,
284 nonbonded_verlet_t *nbv,
285 gmx_pme_t *pmedata,
286 int step)
288 gmx_bool OK;
289 pme_setup_t *set;
290 double cycles_fast;
291 char buf[STRLEN];
293 if (pmes->stage == pmes->nstage)
295 return FALSE;
298 if (PAR(cr))
300 gmx_sumd(1,&cycles,cr);
301 cycles /= cr->nnodes;
304 set = &pmes->setup[pmes->cur];
306 set->count++;
307 if (set->count % 2 == 1)
309 /* Skip the first cycle, because the first step after a switch
310 * is much slower due to allocation and/or caching effects.
312 return TRUE;
315 sprintf(buf, "step %4d: ", step);
316 print_grid(fp_err,fp_log,buf,"timed with",set,cycles);
318 if (set->count <= 2)
320 set->cycles = cycles;
322 else
324 if (cycles*PMES_ACCEL_TOL < set->cycles &&
325 pmes->stage == pmes->nstage - 1)
327 /* The performance went up a lot (due to e.g. DD load balancing).
328 * Add a stage, keep the minima, but rescan all setups.
330 pmes->nstage++;
332 if (debug)
334 fprintf(debug,"The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
335 "Increased the number stages to %d"
336 " and ignoring the previous performance\n",
337 set->grid[XX],set->grid[YY],set->grid[ZZ],
338 cycles*1e-6,set->cycles*1e-6,PMES_ACCEL_TOL,
339 pmes->nstage);
342 set->cycles = min(set->cycles,cycles);
345 if (set->cycles < pmes->setup[pmes->fastest].cycles)
347 pmes->fastest = pmes->cur;
349 cycles_fast = pmes->setup[pmes->fastest].cycles;
351 /* Check in stage 0 if we should stop scanning grids.
352 * Stop when the time is more than SLOW_FAC longer than the fastest.
354 if (pmes->stage == 0 && pmes->cur > 0 &&
355 cycles > pmes->setup[pmes->fastest].cycles*PMES_SLOW_FAC)
357 pmes->n = pmes->cur + 1;
358 /* Done with scanning, go to stage 1 */
359 switch_to_stage1(pmes);
362 if (pmes->stage == 0)
364 int gridsize_start;
366 gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];
370 if (pmes->cur+1 < pmes->n)
372 /* We had already generated the next setup */
373 OK = TRUE;
375 else
377 /* Find the next setup */
378 OK = switch_pme_increase_cutoff(pmes,ir->pme_order);
381 if (OK && ir->ePBC != epbcNONE)
383 OK = (sqr(pmes->setup[pmes->cur+1].rlist)
384 <= max_cutoff2(ir->ePBC,state->box));
387 if (OK)
389 pmes->cur++;
391 if (DOMAINDECOMP(cr))
393 OK = change_dd_cutoff(cr,state,ir,
394 pmes->setup[pmes->cur].rlist);
395 if (!OK)
397 /* Failed: do not use this setup */
398 pmes->cur--;
402 if (!OK)
404 /* We hit the upper limit for the cut-off,
405 * the setup should not go further than cur.
407 pmes->n = pmes->cur + 1;
408 /* Switch to the next stage */
409 switch_to_stage1(pmes);
412 while (OK &&
413 !(pmes->setup[pmes->cur].grid[XX]*
414 pmes->setup[pmes->cur].grid[YY]*
415 pmes->setup[pmes->cur].grid[ZZ] <
416 gridsize_start*PMES_GRID_SCALE_FAC
418 pmes->setup[pmes->cur].grid_eff <
419 pmes->setup[pmes->cur-1].grid_eff*PMES_GRID_EFF_FAC));
422 if (pmes->stage > 0 && pmes->end == 1)
424 pmes->cur = 0;
425 pmes->stage = pmes->nstage;
427 else if (pmes->stage > 0 && pmes->end > 1)
429 /* If stage = nstage-1:
430 * scan over all setups, rerunning only those setups
431 * which are not much slower than the fastest
432 * else:
433 * use the next setup
437 pmes->cur++;
438 if (pmes->cur == pmes->end)
440 pmes->stage++;
441 pmes->cur = pmes->start;
444 while (pmes->stage == pmes->nstage - 1 &&
445 pmes->setup[pmes->cur].count > 0 &&
446 pmes->setup[pmes->cur].cycles > cycles_fast*PMES_SLOW_FAC);
448 if (pmes->stage == pmes->nstage)
450 /* We are done optiming, use the fastest setup we found */
451 pmes->cur = pmes->fastest;
455 if (DOMAINDECOMP(cr) && pmes->stage > 0)
457 OK = change_dd_cutoff(cr,state,ir,pmes->setup[pmes->cur].rlist);
458 if (!OK)
460 /* Failsafe solution */
461 if (pmes->cur > 1 && pmes->stage == pmes->nstage)
463 pmes->stage--;
465 pmes->fastest = 0;
466 pmes->start = 0;
467 pmes->end = pmes->cur;
468 pmes->cur = pmes->start;
472 /* Change the Coulomb cut-off and the PME grid */
474 set = &pmes->setup[pmes->cur];
476 ic->rcoulomb = set->rcut;
477 ic->rlist = set->rlist;
478 ic->ewaldcoeff = set->coeff;
480 if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
482 nbnxn_cuda_pmetune_update_param(nbv->cu_nbv,ic);
484 else
486 init_interaction_const_tables(NULL,ic,nbv->grp[0].kernel_type);
489 if (nbv->ngrp > 1)
491 init_interaction_const_tables(NULL,ic,nbv->grp[1].kernel_type);
494 if (cr->duty & DUTY_PME)
496 if (pmes->setup[pmes->cur].pmedata == NULL)
498 /* Generate a new PME data structure,
499 * copying part of the old pointers.
501 gmx_pme_reinit(&set->pmedata,
502 cr,pmes->setup[0].pmedata,ir,
503 set->grid);
505 *pmedata = set->pmedata;
507 else
509 /* Tell our PME-only node to switch grid */
510 gmx_pme_send_switch(cr, set->grid, set->coeff);
513 if (debug)
515 print_grid(NULL,debug,"","switched to",set,-1);
518 if (pmes->stage == pmes->nstage)
520 print_grid(fp_err,fp_log,"","optimal",set,-1);
523 return TRUE;
526 void restart_switch_pme(pme_switch_t pmes, int n)
528 pmes->nstage += n;