/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This source code is part of
 *
 * GROningen MAchine for Chemical Simulations
 *
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2011, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 */
#include "nbnxn_cuda_data_mgmt.h"
#include "pme_switch.h"
/* In the initial scan, step by grids that are at least a factor 0.8 coarser */
#define PMES_GRID_SCALE_FAC  0.8
/* In the initial scan, try to skip grids with uneven x/y/z spacing,
 * checking if the "efficiency" is more than 5% worse than the previous grid.
 */
#define PMES_GRID_EFF_FAC    1.05
/* Rerun only setups that are at most 12% slower than the fastest so far */
#define PMES_SLOW_FAC        1.12
/* If a setup gets at least 2% faster, do another round to avoid
 * choosing a slower setup due to acceleration or fluctuations.
 */
#define PMES_ACCEL_TOL       1.02
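
/* Worked example of the tolerances above (illustrative numbers, not from
 * the original): with the fastest setup so far at 8.0 M-cycles, setups
 * slower than 8.0*PMES_SLOW_FAC = 8.96 M-cycles are excluded from reruns.
 * If a rerun then drops a setup from 10.0 to 9.7 M-cycles, we have
 * 9.7*PMES_ACCEL_TOL = 9.894 < 10.0, so an extra stage is added and all
 * setups are rescanned.
 */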
typedef struct pme_switch
{
    int          nstage;      /* the current maximum number of stages */
    real         cut_spacing; /* the minimum cutoff / PME grid spacing ratio */
    real         rbuf;        /* the pairlist buffer size */
    matrix       box_start;   /* the initial simulation box */
    int          n;           /* the number of setups, also the allocation size */
    pme_setup_t *setup;       /* the PME+cutoff setups */
    int          cur;         /* the current setup */
    int          fastest;     /* the fastest setup up till now */
    int          start;       /* start of setup range to consider in stage>0 */
    int          end;         /* end of setup range to consider in stage>0 */
    int          stage;       /* the current stage */
} t_pme_switch;               /* pme_switch_t, used below, is a pointer to this struct */
void switch_pme_init(pme_switch_t              *pmes_p,
                     const t_inputrec          *ir,
                     matrix                     box,
                     const interaction_const_t *ic,
                     gmx_pme_t                  pmedata)
{
    pme_switch_t pmes;
    real         spm, sp;
    int          d;

    snew(pmes, 1);
    /* Any number of stages >= 2 is supported */
    pmes->nstage = 2;

    pmes->rbuf = ic->rlist - ic->rcoulomb;
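
    /* Illustrative example (numbers are assumed, not from the original):
     * with rlist = 1.1 nm and rcoulomb = 1.0 nm, rbuf = 0.1 nm. This
     * pair-list buffer is kept fixed for every setup, so each candidate
     * cut-off rcut is paired with rlist = rcut + 0.1 nm below.
     */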
    copy_mat(box, pmes->box_start);
    if (ir->ePBC == epbcXY && ir->nwall == 2)
    {
        svmul(ir->wall_ewald_zfac, pmes->box_start[ZZ], pmes->box_start[ZZ]);
    }
    pmes->n = 1;
    snew(pmes->setup, pmes->n);

    pmes->setup[0].rcut     = ic->rcoulomb;
    pmes->setup[0].rlist    = ic->rlist;
    pmes->setup[0].grid[XX] = ir->nkx;
    pmes->setup[0].grid[YY] = ir->nky;
    pmes->setup[0].grid[ZZ] = ir->nkz;
    pmes->setup[0].coeff    = ic->ewaldcoeff;

    pmes->setup[0].pmedata = pmedata;

    spm = 0;
    for (d = 0; d < DIM; d++)
    {
        sp = norm(pmes->box_start[d])/pmes->setup[0].grid[d];
        if (sp > spm)
        {
            spm = sp;
        }
    }
    pmes->setup[0].spacing = spm;
    if (ir->fourier_spacing > 0)
    {
        pmes->cut_spacing = ir->rcoulomb/ir->fourier_spacing;
    }
    else
    {
        pmes->cut_spacing = ir->rcoulomb/pmes->setup[0].spacing;
    }
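
    /* Illustrative example (assumed numbers): rcoulomb = 1.0 nm with
     * fourier_spacing = 0.125 nm gives cut_spacing = 8. Every generated
     * setup keeps this ratio: a grid with spacing sp = 0.15 nm gets
     * rcut = 8*0.15 = 1.2 nm, so the real-space/reciprocal-space accuracy
     * balance of the input settings stays fixed.
     */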
    *pmes_p = pmes;
}

static gmx_bool switch_pme_increase_cutoff(pme_switch_t pmes, int pme_order)
{
    pme_setup_t *set;
    real         fac, sp;
    int          d;

    /* Try to add a new setup with next larger cut-off to the list */
    pmes->n++;
    srenew(pmes->setup, pmes->n);
    set = &pmes->setup[pmes->n-1];
    fac = 1;
    do
    {
        fac *= 1.01;
        clear_ivec(set->grid);
        sp = calc_grid(NULL, pmes->box_start,
                       fac*pmes->setup[pmes->cur].spacing,
                       &set->grid[XX], &set->grid[YY], &set->grid[ZZ]);

        /* In parallel we can't have grids smaller than 2*pme_order,
         * and we would anyhow not gain much speed at these grid sizes.
         */
        for (d = 0; d < DIM; d++)
        {
            if (set->grid[d] <= 2*pme_order)
            {
                pmes->n--;

                return FALSE;
            }
        }
    }
    while (sp <= 1.001*pmes->setup[pmes->cur].spacing);

    set->rcut  = pmes->cut_spacing*sp;
    set->rlist = set->rcut + pmes->rbuf;
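
    /* The loop above keeps scaling the requested spacing up in small
     * increments until calc_grid actually returns a coarser grid than the
     * current setup (more than 0.1% coarser, hence the 1.001 factor),
     * since the allowed grid sizes are quantized by the FFT radix choices.
     * Illustrative example (assumed numbers): with cut_spacing = 8 and
     * rbuf = 0.1 nm, a step that lands on sp = 0.15 nm gives
     * rcut = 1.2 nm and rlist = 1.3 nm.
     */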
    /* The grid efficiency is the size relative to a grid with uniform x/y/z spacing */
    set->grid_eff = 1;
    for (d = 0; d < DIM; d++)
    {
        set->grid_eff *= (set->grid[d]*sp)/norm(pmes->box_start[d]);
    }
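
    /* Illustrative example (assumed numbers): for a 4 x 4 x 6 nm box with
     * sp = 0.125 nm, a uniformly spaced grid would be 32 x 32 x 48; if the
     * actual grid is 32 x 32 x 50, then
     * grid_eff = (32*0.125/4)*(32*0.125/4)*(50*0.125/6) ~= 1.04,
     * i.e. about 4% more grid points than the ideal uniform-spacing grid.
     */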
    /* The Ewald coefficient is inversely proportional to the cut-off */
    set->coeff = pmes->setup[0].coeff*pmes->setup[0].rcut/set->rcut;
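
    /* Illustrative example (assumed numbers): if setup 0 has
     * coeff = 3.12 nm^-1 at rcut = 1.0 nm, a setup with rcut = 1.25 nm gets
     * coeff = 3.12*1.0/1.25 = 2.496 nm^-1. The product coeff*rcut is thus
     * identical for all setups, which keeps the relative real-space Ewald
     * error, governed by erfc(coeff*rcut), the same as for the input settings.
     */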
    if (debug)
    {
        fprintf(debug, "PME switch grid %d %d %d, cutoff %f\n",
                set->grid[XX], set->grid[YY], set->grid[ZZ], set->rcut);
    }

    return TRUE;
}
static void print_grid(FILE *fp_err, FILE *fp_log,
                       const char *pre, const char *desc,
                       const pme_setup_t *set, double cycles)
{
    char buf[STRLEN], buft[STRLEN];
    buft[0] = '\0';
    if (cycles >= 0)
    {
        sprintf(buft, ": %.1f M-cycles", cycles*1e-6);
    }
    sprintf(buf, "%-11s%10s pme grid %d %d %d, cutoff %.3f%s",
            pre, desc,
            set->grid[XX], set->grid[YY], set->grid[ZZ], set->rcut, buft);
    if (fp_err != NULL)
    {
        fprintf(fp_err, "%s\n", buf);
    }
    if (fp_log != NULL)
    {
        fprintf(fp_log, "%s\n", buf);
    }
}
static void switch_to_stage1(pme_switch_t pmes)
{
    /* Determine the setup range to rerun in stage 1 */
    pmes->start = 0;
    while (pmes->start+1 < pmes->n &&
           (pmes->setup[pmes->start].count == 0 ||
            pmes->setup[pmes->start].cycles >
            pmes->setup[pmes->fastest].cycles*PMES_SLOW_FAC))
    {
        pmes->start++;
    }
    while (pmes->start > 0 && pmes->setup[pmes->start-1].cycles == 0)
    {
        pmes->start--;
    }

    pmes->end = pmes->n;
    if (pmes->setup[pmes->end-1].count > 0 &&
        pmes->setup[pmes->end-1].cycles >
        pmes->setup[pmes->fastest].cycles*PMES_SLOW_FAC)
    {
        pmes->end--;
    }

    pmes->stage = 1;

    /* Start at start; 1 will be added immediately after returning */
    pmes->cur = pmes->start - 1;
}
gmx_bool switch_pme(pme_switch_t         pmes,
                    t_commrec           *cr,
                    FILE                *fp_err,
                    FILE                *fp_log,
                    t_inputrec          *ir,
                    t_state             *state,
                    double               cycles,
                    interaction_const_t *ic,
                    nonbonded_verlet_t  *nbv,
                    gmx_pme_t           *pmedata,
                    int                  step)
{
    pme_setup_t *set;
    double       cycles_fast;
    char         buf[STRLEN];
    int          gridsize_start;
    gmx_bool     OK;

    if (pmes->stage == pmes->nstage)
    {
        return FALSE;
    }

    if (PAR(cr))
    {
        gmx_sumd(1, &cycles, cr);
        cycles /= cr->nnodes;
    }

    set = &pmes->setup[pmes->cur];
    set->count++;
    if (set->count % 2 == 1)
    {
        /* Skip the first cycle, because the first step after a switch
         * is much slower due to allocation and/or caching effects.
         */
        return TRUE;
    }

    sprintf(buf, "step %4d: ", step);
    print_grid(fp_err, fp_log, buf, "timed with", set, cycles);

    if (set->count <= 2)
    {
        set->cycles = cycles;
    }
    else
    {
        if (cycles*PMES_ACCEL_TOL < set->cycles &&
            pmes->stage == pmes->nstage - 1)
        {
            /* The performance went up a lot (due to e.g. DD load balancing).
             * Add a stage, keep the minima, but rescan all setups.
             */
            pmes->nstage++;

            if (debug)
            {
                fprintf(debug, "The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
                        "Increased the number of stages to %d"
                        " and ignoring the previous performance\n",
                        set->grid[XX], set->grid[YY], set->grid[ZZ],
                        cycles*1e-6, set->cycles*1e-6, PMES_ACCEL_TOL,
                        pmes->nstage);
            }
        }
        set->cycles = min(set->cycles, cycles);
    }
    if (set->cycles < pmes->setup[pmes->fastest].cycles)
    {
        pmes->fastest = pmes->cur;
    }
    cycles_fast = pmes->setup[pmes->fastest].cycles;
    /* Check in stage 0 if we should stop scanning grids.
     * Stop when the time is more than SLOW_FAC longer than the fastest.
     */
    if (pmes->stage == 0 && pmes->cur > 0 &&
        cycles > pmes->setup[pmes->fastest].cycles*PMES_SLOW_FAC)
    {
        pmes->n = pmes->cur + 1;
        /* Done with scanning, go to stage 1 */
        switch_to_stage1(pmes);
    }
    if (pmes->stage == 0)
    {
        gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];

        do
        {
            if (pmes->cur+1 < pmes->n)
            {
                /* We had already generated the next setup */
                OK = TRUE;
            }
            else
            {
                /* Find the next setup */
                OK = switch_pme_increase_cutoff(pmes, ir->pme_order);
            }

            if (OK && ir->ePBC != epbcNONE)
            {
                OK = (sqr(pmes->setup[pmes->cur+1].rlist)
                      <= max_cutoff2(ir->ePBC, state->box));
            }

            if (OK)
            {
                pmes->cur++;

                if (DOMAINDECOMP(cr))
                {
                    OK = change_dd_cutoff(cr, state, ir,
                                          pmes->setup[pmes->cur].rlist);
                    if (!OK)
                    {
                        /* Failed: do not use this setup */
                        pmes->cur--;
                    }
                }
            }
            if (!OK)
            {
                /* We hit the upper limit for the cut-off,
                 * the setup should not go further than cur.
                 */
                pmes->n = pmes->cur + 1;
                /* Switch to the next stage */
                switch_to_stage1(pmes);
            }
        }
        while (OK &&
               !(pmes->setup[pmes->cur].grid[XX]*
                 pmes->setup[pmes->cur].grid[YY]*
                 pmes->setup[pmes->cur].grid[ZZ] <
                 gridsize_start*PMES_GRID_SCALE_FAC
                 &&
                 pmes->setup[pmes->cur].grid_eff <
                 pmes->setup[pmes->cur-1].grid_eff*PMES_GRID_EFF_FAC));
    }
    if (pmes->stage > 0 && pmes->end == 1)
    {
        pmes->cur   = 0;
        pmes->stage = pmes->nstage;
    }
    else if (pmes->stage > 0 && pmes->end > 1)
    {
        /* If stage = nstage-1:
         *   scan over all setups, rerunning only those setups
         *   which are not much slower than the fastest,
         * else: use the next setup.
         */
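        /* Illustrative walk-through (assumed counts, not from the original):
         * with nstage = 2, stage 1 (= nstage-1) advances cur through
         * [start,end), rerunning only setups whose timing is within
         * PMES_SLOW_FAC of the fastest; when cur wraps past end, stage
         * reaches nstage and the fastest setup is selected for good.
         */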
        do
        {
            pmes->cur++;

            if (pmes->cur == pmes->end)
            {
                pmes->stage++;
                pmes->cur = pmes->start;
            }
        }
        while (pmes->stage == pmes->nstage - 1 &&
               pmes->setup[pmes->cur].count > 0 &&
               pmes->setup[pmes->cur].cycles > cycles_fast*PMES_SLOW_FAC);
    }
    if (pmes->stage == pmes->nstage)
    {
        /* We are done optimizing, use the fastest setup we found */
        pmes->cur = pmes->fastest;
    }
    if (DOMAINDECOMP(cr) && pmes->stage > 0)
    {
        OK = change_dd_cutoff(cr, state, ir, pmes->setup[pmes->cur].rlist);
        if (!OK)
        {
            /* Failsafe solution */
            if (pmes->cur > 1 && pmes->stage == pmes->nstage)
            {
                pmes->stage--;
            }
            pmes->fastest = 0;
            pmes->start   = 0;
            pmes->end     = pmes->cur;
            pmes->cur     = pmes->start;
        }
    }
    /* Change the Coulomb cut-off and the PME grid */

    set = &pmes->setup[pmes->cur];

    ic->rcoulomb   = set->rcut;
    ic->rlist      = set->rlist;
    ic->ewaldcoeff = set->coeff;

    if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
    {
        nbnxn_cuda_pmetune_update_param(nbv->cu_nbv, ic);
    }
    else
    {
        init_interaction_const_tables(NULL, ic, nbv->grp[0].kernel_type);
    }

    if (nbv->ngrp > 1)
    {
        init_interaction_const_tables(NULL, ic, nbv->grp[1].kernel_type);
    }
    if (cr->duty & DUTY_PME)
    {
        if (pmes->setup[pmes->cur].pmedata == NULL)
        {
            /* Generate a new PME data structure,
             * copying part of the old pointers.
             */
            gmx_pme_reinit(&set->pmedata,
                           cr, pmes->setup[0].pmedata, ir,
                           set->grid);
        }
        *pmedata = set->pmedata;
    }
    else
    {
        /* Tell our PME-only node to switch grid */
        gmx_pme_send_switch(cr, set->grid, set->coeff);
    }
    if (debug)
    {
        print_grid(NULL, debug, "", "switched to", set, -1);
    }

    if (pmes->stage == pmes->nstage)
    {
        print_grid(fp_err, fp_log, "", "optimal", set, -1);
    }

    return TRUE;
}

void restart_switch_pme(pme_switch_t pmes, int n)