1 /***********************************************************************
5 The following is a notice of limited availability of the code and
6 Government license and disclaimer which must be included in the
7 prologue of the code and in all source listings of the code.
10 (c) 1977 University of Chicago
12 Permission is hereby granted to use, reproduce, prepare
13 derivative works, and to redistribute to others at no charge. If
14 you distribute a copy or copies of the Software, or you modify a
15 copy or copies of the Software or any portion of it, thus forming
16 a work based on the Software and make and/or distribute copies of
17 such work, you must meet the following conditions:
19 a) If you make a copy of the Software (modified or verbatim)
20 it must include the copyright notice and Government
21 license and disclaimer.
23 b) You must cause the modified Software to carry prominent
24 notices stating that you changed specified portions of
27 This software was authored by:
29 Argonne National Laboratory
30 J. Michalakes: (630) 252-6646; email: michalak@mcs.anl.gov
31 Mathematics and Computer Science Division
32 Argonne National Laboratory, Argonne, IL 60439
34 ARGONNE NATIONAL LABORATORY (ANL), WITH FACILITIES IN THE STATES
35 OF ILLINOIS AND IDAHO, IS OWNED BY THE UNITED STATES GOVERNMENT,
36 AND OPERATED BY THE UNIVERSITY OF CHICAGO UNDER PROVISION OF A
37 CONTRACT WITH THE DEPARTMENT OF ENERGY.
39 GOVERNMENT LICENSE AND DISCLAIMER
41 This computer code material was prepared, in part, as an account
42 of work sponsored by an agency of the United States Government.
43 The Government is granted for itself and others acting on its
44 behalf a paid-up, nonexclusive, irrevocable worldwide license in
45 this data to reproduce, prepare derivative works, distribute
46 copies to the public, perform publicly and display publicly, and
47 to permit others to do so. NEITHER THE UNITED STATES GOVERNMENT
48 NOR ANY AGENCY THEREOF, NOR THE UNIVERSITY OF CHICAGO, NOR ANY OF
49 THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
50 ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
51 COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS,
52 PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD
53 NOT INFRINGE PRIVATELY OWNED RIGHTS.
55 ***************************************************************************/
62 RSL_GET_RUN_INFO - get RSL information about size and shape local allocation.
65 This routine is used to initialize loops over each processor's
66 local partition of the decomposed domain. The routine can be used
67 directly, or RSL-provided macros may be used in the code that expand
68 to the proper calls. Both M4 and CPP macros are provided in the
69 RSL distribution (see LoopMacros).
70 The macro approach is recommended
71 for readability, simplicity,
72 and also to insulate the code from future potential updates in RSL.
73 Of the two sets, the M4 macros are recommended since they
76 RSL returns through the arguments Arg6, Arg7, and Arg8 information
77 for iterating over the local processor partition N-major, M-minor.
78 Argument Arg4 is the number of iterations to cover the part of the
79 domain allocated to the processor. Arg6 contains the
80 J-indices (Arg4 of them) of each I-strip local to the processor.
81 The first index into the local partition is stored at stored at Arg6(3).
82 The locations Arg6(1) and Arg6(2) are placeholders for indices
83 if one wishes to include one or two of the pad or ghost cells in
84 the iteration (see RSL_GET_INFOP).
85 Arg7 contains the starting and ending indices of each I-strip. Likewise,
86 the first actual index in Arg7 is at Arg7(3).
87 The arguments Arg9, Arg10, and
88 Arg11 have the same sense as Arg6, Arg7, and Arg8, except that they
89 provide information for
90 iteration in M-major, N-minor order. Argument Arg5 is
91 the number of major iterations over M.
92 The arrays Arg6, Arg7, Arg8, Arg9, Arg10, and Arg11 store local (memory)
93 indices. The logical (global) indices can be obtained by subtracting
94 Arg12 (for M-dimension indices) or Arg13 (for N-dimension indices).
96 Unlike programs that are implemented in single address
98 relationship between logical and memory
99 indices does not hold for data-domain decomposition over distributed
100 memories. That is, the memory index (the subscripts into a model
101 array) may not be used for testing proximity of the point to a
102 boundary in the logical domain. Further, the relationship between
103 logical and memory indices differs on each processor. The Arg12 and
104 Arg13 arguments are the differences between the local and global
105 indices in the M and N dimensions, respectively, and can be used by
106 the program for converting between logical and memory indices. For
107 example, the local index I in the M dimension is equal to Arg12 plus
110 All of these arrays -- Arg6, Arg7, Arg8, Arg9, Arg10, and Arg11
111 are integers and must have been allocated by the user with size
112 large enough to fit the largest possible run through a dimension.
113 The argument Arg2 is the length of the arrays.
114 The integer Arg1 is an RSL domain descriptor.
117 is the nest level of the domain (mother domain is at nest-level 1),
118 which is not necessary for iteration over the domain but which
119 is information that RSL has available and that is useful to have
120 at the beginning of a module.
124 $ -- original code --
126 $ subroutine F( ... )
130 $ a(i,j) = b(i,j) + c(i,j)
134 $ -- example using M4 macros in LoopMacros.m4 --
136 $ subroutine F( ... )
139 $ RSL_INIT_RUNVARS(d) ! d is an RSL domain descriptor
142 $ a(i,j) = b(i,j) + c(i,j)
146 $ -- example using CPP macros in LoopMacros.inc --
148 $ #include "LoopMacros.cpp"
149 $ subroutine F( ... )
150 $ RSL_DECLARE_RUN_VARS
152 $ RSL_INIT_RUNS(d) ! d is an RSL domain descriptor
153 $ RSL_MAJOR_BOUND(j,3,jl-2)
154 $ RSL_MINOR_BOUND(j,3,jl-2)
155 $ a(i,j) = b(i,j) + c(i,j)
156 $ RSL_END_MINOR_LOOPB
157 $ RSL_END_MAJOR_LOOPB
159 $ -- example with macros expanded --
161 $ subroutine F( ... )
162 $ integer ig,jg,nruni,nrunj,js,is,ie,is2,js2,je2,idif,jdif,nr
163 $ dimension js(512) ,is(512) ,ie(512) ! for N-major iteration
164 $ dimension is2(512) ,js2(512) ,je2(512) ! for M-major iteration
166 $ call rsl_get_run_info( d, 512, nl, nrunj, nruni,
174 $ if ( jg .ge. 3 .and. jg .le. maxj-2 ) then
175 $ do i=is(nr),ie1(nr)
177 $ if ( ig .ge. 3 .and. ig .le. maxi-2 ) then
178 $ a(i,j) = b(i,j) + c(i,j)
187 RSL_GET_RUN_INFOP, LoopMacros.m4
191 RSL_GET_RUN_INFO ( d_p, maxrun_p, nl_p, nrunj_p, nruni_p, js, is, ie, is2, js2, je2, idif_p, jdif_p )
193 d_p /* (I) RSL domain descriptor (input) */
194 ,maxrun_p /* (I) Number of elements in array arguments to this routine */
195 ,nl_p /* (O) Nest level of the domain */
196 ,nrunj_p /* (O) Number of runs through domain in j-major traversal */
197 ,nruni_p ; /* (O) Number of runs through domain in i-minor traversal */
199 js[] /* (O) Local J-index of each run in j-major traversal */
200 ,is[] /* (O) Starting local I-index of each run in j-major traversal */
201 ,ie[] /* (O) Ending local I-index of each run in j-major traversal */
202 ,is2[] /* (O) Local I-index of each run in i-major traversal */
203 ,js2[] /* (O) Starting local J-index of each run in i-major traversal */
204 ,je2[] ; /* (O) Ending local J-index of each run in i-major traversal */
206 idif_p /* (O) Difference between local and global I indices (i-ig). */
207 ,jdif_p ; /* (O) Difference between local and global J indices (j-jg). */
213 RSL_GET_RUN_INFOP ( d_p, &x, maxrun_p, nl_p, nrunj_p, nruni_p,
214 js, is, ie, is2, js2, je2, idif_p, jdif_p,
218 /* additional P argument is the width of pad to allow for */
220 RSL_GET_RUN_INFOP - get RSL information about size and shape local allocation.
223 This routine is similar to RSL_GET_RUN_INFO except that it allows for
224 execution on the extended array pads of the local processor
225 subdomains. This can be useful for trading off computation for
226 communication in the code and can simplify the implementation
227 by allowing fewer modifications for distributed memory
228 parallelism. The argument Arg2 may be set to
229 for one of 3 modes of iteration over the local subdomain ---
230 Arg2 = 2 gives iteration over the local subdomain and the set of
231 ghost points that are two-away from points in the local subdomain,
232 Arg2 = 1 gives iteration over the local subdomain and the set of
233 ghost points that are one-away from points in the local subdomain, and
234 Arg2 = 0 gives iteration over just the local subdomain (no ghost points).
235 that are immediately adjacent to the local processor subdomain, or
236 the set that is within 2 cells of the local processor subdomain. The
237 information for controlling iteration over the region is returned in
238 the arguments Arg5, Arg6, Arg7, Arg8, Arg9, Arg10, Arg11, and Arg12.
240 The M4 RSL loop macros automatically initialize 3 separate sets of
241 these data structures for the 3 available modes of iteration.
244 RSL_GET_RUN_INFO, LoopMacros.m4
246 RSL_GET_RUN_INFOP ( d_p, p_p, maxrun_p, nl_p, nrunj_p, nruni_p,
247 js, is, ie, is2, js2, je2, idif_p, jdif_p,
250 d_p /* (I) RSL domain descriptor (input) */
251 ,p_p /* (I) How many extra pad cells to include (0, 1, or 2) */
252 ,maxrun_p /* (I) Number of elements in array arguments to this routine */
253 ,nl_p /* (O) Nest level of the domain */
254 ,nrunj_p /* (O) Number of runs through domain in j-major traversal */
255 ,nruni_p /* (O) Number of runs through domain in i-minor traversal */
256 ,js /* (OA) Local J-index of each run in j-major traversal */
257 ,is /* (OA) Starting local I-index of each run in j-major traversal */
258 ,ie /* (OA) Ending local I-index of each run in j-major traversal */
259 ,is2 /* (OA) Local I-index of each run in i-major traversal */
260 ,js2 /* (OA) Starting local J-index of each run in i-major traversal */
261 ,je2 /* (OA) Ending local J-index of each run in i-major traversal */
262 ,idif_p /* (O) Difference between local and global I indices (i-ig). */
263 ,jdif_p /* (O) Difference between local and global J indices (j-jg). */
264 ,jg2n /* (OA) Number of run for a global J-index in j-major traversal. */
265 ,ig2n /* (OA) Number of run for a global I-index in i-major traversal. */
272 RSL_TEST_ERR( d < 0 || d >= RSL_MAXDOMAINS,
273 "rsl_get_run_info: bad domain") ;
274 RSL_TEST_ERR( p < 0, "Negative pad arg for RSL_GET_RUN_INFOP") ;
275 if ( p > MAX_KINDPAD )
277 sprintf(mess,"RSL_GET_RUN_INFOP: pad arg (%d) larger than %d",p,MAX_KINDPAD) ;
278 RSL_TEST_ERR( 1, mess ) ;
280 RSL_TEST_ERR( domain_info[d].valid != RSL_VALID,
281 "rsl_init_nextcell: invalid domain") ;
282 if ( domain_info[d].decomposed != 1 )
284 default_decomposition( d_p,
285 &(domain_info[*d_p].loc_m),
286 &(domain_info[*d_p].loc_n) ) ;
289 *nl_p = domain_info[d].nest_level ;
290 *idif_p = domain_info[d].idif ;
291 *jdif_p = domain_info[d].jdif ;
293 *nrunj_p = domain_info[d].nrun[p] ;
294 if ( *maxrun_p < *nrunj_p )
296 *maxrun_p, *nrunj_p ) ;
297 RSL_TEST_ERR( 1, mess ) ;
303 for ( i = 0 ; i < MAX_RUNPAD-p ; i++ )
309 for ( i = 0 ; i < *nrunj_p ; i++ )
311 js[i+(MAX_RUNPAD-p)] = domain_info[d].js[p][i] ;
312 is[i+(MAX_RUNPAD-p)] = domain_info[d].is[p][i] ;
313 ie[i+(MAX_RUNPAD-p)] = domain_info[d].ie[p][i] ;
317 for ( i = 0 ; i < domain_info[d].len_n ; i++ )
319 jg2n[i] = domain_info[d].jg2n[p][i] + MAX_RUNPAD-p;
323 *nruni_p = domain_info[d].nruni[p] ;
324 if ( *maxrun_p < *nruni_p )
327 "rsl_get_run_info: would overwrite user arrays: maxrun (%d) < nruni (%d)",
328 *maxrun_p, *nruni_p ) ;
329 RSL_TEST_ERR( 1, mess ) ;
331 for ( i = 0 ; i < MAX_RUNPAD-p ; i++ )
337 for ( i = 0 ; i < *nruni_p ; i++ )
339 is2[i+(MAX_RUNPAD-p)] = domain_info[d].is2[p][i] ;
340 js2[i+(MAX_RUNPAD-p)] = domain_info[d].js2[p][i] ;
341 je2[i+(MAX_RUNPAD-p)] = domain_info[d].je2[p][i] ;
345 for ( i = 0 ; i < domain_info[d].len_n ; i++ )
347 ig2n[i] = domain_info[d].ig2n[p][i] + MAX_RUNPAD-p;
353 else if ( p > 3 && p <= 4)
357 for ( i = 0 ; i < MAX_RUNPAD-p1 ; i++ )
363 for ( i = 0 ; i < *nrunj_p ; i++ )
365 js[i+(MAX_RUNPAD-0)] = domain_info[d].js[p][i] ;
366 is[i+(MAX_RUNPAD-0)] = domain_info[d].is[p][i] ;
367 ie[i+(MAX_RUNPAD-0)] = domain_info[d].ie[p][i] ;
371 for ( i = 0 ; i < domain_info[d].len_n ; i++ )
373 jg2n[i] = domain_info[d].jg2n[p][i] + MAX_RUNPAD-0;
377 *nruni_p = domain_info[d].nruni[p] ;
378 if ( *maxrun_p < *nruni_p )
381 "rsl_get_run_info: would overwrite user arrays: maxrun (%d) < nruni (%d)",
382 *maxrun_p, *nruni_p ) ;
383 RSL_TEST_ERR( 1, mess ) ;
385 for ( i = 0 ; i < MAX_RUNPAD-0 ; i++ )
391 for ( i = 0 ; i < *nruni_p ; i++ )
393 is2[i+(MAX_RUNPAD-0)] = domain_info[d].is2[p][i] ;
394 js2[i+(MAX_RUNPAD-0)] = domain_info[d].js2[p][i] ;
395 je2[i+(MAX_RUNPAD-0)] = domain_info[d].je2[p][i] ;
399 for ( i = 0 ; i < domain_info[d].len_n ; i++ )
401 ig2n[i] = domain_info[d].ig2n[p][i] + MAX_RUNPAD-0;
408 RSL_REG_RUN_INFOP( d_p, p_p, maxrun_p, nl_p,
413 d_p /* (I) RSL domain descriptor (input) */
414 ,p_p /* (I) How many extra pad cells to include (0, 1, or 2) */
415 ,maxrun_p /* (I) Number of elements in array arguments to this routine */
416 ,nl_p /* (O) Nest level of the domain */
424 int d, i, j, p, cnt ;
428 RSL_TEST_ERR( d < 0 || d >= RSL_MAXDOMAINS,
429 "rsl_get_run_info: bad domain") ;
430 RSL_TEST_ERR( p < 0, "Negative pad arg for RSL_GET_RUN_INFOP") ;
431 if ( p > MAX_KINDPAD )
433 sprintf(mess,"RSL_GET_RUN_INFOP: pad arg (%d) larger than %d",p,MAX_KINDPAD)
435 RSL_TEST_ERR( 1, mess ) ;
437 RSL_TEST_ERR( domain_info[d].valid != RSL_VALID,
438 "rsl_init_nextcell: invalid domain") ;
439 if ( domain_info[d].decomposed != 1 )
441 default_decomposition( d_p,
442 &(domain_info[*d_p].loc_m),
443 &(domain_info[*d_p].loc_n) ) ;
445 RSL_TEST_ERR( domain_info[*d_p].len_n > *maxrun_p,
446 "domain_info[*d_p].len_n > *maxrun_p") ;
447 RSL_TEST_ERR( domain_info[*d_p].len_m > *maxrun_p,
448 "domain_info[*d_p].len_m > *maxrun_p") ;
450 *nl_p = domain_info[d].nest_level ;
451 *idif_p = domain_info[d].idif ;
452 *jdif_p = domain_info[d].jdif ;
454 /* in following code, note assumptions on order of traversal,
455 contiguity of points, and rectangularity of partitions */
457 for ( j=0, cnt=0 ; j < domain_info[*d_p].len_n ; j++ )
459 if ( j+1 < domain_info[d].js2[p][MAX_RUNPAD]-*jdif_p )
461 js[j]=domain_info[d].js2[p][MAX_RUNPAD] ;
464 else if ( j+1 > domain_info[d].je2[p][MAX_RUNPAD]-*jdif_p )
467 je[j]=domain_info[d].je2[p][MAX_RUNPAD] ;
471 js[j]=domain_info[d].js2[p][MAX_RUNPAD] + cnt ;
472 je[j]=domain_info[d].js2[p][MAX_RUNPAD] + cnt ; /* yes -> js2 */
477 for ( i=0, cnt=0 ; i < domain_info[*d_p].len_m ; i++ )
479 if ( i+1 < domain_info[d].is[p][MAX_RUNPAD]-*idif_p )
481 is[i]=domain_info[d].is[p][MAX_RUNPAD] ;
484 else if ( i+1 > domain_info[d].ie[p][MAX_RUNPAD]-*idif_p )
487 ie[i]=domain_info[d].ie[p][MAX_RUNPAD] ;
491 is[i]=domain_info[d].is[p][MAX_RUNPAD] + cnt ;
492 ie[i]=domain_info[d].is[p][MAX_RUNPAD] + cnt ; /* yes -> is */
497 printf("is[%3d] = info[%d].is[%d] + cnt %3d = %d : ",i,d,p,cnt,is[i]);
498 printf("ie[%3d] = %d \n",i,d,p,cnt,ie[i]);