/* OpenACC parallelism dimensions clauses: num_gangs, num_workers,
   vector_length.  */
4 /* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
6 /* { dg-additional-options "--param=openacc-kernels=decompose" } */
8 /* { dg-additional-options "-fopt-info-all-omp" }
9 { dg-additional-options "-foffload=-fopt-info-all-omp" } */
11 /* { dg-additional-options "--param=openacc-privatization=noisy" }
12 { dg-additional-options "-foffload=--param=openacc-privatization=noisy" }
13 Prune a few: uninteresting, and potentially varying depending on GCC configuration (data types):
14 { dg-prune-output {note: variable 'D\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} } */
16 /* It's only with Tcl 8.5 (released in 2007) that "the variable 'varName'
17 passed to 'incr' may be unset, and in that case, it will be set to [...]",
18 so to maintain compatibility with earlier Tcl releases, we manually
19 initialize counter variables:
20 { dg-line l_dummy[variable c_compute 0 c_loop_i 0 c_loop_j 0 c_loop_k 0] }
21 { dg-message dummy {} { target iN-VAl-Id } l_dummy } to avoid
22 "WARNING: dg-line var l_dummy defined, but not used". */
24 /* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
25 aspects of that functionality. */
27 /* See also '../libgomp.oacc-fortran/parallel-dims.f90'. */
31 #include <gomp-constants.h>
33 #pragma acc routine seq
34 inline __attribute__ ((always_inline
))
35 static int acc_gang ()
37 return __builtin_goacc_parlevel_id (GOMP_DIM_GANG
);
40 #pragma acc routine seq
41 inline __attribute__ ((always_inline
))
42 static int acc_worker ()
44 return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER
);
47 #pragma acc routine seq
48 inline __attribute__ ((always_inline
))
49 static int acc_vector ()
51 return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR
);
62 acc_init (acc_device_default
);
64 /* OpenACC parallel construct. */
66 /* Non-positive value. */
71 /* { dg-warning {'num_gangs' value must be positive} {} { target c } .-1 } */
72 int gangs_actual
= GANGS
;
73 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
74 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
75 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
76 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
78 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
80 /* { dg-note {in expansion of macro 'GANGS'} {} { target c } .-1 } */
81 /* { dg-warning {'num_gangs' value must be positive} {} { target c++ } .-2 } */
82 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
84 /* We're actually executing with num_gangs (1). */
86 for (int i
= N
* gangs_actual
; i
> -N
* gangs_actual
; --i
)
88 gangs_min
= gangs_max
= acc_gang ();
89 workers_min
= workers_max
= acc_worker ();
90 vectors_min
= vectors_max
= acc_vector ();
93 if (gangs_actual
!= 1)
95 if (gangs_min
!= 0 || gangs_max
!= gangs_actual
- 1
96 || workers_min
!= 0 || workers_max
!= 0
97 || vectors_min
!= 0 || vectors_max
!= 0)
105 /* { dg-warning {'num_gangs' value must be positive} {} { target c } .-1 } */
106 int gangs_actual
= GANGS
;
107 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
108 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
109 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
110 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
111 copy (gangs_actual) \
113 /* { dg-note {in expansion of macro 'GANGS'} {} { target c } .-1 } */
114 /* { dg-warning {'num_gangs' value must be positive} {} { target c++ } .-2 } */
115 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
116 /* { dg-warning {region contains gang partitioned code but is not gang partitioned} {} { target *-*-* } l_compute$c_compute } */
118 /* We're actually executing with num_gangs (1). */
120 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
122 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
123 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
124 /* { dg-optimized {assigned OpenACC gang loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
125 for (int i
= N
* gangs_actual
; i
> -N
* gangs_actual
; --i
)
127 gangs_min
= gangs_max
= acc_gang ();
128 workers_min
= workers_max
= acc_worker ();
129 vectors_min
= vectors_max
= acc_vector ();
132 if (gangs_actual
!= 1)
134 if (gangs_min
!= 0 || gangs_max
!= gangs_actual
- 1
135 || workers_min
!= 0 || workers_max
!= 0
136 || vectors_min
!= 0 || vectors_max
!= 0)
144 /* { dg-warning {'num_workers' value must be positive} {} { target c } .-1 } */
145 int workers_actual
= WORKERS
;
146 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
147 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
148 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
149 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
150 copy (workers_actual) \
151 num_workers (WORKERS)
152 /* { dg-note {in expansion of macro 'WORKERS'} {} { target c } .-1 } */
153 /* { dg-warning {'num_workers' value must be positive} {} { target c++ } .-2 } */
154 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
155 /* { dg-warning {region contains worker partitioned code but is not worker partitioned} {} { target *-*-* } l_compute$c_compute } */
157 /* We're actually executing with num_workers (1). */
159 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
161 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
162 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
163 /* { dg-optimized {assigned OpenACC worker loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
164 for (int i
= N
* workers_actual
; i
> -N
* workers_actual
; --i
)
166 gangs_min
= gangs_max
= acc_gang ();
167 workers_min
= workers_max
= acc_worker ();
168 vectors_min
= vectors_max
= acc_vector ();
171 if (workers_actual
!= 1)
173 if (gangs_min
!= 0 || gangs_max
!= 0
174 || workers_min
!= 0 || workers_max
!= workers_actual
- 1
175 || vectors_min
!= 0 || vectors_max
!= 0)
183 /* { dg-warning {'vector_length' value must be positive} {} { target c } .-1 } */
184 int vectors_actual
= VECTORS
;
185 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
186 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
187 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
188 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
189 copy (vectors_actual) \
190 vector_length (VECTORS)
191 /* { dg-note {in expansion of macro 'VECTORS'} {} { target c } .-1 } */
192 /* { dg-warning {'vector_length' value must be positive} {} { target c++ } .-2 } */
193 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
194 /* { dg-warning {region contains vector partitioned code but is not vector partitioned} {} { target *-*-* } l_compute$c_compute } */
195 /* { dg-warning {using 'vector_length \(32\)', ignoring 1} {} { target openacc_nvidia_accel_selected } l_compute$c_compute } */
197 /* We're actually executing with vector_length (1), just the GCC nvptx
198 back end enforces vector_length (32). */
199 if (acc_on_device (acc_device_nvidia
))
200 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
201 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
205 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
207 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
208 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
209 /* { dg-optimized {assigned OpenACC vector loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
210 for (int i
= N
* vectors_actual
; i
> -N
* vectors_actual
; --i
)
212 gangs_min
= gangs_max
= acc_gang ();
213 workers_min
= workers_max
= acc_worker ();
214 vectors_min
= vectors_max
= acc_vector ();
217 if (acc_get_device_type () == acc_device_nvidia
)
219 if (vectors_actual
!= 32)
223 if (vectors_actual
!= 1)
225 if (gangs_min
!= 0 || gangs_max
!= 0
226 || workers_min
!= 0 || workers_max
!= 0
227 || vectors_min
!= 0 || vectors_max
!= vectors_actual
- 1)
237 /* There is no actual limit for the number of gangs, so we try with a
238 rather high value. */
240 int gangs_actual
= gangs
;
241 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
242 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
243 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
244 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
245 copy (gangs_actual) \
246 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
248 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
249 /* { dg-bogus {warning: region is gang partitioned but does not contain gang partitioned code} {TODO 'reduction'} { xfail *-*-* } l_compute$c_compute } */
251 if (acc_on_device (acc_device_host
))
252 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
253 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
255 /* We're actually executing with num_gangs (1). */
258 /* As we're executing GR not GP, don't multiply with a "gangs_actual"
260 for (int i
= N
/* * gangs_actual */; i
> -N
/* * gangs_actual */; --i
)
262 gangs_min
= gangs_max
= acc_gang ();
263 workers_min
= workers_max
= acc_worker ();
264 vectors_min
= vectors_max
= acc_vector ();
267 if (gangs_actual
< 1)
269 if (gangs_min
!= 0 || gangs_max
!= gangs_actual
- 1
270 || workers_min
!= 0 || workers_max
!= 0
271 || vectors_min
!= 0 || vectors_max
!= 0)
277 /* There is no actual limit for the number of gangs, so we try with a
278 rather high value. */
280 int gangs_actual
= gangs
;
281 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
282 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
283 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
284 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
285 copy (gangs_actual) \
287 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
289 if (acc_on_device (acc_device_host
))
290 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
291 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
293 /* We're actually executing with num_gangs (1). */
296 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
298 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
299 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
300 /* { dg-optimized {assigned OpenACC gang loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
301 for (int i
= N
* gangs_actual
; i
> -N
* gangs_actual
; --i
)
303 gangs_min
= gangs_max
= acc_gang ();
304 workers_min
= workers_max
= acc_worker ();
305 vectors_min
= vectors_max
= acc_vector ();
308 if (gangs_actual
< 1)
310 if (gangs_min
!= 0 || gangs_max
!= gangs_actual
- 1
311 || workers_min
!= 0 || workers_max
!= 0
312 || vectors_min
!= 0 || vectors_max
!= 0)
318 /* We try with an outrageously large value. */
319 #define WORKERS 2 << 20
320 int workers_actual
= WORKERS
;
321 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
322 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
323 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
324 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
325 copy (workers_actual) \
326 num_workers (WORKERS)
327 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
328 /* { dg-warning {using 'num_workers \(32\)', ignoring 2097152} {} { target openacc_nvidia_accel_selected } l_compute$c_compute } */
330 if (acc_on_device (acc_device_host
))
331 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
332 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
334 /* We're actually executing with num_workers (1). */
337 else if (acc_on_device (acc_device_nvidia
))
338 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
339 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
341 /* The GCC nvptx back end enforces num_workers (32). */
344 else if (acc_on_device (acc_device_radeon
))
345 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
346 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
348 /* The GCC GCN back end is limited to num_workers (16). */
353 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
355 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
356 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
357 /* { dg-optimized {assigned OpenACC worker loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
358 for (int i
= N
* workers_actual
; i
> -N
* workers_actual
; --i
)
360 gangs_min
= gangs_max
= acc_gang ();
361 workers_min
= workers_max
= acc_worker ();
362 vectors_min
= vectors_max
= acc_vector ();
365 if (workers_actual
< 1)
367 if (gangs_min
!= 0 || gangs_max
!= 0
368 || workers_min
!= 0 || workers_max
!= workers_actual
- 1
369 || vectors_min
!= 0 || vectors_max
!= 0)
376 /* We try with an outrageously large value. */
377 int workers
= 2 << 20;
378 /* For nvptx offloading, this one will not result in "using num_workers
379 (32), ignoring runtime setting", and will in fact try to launch with
380 "num_workers (workers)", which will run into "libgomp: cuLaunchKernel
381 error: invalid argument". So, limit ourselves here. */
382 if (acc_get_device_type () == acc_device_nvidia
)
384 int workers_actual
= workers
;
385 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
386 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
387 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
388 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
389 copy (workers_actual) \
390 num_workers (workers)
391 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
393 if (acc_on_device (acc_device_host
))
394 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
395 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
397 /* We're actually executing with num_workers (1). */
400 else if (acc_on_device (acc_device_nvidia
))
401 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
402 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
404 /* We're actually executing with num_workers (32). */
405 /* workers_actual = 32; */
407 else if (acc_on_device (acc_device_radeon
))
408 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
409 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
411 /* The GCC GCN back end is limited to num_workers (16). */
416 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
418 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
419 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
420 /* { dg-optimized {assigned OpenACC worker loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
421 for (int i
= N
* workers_actual
; i
> -N
* workers_actual
; --i
)
423 gangs_min
= gangs_max
= acc_gang ();
424 workers_min
= workers_max
= acc_worker ();
425 vectors_min
= vectors_max
= acc_vector ();
428 if (workers_actual
< 1)
430 if (gangs_min
!= 0 || gangs_max
!= 0
431 || workers_min
!= 0 || workers_max
!= workers_actual
- 1
432 || vectors_min
!= 0 || vectors_max
!= 0)
438 /* We try with an outrageously large value. */
439 #define VECTORS 2 << 20
440 int vectors_actual
= VECTORS
;
441 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
442 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
443 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
444 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
445 copy (vectors_actual) \
446 vector_length (VECTORS)
447 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
448 /* { dg-warning {using 'vector_length \(1024\)', ignoring 2097152} {} { target openacc_nvidia_accel_selected } l_compute$c_compute } */
450 if (acc_on_device (acc_device_host
))
451 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
452 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
454 /* We're actually executing with vector_length (1). */
457 else if (acc_on_device (acc_device_nvidia
))
458 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
459 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
461 /* The GCC nvptx back end reduces to vector_length (1024). */
462 vectors_actual
= 1024;
464 else if (acc_on_device (acc_device_radeon
))
465 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
466 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
468 /* The GCC GCN back end enforces vector_length (1): autovectorize. */
473 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
475 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
476 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
477 /* { dg-optimized {assigned OpenACC vector loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
478 for (int i
= N
* vectors_actual
; i
> -N
* vectors_actual
; --i
)
480 gangs_min
= gangs_max
= acc_gang ();
481 workers_min
= workers_max
= acc_worker ();
482 vectors_min
= vectors_max
= acc_vector ();
485 if (vectors_actual
< 1)
487 if (gangs_min
!= 0 || gangs_max
!= 0
488 || workers_min
!= 0 || workers_max
!= 0
489 || vectors_min
!= 0 || vectors_max
!= vectors_actual
- 1)
496 /* We try with an outrageously large value. */
497 int vectors
= 2 << 20;
498 int vectors_actual
= vectors
;
499 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
500 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
501 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
502 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
503 copy (vectors_actual) \
504 vector_length (vectors)
505 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
506 /* { dg-warning {using 'vector_length \(32\)', ignoring runtime setting} {} { target openacc_nvidia_accel_selected } l_compute$c_compute } */
508 if (acc_on_device (acc_device_host
))
509 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
510 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
512 /* We're actually executing with vector_length (1). */
515 else if (acc_on_device (acc_device_nvidia
))
516 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
517 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
519 /* The GCC nvptx back end enforces vector_length (32). */
522 else if (acc_on_device (acc_device_radeon
))
523 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
524 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
526 /* Because of the way vectors are implemented for GCN, a vector loop
527 containing a seq routine call will not vectorize calls to that
528 routine. Hence, we'll only get one "vector". */
533 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
535 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
536 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
537 /* { dg-optimized {assigned OpenACC vector loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
538 for (int i
= N
* vectors_actual
; i
> -N
* vectors_actual
; --i
)
540 gangs_min
= gangs_max
= acc_gang ();
541 workers_min
= workers_max
= acc_worker ();
542 vectors_min
= vectors_max
= acc_vector ();
545 if (vectors_actual
< 1)
547 if (gangs_min
!= 0 || gangs_max
!= 0
548 || workers_min
!= 0 || workers_max
!= 0
549 || vectors_min
!= 0 || vectors_max
!= vectors_actual
- 1)
554 /* Composition of GP, WP, VP. */
557 /* With nvptx offloading, multi-level reductions apparently are very slow
558 in the following case. So, limit ourselves here. */
559 if (acc_get_device_type () == acc_device_nvidia
)
561 /* Similar appears to be true for GCN. */
562 if (acc_get_device_type () == acc_device_radeon
)
564 int gangs_actual
= gangs
;
566 int workers_actual
= WORKERS
;
568 int vectors_actual
= VECTORS
;
569 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
570 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
571 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
572 #pragma acc parallel /* { dg-line l_compute[incr c_compute] } */ \
573 copy (gangs_actual, workers_actual, vectors_actual) \
575 num_workers (WORKERS) \
576 vector_length (VECTORS)
577 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
578 /* { dg-warning {using 'vector_length \(32\)', ignoring 11} {} { target openacc_nvidia_accel_selected } l_compute$c_compute } */
580 if (acc_on_device (acc_device_host
))
581 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
582 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
584 /* We're actually executing with num_gangs (1), num_workers (1),
585 vector_length (1). */
590 else if (acc_on_device (acc_device_nvidia
))
591 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
592 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
594 /* The GCC nvptx back end enforces vector_length (32). */
597 else if (acc_on_device (acc_device_radeon
))
598 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
599 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
601 /* See above comments about GCN vectors_actual. */
606 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
608 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
609 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
610 /* { dg-note {variable 'j' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
611 /* { dg-optimized {assigned OpenACC gang loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
612 for (int i
= N
* gangs_actual
; i
> -N
* gangs_actual
; --i
)
613 #pragma acc loop /* { dg-line l_loop_j[incr c_loop_j] } */ \
615 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
616 /* { dg-note {variable 'j' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_j$c_loop_j } */
617 /* { dg-note {variable 'k' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_j$c_loop_j } */
618 /* { dg-optimized {assigned OpenACC worker loop parallelism} {} { target *-*-* } l_loop_j$c_loop_j } */
619 for (int j
= N
* workers_actual
; j
> -N
* workers_actual
; --j
)
620 #pragma acc loop /* { dg-line l_loop_k[incr c_loop_k] } */ \
622 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
623 /* { dg-note {variable 'k' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_k$c_loop_k } */
624 /* { dg-optimized {assigned OpenACC vector loop parallelism} {} { target *-*-* } l_loop_k$c_loop_k } */
625 for (int k
= N
* vectors_actual
; k
> -N
* vectors_actual
; --k
)
627 gangs_min
= gangs_max
= acc_gang ();
628 workers_min
= workers_max
= acc_worker ();
629 vectors_min
= vectors_max
= acc_vector ();
632 if (gangs_min
!= 0 || gangs_max
!= gangs_actual
- 1
633 || workers_min
!= 0 || workers_max
!= workers_actual
- 1
634 || vectors_min
!= 0 || vectors_max
!= vectors_actual
- 1)
641 /* OpenACC kernels construct. */
643 /* We can't test parallelized OpenACC kernels constructs in this way: use of
644 the acc_gang, acc_worker, acc_vector functions will make the construct
648 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
651 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
652 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
653 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
654 #pragma acc kernels /* { dg-line l_compute[incr c_compute] } */
655 /* { dg-note {OpenACC 'kernels' decomposition: variable 'vectors_max' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
656 { dg-note {variable 'vectors_max' made addressable} {} { target *-*-* } l_compute$c_compute } */
657 /* { dg-note {OpenACC 'kernels' decomposition: variable 'vectors_min' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
658 { dg-note {variable 'vectors_min' made addressable} {} { target *-*-* } l_compute$c_compute } */
659 /* { dg-note {OpenACC 'kernels' decomposition: variable 'workers_max' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
660 { dg-note {variable 'workers_max' made addressable} {} { target *-*-* } l_compute$c_compute } */
661 /* { dg-note {OpenACC 'kernels' decomposition: variable 'workers_min' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
662 { dg-note {variable 'workers_min' made addressable} {} { target *-*-* } l_compute$c_compute } */
663 /* { dg-note {OpenACC 'kernels' decomposition: variable 'gangs_max' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
664 { dg-note {variable 'gangs_max' made addressable} {} { target *-*-* } l_compute$c_compute } */
665 /* { dg-note {OpenACC 'kernels' decomposition: variable 'gangs_min' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
666 { dg-note {variable 'gangs_min' made addressable} {} { target *-*-* } l_compute$c_compute } */
668 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
669 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
670 /* { dg-note {forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis} {} { target *-*-* } l_loop_i$c_loop_i } */
671 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
672 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
673 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
674 for (int i
= N
; i
> -N
; --i
)
676 /* This is to make the loop unparallelizable. */
677 asm volatile ("" : : : "memory");
679 gangs_min
= gangs_max
= acc_gang ();
680 workers_min
= workers_max
= acc_worker ();
681 vectors_min
= vectors_max
= acc_vector ();
684 if (gangs_min
!= 0 || gangs_max
!= 1 - 1
685 || workers_min
!= 0 || workers_max
!= 1 - 1
686 || vectors_min
!= 0 || vectors_max
!= 1 - 1)
691 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
692 kernels even when there are explicit num_gangs, num_workers, or
693 vector_length clauses. */
698 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
699 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
700 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
701 #pragma acc kernels /* { dg-line l_compute[incr c_compute] } */ \
703 num_workers (WORKERS) \
704 vector_length (VECTORS)
705 /* { dg-note {OpenACC 'kernels' decomposition: variable 'vectors_max' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
706 { dg-note {variable 'vectors_max' made addressable} {} { target *-*-* } l_compute$c_compute } */
707 /* { dg-note {OpenACC 'kernels' decomposition: variable 'vectors_min' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
708 { dg-note {variable 'vectors_min' made addressable} {} { target *-*-* } l_compute$c_compute } */
709 /* { dg-note {OpenACC 'kernels' decomposition: variable 'workers_max' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
710 { dg-note {variable 'workers_max' made addressable} {} { target *-*-* } l_compute$c_compute } */
711 /* { dg-note {OpenACC 'kernels' decomposition: variable 'workers_min' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
712 { dg-note {variable 'workers_min' made addressable} {} { target *-*-* } l_compute$c_compute } */
713 /* { dg-note {OpenACC 'kernels' decomposition: variable 'gangs_max' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
714 { dg-note {variable 'gangs_max' made addressable} {} { target *-*-* } l_compute$c_compute } */
715 /* { dg-note {OpenACC 'kernels' decomposition: variable 'gangs_min' in 'copy' clause requested to be made addressable} {} { target *-*-* } l_compute$c_compute }
716 { dg-note {variable 'gangs_min' made addressable} {} { target *-*-* } l_compute$c_compute } */
718 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
719 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
720 /* { dg-note {forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis} {} { target *-*-* } l_loop_i$c_loop_i } */
721 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
722 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
723 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
724 for (int i
= N
; i
> -N
; --i
)
726 /* This is to make the loop unparallelizable. */
727 asm volatile ("" : : : "memory");
729 gangs_min
= gangs_max
= acc_gang ();
730 workers_min
= workers_max
= acc_worker ();
731 vectors_min
= vectors_max
= acc_vector ();
734 if (gangs_min
!= 0 || gangs_max
!= 1 - 1
735 || workers_min
!= 0 || workers_max
!= 1 - 1
736 || vectors_min
!= 0 || vectors_max
!= 1 - 1)
743 /* OpenACC serial construct. */
747 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
748 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
749 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
750 #pragma acc serial /* { dg-line l_compute[incr c_compute] } */ \
751 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
752 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
753 /* { dg-warning {using 'vector_length \(32\)', ignoring 1} {} { target openacc_nvidia_accel_selected } l_compute$c_compute } */
755 for (int i
= N
; i
> -N
; i
--)
757 gangs_min
= gangs_max
= acc_gang ();
758 workers_min
= workers_max
= acc_worker ();
759 vectors_min
= vectors_max
= acc_vector ();
762 if (gangs_min
!= 0 || gangs_max
!= 1 - 1
763 || workers_min
!= 0 || workers_max
!= 1 - 1
764 || vectors_min
!= 0 || vectors_max
!= 1 - 1)
768 /* Composition of GP, WP, VP. */
770 int vectors_actual
= 1; /* Implicit 'vector_length (1)' clause. */
771 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
772 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
773 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
774 #pragma acc serial /* { dg-line l_compute[incr c_compute] } */ \
775 copy (vectors_actual) \
776 copy (gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max)
777 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */
778 /* { dg-bogus {warning: region contains gang partitioned code but is not gang partitioned} {TODO 'serial'} { xfail *-*-* } l_compute$c_compute }
779 { dg-bogus {warning: region contains worker partitioned code but is not worker partitioned} {TODO 'serial'} { xfail *-*-* } l_compute$c_compute }
780 { dg-bogus {warning: region contains vector partitioned code but is not vector partitioned} {TODO 'serial'} { xfail *-*-* } l_compute$c_compute } */
781 /* { dg-warning {using 'vector_length \(32\)', ignoring 1} {} { target openacc_nvidia_accel_selected } l_compute$c_compute } */
783 if (acc_on_device (acc_device_nvidia
))
784 /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 }
785 ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */
787 /* The GCC nvptx back end enforces vector_length (32). */
788 /* It's unclear if that's actually permissible here;
789 <https://github.com/OpenACC/openacc-spec/issues/238> "OpenACC
790 'serial' construct might not actually be serial". */
793 #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \
795 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
796 /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
797 /* { dg-note {variable 'j' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */
798 /* { dg-optimized {assigned OpenACC gang loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */
799 for (int i
= N
; i
> -N
; i
--)
800 #pragma acc loop /* { dg-line l_loop_j[incr c_loop_j] } */ \
802 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
803 /* { dg-note {variable 'j' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_j$c_loop_j } */
804 /* { dg-note {variable 'k' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_j$c_loop_j } */
805 /* { dg-optimized {assigned OpenACC worker loop parallelism} {} { target *-*-* } l_loop_j$c_loop_j } */
806 for (int j
= N
; j
> -N
; j
--)
807 #pragma acc loop /* { dg-line l_loop_k[incr c_loop_k] } */ \
809 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
810 /* { dg-note {variable 'k' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_k$c_loop_k } */
811 /* { dg-optimized {assigned OpenACC vector loop parallelism} {} { target *-*-* } l_loop_k$c_loop_k } */
812 for (int k
= N
* vectors_actual
; k
> -N
* vectors_actual
; k
--)
814 gangs_min
= gangs_max
= acc_gang ();
815 workers_min
= workers_max
= acc_worker ();
816 vectors_min
= vectors_max
= acc_vector ();
819 if (acc_get_device_type () == acc_device_nvidia
)
821 if (vectors_actual
!= 32)
825 if (vectors_actual
!= 1)
827 if (gangs_min
!= 0 || gangs_max
!= 1 - 1
828 || workers_min
!= 0 || workers_max
!= 1 - 1
829 || vectors_min
!= 0 || vectors_max
!= vectors_actual
- 1)