1 /* OpenACC parallelism dimensions clauses: num_gangs, num_workers,
7 /* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
8 not behaving as expected for -O0. */
9 #pragma acc routine seq
10 static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
12 if (acc_on_device ((int) acc_device_host
))
14 else if (acc_on_device ((int) acc_device_nvidia
))
17 asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r
));
24 #pragma acc routine seq
25 static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
27 if (acc_on_device ((int) acc_device_host
))
29 else if (acc_on_device ((int) acc_device_nvidia
))
32 asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r
));
39 #pragma acc routine seq
40 static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
42 if (acc_on_device ((int) acc_device_host
))
44 else if (acc_on_device ((int) acc_device_nvidia
))
47 asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r
));
57 acc_init (acc_device_default
);
59 /* Non-positive value. */
63 #define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
64 int gangs_actual
= GANGS
;
65 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
66 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
67 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
68 #pragma acc parallel copy (gangs_actual) \
69 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
70 num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
72 /* We're actually executing with num_gangs (1). */
74 for (int i
= 100 * gangs_actual
; i
> -100 * gangs_actual
; --i
)
76 /* <https://gcc.gnu.org/PR80547>. */
78 gangs_min
= gangs_max
= acc_gang ();
79 workers_min
= workers_max
= acc_worker ();
80 vectors_min
= vectors_max
= acc_vector ();
82 int gangs
= acc_gang ();
83 gangs_min
= (gangs_min
< gangs
) ? gangs_min
: gangs
;
84 gangs_max
= (gangs_max
> gangs
) ? gangs_max
: gangs
;
85 int workers
= acc_worker ();
86 workers_min
= (workers_min
< workers
) ? workers_min
: workers
;
87 workers_max
= (workers_max
> workers
) ? workers_max
: workers
;
88 int vectors
= acc_vector ();
89 vectors_min
= (vectors_min
< vectors
) ? vectors_min
: vectors
;
90 vectors_max
= (vectors_max
> vectors
) ? vectors_max
: vectors
;
94 if (gangs_actual
!= 1)
96 if (gangs_min
!= 0 || gangs_max
!= gangs_actual
- 1
97 || workers_min
!= 0 || workers_max
!= 0
98 || vectors_min
!= 0 || vectors_max
!= 0)
105 #define GANGS 0 /* { dg-warning "'num_gangs' value must be positive" "" { target c } } */
106 int gangs_actual
= GANGS
;
107 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
108 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
109 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
110 #pragma acc parallel copy (gangs_actual) \
111 num_gangs (GANGS) /* { dg-warning "'num_gangs' value must be positive" "" { target c++ } } */
113 /* We're actually executing with num_gangs (1). */
115 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
116 for (int i
= 100 * gangs_actual
; i
> -100 * gangs_actual
; --i
)
118 gangs_min
= gangs_max
= acc_gang ();
119 workers_min
= workers_max
= acc_worker ();
120 vectors_min
= vectors_max
= acc_vector ();
123 if (gangs_actual
!= 1)
125 if (gangs_min
!= 0 || gangs_max
!= gangs_actual
- 1
126 || workers_min
!= 0 || workers_max
!= 0
127 || vectors_min
!= 0 || vectors_max
!= 0)
134 #define WORKERS 0 /* { dg-warning "'num_workers' value must be positive" "" { target c } } */
135 int workers_actual
= WORKERS
;
136 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
137 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
138 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
139 #pragma acc parallel copy (workers_actual) \
140 num_workers (WORKERS) /* { dg-warning "'num_workers' value must be positive" "" { target c++ } } */
142 /* We're actually executing with num_workers (1). */
144 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
145 for (int i
= 100 * workers_actual
; i
> -100 * workers_actual
; --i
)
147 gangs_min
= gangs_max
= acc_gang ();
148 workers_min
= workers_max
= acc_worker ();
149 vectors_min
= vectors_max
= acc_vector ();
152 if (workers_actual
!= 1)
154 if (gangs_min
!= 0 || gangs_max
!= 0
155 || workers_min
!= 0 || workers_max
!= workers_actual
- 1
156 || vectors_min
!= 0 || vectors_max
!= 0)
163 #define VECTORS 0 /* { dg-warning "'vector_length' value must be positive" "" { target c } } */
164 int vectors_actual
= VECTORS
;
165 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
166 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
167 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
168 #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_configured } } */ \
169 vector_length (VECTORS) /* { dg-warning "'vector_length' value must be positive" "" { target c++ } } */
171 /* We're actually executing with vector_length (1), just the GCC nvptx
172 back end enforces vector_length (32). */
173 if (acc_on_device (acc_device_nvidia
))
177 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
178 for (int i
= 100 * vectors_actual
; i
> -100 * vectors_actual
; --i
)
180 gangs_min
= gangs_max
= acc_gang ();
181 workers_min
= workers_max
= acc_worker ();
182 vectors_min
= vectors_max
= acc_vector ();
185 if (acc_get_device_type () == acc_device_nvidia
)
187 if (vectors_actual
!= 32)
191 if (vectors_actual
!= 1)
193 if (gangs_min
!= 0 || gangs_max
!= 0
194 || workers_min
!= 0 || workers_max
!= 0
195 || vectors_min
!= 0 || vectors_max
!= vectors_actual
- 1)
205 /* There is no actual limit for the number of gangs, so we try with a
206 rather high value. */
208 int gangs_actual
= gangs
;
209 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
210 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
211 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
212 #pragma acc parallel copy (gangs_actual) \
213 reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) \
216 if (acc_on_device (acc_device_host
))
218 /* We're actually executing with num_gangs (1). */
221 /* As we're executing GR not GP, don't multiply with a "gangs_actual"
223 for (int i
= 100 /* * gangs_actual */; i
> -100 /* * gangs_actual */; --i
)
225 gangs_min
= gangs_max
= acc_gang ();
226 workers_min
= workers_max
= acc_worker ();
227 vectors_min
= vectors_max
= acc_vector ();
230 if (gangs_actual
< 1)
232 if (gangs_min
!= 0 || gangs_max
!= gangs_actual
- 1
233 || workers_min
!= 0 || workers_max
!= 0
234 || vectors_min
!= 0 || vectors_max
!= 0)
240 /* There is no actual limit for the number of gangs, so we try with a
241 rather high value. */
243 int gangs_actual
= gangs
;
244 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
245 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
246 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
247 #pragma acc parallel copy (gangs_actual) \
250 if (acc_on_device (acc_device_host
))
252 /* We're actually executing with num_gangs (1). */
255 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
256 for (int i
= 100 * gangs_actual
; i
> -100 * gangs_actual
; --i
)
258 gangs_min
= gangs_max
= acc_gang ();
259 workers_min
= workers_max
= acc_worker ();
260 vectors_min
= vectors_max
= acc_vector ();
263 if (gangs_actual
< 1)
265 if (gangs_min
!= 0 || gangs_max
!= gangs_actual
- 1
266 || workers_min
!= 0 || workers_max
!= 0
267 || vectors_min
!= 0 || vectors_max
!= 0)
273 /* We try with an outrageously large value. */
274 #define WORKERS 2 << 20
275 int workers_actual
= WORKERS
;
276 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
277 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
278 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
279 #pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
280 num_workers (WORKERS)
282 if (acc_on_device (acc_device_host
))
284 /* We're actually executing with num_workers (1). */
287 else if (acc_on_device (acc_device_nvidia
))
289 /* The GCC nvptx back end enforces num_workers (32). */
294 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
295 for (int i
= 100 * workers_actual
; i
> -100 * workers_actual
; --i
)
297 gangs_min
= gangs_max
= acc_gang ();
298 workers_min
= workers_max
= acc_worker ();
299 vectors_min
= vectors_max
= acc_vector ();
302 if (workers_actual
< 1)
304 if (gangs_min
!= 0 || gangs_max
!= 0
305 || workers_min
!= 0 || workers_max
!= workers_actual
- 1
306 || vectors_min
!= 0 || vectors_max
!= 0)
313 /* We try with an outrageously large value. */
314 int workers
= 2 << 20;
315 /* For nvptx offloading, this one will not result in "using num_workers
316 (32), ignoring runtime setting", and will in fact try to launch with
317 "num_workers (workers)", which will run into "libgomp: cuLaunchKernel
318 error: invalid argument". So, limit ourselves here. */
319 if (acc_get_device_type () == acc_device_nvidia
)
321 int workers_actual
= workers
;
322 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
323 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
324 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
325 #pragma acc parallel copy (workers_actual) \
326 num_workers (workers)
328 if (acc_on_device (acc_device_host
))
330 /* We're actually executing with num_workers (1). */
333 else if (acc_on_device (acc_device_nvidia
))
335 /* We're actually executing with num_workers (32). */
336 /* workers_actual = 32; */
340 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
341 for (int i
= 100 * workers_actual
; i
> -100 * workers_actual
; --i
)
343 gangs_min
= gangs_max
= acc_gang ();
344 workers_min
= workers_max
= acc_worker ();
345 vectors_min
= vectors_max
= acc_vector ();
348 if (workers_actual
< 1)
350 if (gangs_min
!= 0 || gangs_max
!= 0
351 || workers_min
!= 0 || workers_max
!= workers_actual
- 1
352 || vectors_min
!= 0 || vectors_max
!= 0)
358 /* We try with an outrageously large value. */
359 #define VECTORS 2 << 20
360 int vectors_actual
= VECTORS
;
361 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
362 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
363 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
364 #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_configured } } */ \
365 vector_length (VECTORS)
367 if (acc_on_device (acc_device_host
))
369 /* We're actually executing with vector_length (1). */
372 else if (acc_on_device (acc_device_nvidia
))
374 /* The GCC nvptx back end enforces vector_length (32). */
379 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
380 for (int i
= 100 * vectors_actual
; i
> -100 * vectors_actual
; --i
)
382 gangs_min
= gangs_max
= acc_gang ();
383 workers_min
= workers_max
= acc_worker ();
384 vectors_min
= vectors_max
= acc_vector ();
387 if (vectors_actual
< 1)
389 if (gangs_min
!= 0 || gangs_max
!= 0
390 || workers_min
!= 0 || workers_max
!= 0
391 || vectors_min
!= 0 || vectors_max
!= vectors_actual
- 1)
398 /* We try with an outrageously large value. */
399 int vectors
= 2 << 20;
400 int vectors_actual
= vectors
;
401 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
402 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
403 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
404 #pragma acc parallel copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring runtime setting" "" { target openacc_nvidia_accel_configured } } */ \
405 vector_length (vectors)
407 if (acc_on_device (acc_device_host
))
409 /* We're actually executing with vector_length (1). */
412 else if (acc_on_device (acc_device_nvidia
))
414 /* The GCC nvptx back end enforces vector_length (32). */
419 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
420 for (int i
= 100 * vectors_actual
; i
> -100 * vectors_actual
; --i
)
422 gangs_min
= gangs_max
= acc_gang ();
423 workers_min
= workers_max
= acc_worker ();
424 vectors_min
= vectors_max
= acc_vector ();
427 if (vectors_actual
< 1)
429 if (gangs_min
!= 0 || gangs_max
!= 0
430 || workers_min
!= 0 || workers_max
!= 0
431 || vectors_min
!= 0 || vectors_max
!= vectors_actual
- 1)
436 /* Composition of GP, WP, VP. */
439 /* With nvptx offloading, multi-level reductions apparently are very slow
440 in the following case. So, limit ourselves here. */
441 if (acc_get_device_type () == acc_device_nvidia
)
443 int gangs_actual
= gangs
;
445 int workers_actual
= WORKERS
;
447 int vectors_actual
= VECTORS
;
448 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
449 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
450 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
451 #pragma acc parallel copy (gangs_actual, workers_actual, vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 11" "" { target openacc_nvidia_accel_configured } } */ \
453 num_workers (WORKERS) \
454 vector_length (VECTORS)
456 if (acc_on_device (acc_device_host
))
458 /* We're actually executing with num_gangs (1), num_workers (1),
459 vector_length (1). */
464 else if (acc_on_device (acc_device_nvidia
))
466 /* The GCC nvptx back end enforces vector_length (32). */
471 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
472 for (int i
= 100 * gangs_actual
; i
> -100 * gangs_actual
; --i
)
473 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
474 for (int j
= 100 * workers_actual
; j
> -100 * workers_actual
; --j
)
475 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
476 for (int k
= 100 * vectors_actual
; k
> -100 * vectors_actual
; --k
)
478 gangs_min
= gangs_max
= acc_gang ();
479 workers_min
= workers_max
= acc_worker ();
480 vectors_min
= vectors_max
= acc_vector ();
483 if (gangs_min
!= 0 || gangs_max
!= gangs_actual
- 1
484 || workers_min
!= 0 || workers_max
!= workers_actual
- 1
485 || vectors_min
!= 0 || vectors_max
!= vectors_actual
- 1)
492 /* We can't test parallelized OpenACC kernels constructs in this way: use of
493 the acc_gang, acc_worker, acc_vector functions will make the construct
497 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
500 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
501 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
502 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
505 /* This is to make the OpenACC kernels construct unparallelizable. */
506 asm volatile ("" : : : "memory");
508 #pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
509 for (int i
= 100; i
> -100; --i
)
511 gangs_min
= gangs_max
= acc_gang ();
512 workers_min
= workers_max
= acc_worker ();
513 vectors_min
= vectors_max
= acc_vector ();
516 if (gangs_min
!= 0 || gangs_max
!= 1 - 1
517 || workers_min
!= 0 || workers_max
!= 1 - 1
518 || vectors_min
!= 0 || vectors_max
!= 1 - 1)
523 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
524 kernels even when there are explicit num_gangs, num_workers, or
525 vector_length clauses. */
530 int gangs_min
, gangs_max
, workers_min
, workers_max
, vectors_min
, vectors_max
;
531 gangs_min
= workers_min
= vectors_min
= INT_MAX
;
532 gangs_max
= workers_max
= vectors_max
= INT_MIN
;
533 #pragma acc kernels \
535 num_workers (WORKERS) \
536 vector_length (VECTORS)
538 /* This is to make the OpenACC kernels construct unparallelizable. */
539 asm volatile ("" : : : "memory");
541 #pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
542 for (int i
= 100; i
> -100; --i
)
544 gangs_min
= gangs_max
= acc_gang ();
545 workers_min
= workers_max
= acc_worker ();
546 vectors_min
= vectors_max
= acc_vector ();
549 if (gangs_min
!= 0 || gangs_max
!= 1 - 1
550 || workers_min
!= 0 || workers_max
!= 1 - 1
551 || vectors_min
!= 0 || vectors_max
!= 1 - 1)