1 /* "Function scope" (top-level block scope) 'static' variables
3 ... inside OpenACC compute construct regions as well as OpenACC 'routine'.
5 This is to document/verify aspects of GCC's observed behavior, not
6 necessarily as it's (intended to be?) restricted by the OpenACC
7 specification. See also PR84991, PR84992, PR90779 etc., and
8 <https://github.com/OpenACC/openacc-spec/issues/372> "C/C++ 'static'
9 variables" (only visible to members of the GitHub OpenACC organization).
12 /* { dg-additional-options "-fopt-info-note-omp" }
13 { dg-additional-options "--param=openacc-privatization=noisy" }
14 { dg-additional-options "-foffload=-fopt-info-note-omp" }
15 { dg-additional-options "-foffload=--param=openacc-privatization=noisy" }
16 for testing/documenting aspects of that functionality. */
18 /* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
19 aspects of that functionality. */
26 #include <gomp-constants.h>
29 #define IF_DEBUG if (0)
32 /* Without explicit 'num_gangs'. */
34 static void t0_c(void)
37 __builtin_printf ("%s\n", __FUNCTION__
);
39 const int i_limit
= 11;
40 const int var_init
= 16;
42 for (int i
= 0; i
< i_limit
; ++i
)
45 int num_gangs_actual
= -1;
46 #pragma acc parallel \
47 reduction(max:num_gangs_actual) \
49 /* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-3 } */
51 num_gangs_actual
= 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG
);
53 static int var
= var_init
;
55 #pragma acc atomic capture
58 /* Irrespective of the order in which the gang-redundant threads
59 execute, 'var' has now been incremented 'num_gangs_actual' times, and
60 the final value captured as 'result'. */
62 /* Without an explicit 'num_gangs' clause GCC assigns 'num_gangs(1)'
63 because it doesn't see any use of gang-level parallelism inside the
65 assert(num_gangs_actual
== 1);
66 assert(result
== var_init
+ num_gangs_actual
* (1 + i
));
71 /* Call a gang-level routine. */
73 static const int t0_r_var_init
= 61;
75 #pragma acc routine gang
76 /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+4 } */
77 /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .+3 } */
78 /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .+2 } */
79 __attribute__((noinline
))
80 static int t0_r_r(void)
82 static int var
= t0_r_var_init
;
85 #pragma acc atomic capture
91 static void t0_r(void)
94 __builtin_printf ("%s\n", __FUNCTION__
);
96 const int i_limit
= 11;
98 for (int i
= 0; i
< i_limit
; ++i
)
101 int num_gangs_actual
= -1;
102 #pragma acc parallel \
103 reduction(max:num_gangs_actual) \
104 reduction(max:result)
106 num_gangs_actual
= 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG
);
110 /* Irrespective of the order in which the gang-redundant threads
111 execute, 'var' has now been incremented 'num_gangs_actual' times, and
112 the final value captured as 'result'. */
114 /* The number of gangs selected by the implemention ought to but must not
115 be bigger than one. */
117 __builtin_printf ("%d: num_gangs_actual: %d\n", i
, num_gangs_actual
);
118 assert(num_gangs_actual
>= 1);
119 assert(result
== t0_r_var_init
+ num_gangs_actual
* (1 + i
));
124 /* Explicit 'num_gangs'. */
126 static void t1_c(void)
129 __builtin_printf ("%s\n", __FUNCTION__
);
131 const int i_limit
= 22;
132 const int num_gangs_request
= 444;
133 const int var_init
= 5;
135 for (int i
= 0; i
< i_limit
; ++i
)
138 int num_gangs_actual
= -1;
139 /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+1 } */
140 #pragma acc parallel \
141 num_gangs(num_gangs_request) \
142 reduction(max:num_gangs_actual) \
143 reduction(max:result)
144 /* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-4 } */
146 num_gangs_actual
= 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG
);
148 static int var
= var_init
;
150 #pragma acc atomic capture
153 /* Irrespective of the order in which the gang-redundant threads
154 execute, 'var' has now been incremented 'num_gangs_actual' times, and
155 the final value captured as 'result'. */
157 if (acc_get_device_type() == acc_device_host
)
158 assert(num_gangs_actual
== 1);
160 assert(num_gangs_actual
== num_gangs_request
);
161 assert(result
== var_init
+ num_gangs_actual
* (1 + i
));
166 /* Check the same routine called from two compute constructs. */
168 static const int t1_r2_var_init
= 166;
170 #pragma acc routine gang
171 /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+4 } */
172 /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .+3 } */
173 /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .+2 } */
174 __attribute__((noinline
))
175 static int t1_r2_r(void)
177 static int var
= t1_r2_var_init
;
180 #pragma acc atomic capture
186 static void t1_r2(void)
189 __builtin_printf ("%s\n", __FUNCTION__
);
191 const int i_limit
= 71;
192 /* The checking assumes the same 'num_gangs' for all compute constructs. */
193 const int num_gangs_request
= 333;
194 int num_gangs_actual
= -1;
195 if (acc_get_device_type() == acc_device_host
)
196 num_gangs_actual
= 1;
199 /* We're assuming that the implementation is able to accomodate the
200 'num_gangs' requested (which really ought to be true for
202 num_gangs_actual
= num_gangs_request
;
205 for (int i
= 0; i
< i_limit
; ++i
)
208 #pragma acc parallel \
209 num_gangs(num_gangs_request) \
210 reduction(max:result_1)
212 result_1
= t1_r2_r();
214 /* Irrespective of the order in which the gang-redundant threads
215 execute, 'var' has now been incremented 'num_gangs_actual' times, and
216 the final value captured as 'result_1'. */
219 __builtin_printf ("%d: result_1: %d\n", i
, result_1
);
220 assert(result_1
== t1_r2_var_init
+ num_gangs_actual
* (1 + (i
* 3 + 0)));
223 #pragma acc parallel \
224 num_gangs(num_gangs_request) \
225 reduction(max:result_2)
227 result_2
= t1_r2_r() + t1_r2_r();
229 /* Irrespective of the order in which the gang-redundant threads
230 execute, 'var' has now been incremented '2 * num_gangs_actual' times.
231 However, the order of the two 't1_r2_r' function calls is not
232 synchronized (between different gang-redundant threads). We thus
233 cannot verify the actual 'result_2' values in this case. */
236 __builtin_printf ("%d: result_2: %d\n", i
, result_2
);
237 if (num_gangs_actual
== 1)
238 /* Per the rationale above, only in this case we can check the actual
240 assert(result_2
== (t1_r2_var_init
+ num_gangs_actual
* (1 + (i
* 3 + 1))
241 + t1_r2_var_init
+ num_gangs_actual
* (1 + (i
* 3 + 2))));
242 /* But we can generally check low and high limits. */
244 /* Must be bigger than '2 * result_1'. */
245 int c
= 2 * result_1
;
247 __builtin_printf (" > %d\n", c
);
248 assert(result_2
> c
);
251 /* ..., but limited by the base value for next 'i'. */
252 int c
= 2 * (t1_r2_var_init
+ num_gangs_actual
* (0 + ((i
+ 1) * 3 + 0)));
254 __builtin_printf (" < %d\n", c
);
255 assert(result_2
< c
);
261 /* Asynchronous execution. */
263 static const int t2_var_init_2
= -55;
265 #pragma acc routine gang
266 /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+4 } */
267 /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .+3 } */
268 /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .+2 } */
269 __attribute__((noinline
))
270 static int t2_r(void)
272 static int var
= t2_var_init_2
;
275 #pragma acc atomic capture
284 __builtin_printf ("%s\n", __FUNCTION__
);
286 const int i_limit
= 12;
287 const int num_gangs_request_1
= 14;
288 const int var_init_1
= 5;
289 int results_1
[i_limit
][num_gangs_request_1
];
290 memset (results_1
, 0, sizeof results_1
);
291 const int num_gangs_request_2
= 5;
292 int results_2
[i_limit
][num_gangs_request_2
];
293 memset (results_2
, 0, sizeof results_2
);
294 const int num_gangs_request_3
= 34;
295 const int var_init_3
= 1250;
296 int results_3
[i_limit
][num_gangs_request_3
];
297 memset (results_3
, 0, sizeof results_3
);
300 copy(results_1, results_2, results_3)
301 /* { dg-note {variable 'num_gangs_request_1\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-2 } */
302 /* { dg-note {variable 'num_gangs_request_2\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-3 } */
303 /* { dg-note {variable 'num_gangs_request_3\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-4 } */
304 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-5 } */
306 for (int i
= 0; i
< i_limit
; ++i
)
308 /* The following 'async' clauses effect asynchronous execution, but
309 using the same async-argument for each compute construct implies that
310 the respective compute constructs' execution is synchronized with
311 itself, meaning that all 'i = 0' execution has finished (on the
312 device) before 'i = 1' is started (on the device), etc. */
314 /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+1 } */
315 #pragma acc parallel \
317 num_gangs(num_gangs_request_1) \
319 /* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-4 } */
320 /* { dg-note {variable 'tmp' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-5 } */
322 static int var
= var_init_1
;
325 #pragma acc atomic capture
328 results_1
[i
][__builtin_goacc_parlevel_id(GOMP_DIM_GANG
)] += tmp
;
331 #pragma acc parallel \
333 num_gangs(num_gangs_request_2) \
336 results_2
[i
][__builtin_goacc_parlevel_id(GOMP_DIM_GANG
)] += t2_r();
339 /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+1 } */
340 #pragma acc parallel \
342 num_gangs(num_gangs_request_3) \
344 /* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-4 } */
345 /* { dg-note {variable 'tmp' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-5 } */
347 static int var
= var_init_3
;
350 #pragma acc atomic capture
353 results_3
[i
][__builtin_goacc_parlevel_id(GOMP_DIM_GANG
)] += tmp
;
358 int num_gangs_actual_1
;
359 int num_gangs_actual_2
;
360 int num_gangs_actual_3
;
361 if (acc_get_device_type() == acc_device_host
)
363 num_gangs_actual_1
= 1;
364 num_gangs_actual_2
= 1;
365 num_gangs_actual_3
= 1;
369 /* We're assuming that the implementation is able to accomodate the
370 'num_gangs' requested (which really ought to be true for
372 num_gangs_actual_1
= num_gangs_request_1
;
373 num_gangs_actual_2
= num_gangs_request_2
;
374 num_gangs_actual_3
= num_gangs_request_3
;
377 /* For 'i = 0', 'results_*[i][0..num_gangs_actual_*]' are expected to each
378 contain one value of '(1 + var_init_*)..(var_init_* + num_gangs_actual_*)',
379 and so on for increasing 'i'. Their order however is unspecified due to
380 the gang-redundant execution. (Thus checking that their sums match.) */
385 for (int i
= 0; i
< i_limit
; ++i
)
388 for (int g
= 0; g
< num_gangs_actual_1
; ++g
)
391 __builtin_printf ("results_1[%d][%d]: %d\n", i
, g
, results_1
[i
][g
]);
392 result_1_
+= results_1
[i
][g
];
395 __builtin_printf ("%d result_1_: %d\n", i
, result_1_
);
396 assert (result_1_
== (((var_init_1
+ num_gangs_actual_1
* (1 + i
)) * (1 + var_init_1
+ num_gangs_actual_1
* (1 + i
)) / 2)
397 - ((var_init_1
+ num_gangs_actual_1
* (0 + i
)) * (1 + var_init_1
+ num_gangs_actual_1
* (0 + i
)) / 2)));
398 result_1
+= result_1_
;
401 for (int g
= 0; g
< num_gangs_actual_2
; ++g
)
404 __builtin_printf ("results_2[%d][%d]: %d\n", i
, g
, results_2
[i
][g
]);
405 result_2_
+= results_2
[i
][g
];
408 __builtin_printf ("%d result_2_: %d\n", i
, result_2_
);
409 assert (result_2_
== (((t2_var_init_2
+ num_gangs_actual_2
* (1 + i
)) * (1 + t2_var_init_2
+ num_gangs_actual_2
* (1 + i
)) / 2)
410 - ((t2_var_init_2
+ num_gangs_actual_2
* (0 + i
)) * (1 + t2_var_init_2
+ num_gangs_actual_2
* (0 + i
)) / 2)));
411 result_2
+= result_2_
;
414 for (int g
= 0; g
< num_gangs_actual_3
; ++g
)
417 __builtin_printf ("results_3[%d][%d]: %d\n", i
, g
, results_3
[i
][g
]);
418 result_3_
+= results_3
[i
][g
];
421 __builtin_printf ("%d result_3_: %d\n", i
, result_3_
);
422 assert (result_3_
== (((var_init_3
+ num_gangs_actual_3
* (1 + i
)) * (1 + var_init_3
+ num_gangs_actual_3
* (1 + i
)) / 2)
423 - ((var_init_3
+ num_gangs_actual_3
* (0 + i
)) * (1 + var_init_3
+ num_gangs_actual_3
* (0 + i
)) / 2)));
424 result_3
+= result_3_
;
427 __builtin_printf ("result_1: %d\n", result_1
);
428 assert (result_1
== (((var_init_1
+ num_gangs_actual_1
* i_limit
) * (1 + var_init_1
+ num_gangs_actual_1
* i_limit
) / 2)
429 - (var_init_1
* (var_init_1
+ 1) / 2)));
431 __builtin_printf ("result_2: %d\n", result_2
);
432 assert (result_2
== (((t2_var_init_2
+ num_gangs_actual_2
* i_limit
) * (1 + t2_var_init_2
+ num_gangs_actual_2
* i_limit
) / 2)
433 - (t2_var_init_2
* (t2_var_init_2
+ 1) / 2)));
435 __builtin_printf ("result_3: %d\n", result_3
);
436 assert (result_3
== (((var_init_3
+ num_gangs_actual_3
* i_limit
) * (1 + var_init_3
+ num_gangs_actual_3
* i_limit
) / 2)
437 - (var_init_3
* (var_init_3
+ 1) / 2)));
441 #pragma acc routine seq
442 __attribute__((noinline
))
443 static int pr84991_1_r_s(int n
)
445 static const int test
[] = {1,2,3,4};
449 static void pr84991_1(void)
453 #pragma acc parallel copy(n)
455 n
[0] = pr84991_1_r_s(n
[0]);
461 static void pr84992_1(void)
465 #pragma acc parallel copy(n)
466 /* { dg-note {variable 'test' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-1 } */
468 static const int test
[] = {1,2,3,4};