14 /* Test of gang-private variables declared in local scope with parallel
21 for (i
= 0; i
< 32; i
++)
24 #pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
28 #pragma acc loop gang(static:1)
29 for (i
= 0; i
< 32; i
++)
32 #pragma acc loop gang(static:1)
33 for (i
= 0; i
< 32; i
++)
35 if (acc_on_device (acc_device_host
))
41 for (i
= 0; i
< 32; i
++)
42 assert (arr
[i
] == 3 + i
* 2);
46 /* Test of worker-private variables declared in a local scope, broadcasting
47 to vector-partitioned mode. Back-to-back worker loops. */
51 int i
, arr
[32 * 32 * 32];
53 for (i
= 0; i
< 32 * 32 * 32; i
++)
56 #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
61 for (i
= 0; i
< 32; i
++)
63 #pragma acc loop worker
64 for (j
= 0; j
< 32; j
++)
69 #pragma acc loop vector
70 for (k
= 0; k
< 32; k
++)
71 arr
[i
* 1024 + j
* 32 + k
] += x
* k
;
74 #pragma acc loop worker
75 for (j
= 0; j
< 32; j
++)
80 #pragma acc loop vector
81 for (k
= 0; k
< 32; k
++)
82 arr
[i
* 1024 + j
* 32 + k
] += x
* k
;
87 for (i
= 0; i
< 32; i
++)
88 for (int j
= 0; j
< 32; j
++)
89 for (int k
= 0; k
< 32; k
++)
91 int idx
= i
* 1024 + j
* 32 + k
;
92 assert (arr
[idx
] == idx
+ (i
^ j
* 3) * k
+ (i
| j
* 5) * k
);
97 /* Test of worker-private variables declared in a local scope, broadcasting
98 to vector-partitioned mode. Successive vector loops. */
102 int i
, arr
[32 * 32 * 32];
104 for (i
= 0; i
< 32 * 32 * 32; i
++)
107 #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
111 #pragma acc loop gang
112 for (i
= 0; i
< 32; i
++)
114 #pragma acc loop worker
115 for (j
= 0; j
< 32; j
++)
120 #pragma acc loop vector
121 for (k
= 0; k
< 32; k
++)
122 arr
[i
* 1024 + j
* 32 + k
] += x
* k
;
126 #pragma acc loop vector
127 for (k
= 0; k
< 32; k
++)
128 arr
[i
* 1024 + j
* 32 + k
] += x
* k
;
133 for (i
= 0; i
< 32; i
++)
134 for (int j
= 0; j
< 32; j
++)
135 for (int k
= 0; k
< 32; k
++)
137 int idx
= i
* 1024 + j
* 32 + k
;
138 assert (arr
[idx
] == idx
+ (i
^ j
* 3) * k
+ (i
| j
* 5) * k
);
143 /* Test of worker-private variables declared in a local scope, broadcasting
144 to vector-partitioned mode. Aggregate worker variable. */
148 int i
, arr
[32 * 32 * 32];
150 for (i
= 0; i
< 32 * 32 * 32; i
++)
153 #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
157 #pragma acc loop gang
158 for (i
= 0; i
< 32; i
++)
160 #pragma acc loop worker
161 for (j
= 0; j
< 32; j
++)
169 #pragma acc loop vector
170 for (k
= 0; k
< 32; k
++)
171 arr
[i
* 1024 + j
* 32 + k
] += pt
.x
* k
;
173 #pragma acc loop vector
174 for (k
= 0; k
< 32; k
++)
175 arr
[i
* 1024 + j
* 32 + k
] += pt
.y
* k
;
180 for (i
= 0; i
< 32; i
++)
181 for (int j
= 0; j
< 32; j
++)
182 for (int k
= 0; k
< 32; k
++)
184 int idx
= i
* 1024 + j
* 32 + k
;
185 assert (arr
[idx
] == idx
+ (i
^ j
* 3) * k
+ (i
| j
* 5) * k
);
190 /* Test of worker-private variables declared in a local scope, broadcasting
191 to vector-partitioned mode. Addressable worker variable. */
195 int i
, arr
[32 * 32 * 32];
197 for (i
= 0; i
< 32 * 32 * 32; i
++)
200 #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
204 #pragma acc loop gang
205 for (i
= 0; i
< 32; i
++)
207 #pragma acc loop worker
208 for (j
= 0; j
< 32; j
++)
217 #pragma acc loop vector
218 for (k
= 0; k
< 32; k
++)
219 arr
[i
* 1024 + j
* 32 + k
] += ptp
->x
* k
;
223 #pragma acc loop vector
224 for (k
= 0; k
< 32; k
++)
225 arr
[i
* 1024 + j
* 32 + k
] += pt
.y
* k
;
230 for (i
= 0; i
< 32; i
++)
231 for (int j
= 0; j
< 32; j
++)
232 for (int k
= 0; k
< 32; k
++)
234 int idx
= i
* 1024 + j
* 32 + k
;
235 assert (arr
[idx
] == idx
+ (i
^ j
* 3) * k
+ (i
| j
* 5) * k
);
240 /* Test of worker-private variables declared in a local scope, broadcasting
241 to vector-partitioned mode. Array worker variable. */
245 int i
, arr
[32 * 32 * 32];
247 for (i
= 0; i
< 32 * 32 * 32; i
++)
250 #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
254 #pragma acc loop gang
255 for (i
= 0; i
< 32; i
++)
257 #pragma acc loop worker
258 for (j
= 0; j
< 32; j
++)
265 #pragma acc loop vector
266 for (k
= 0; k
< 32; k
++)
267 arr
[i
* 1024 + j
* 32 + k
] += pt
[0] * k
;
271 #pragma acc loop vector
272 for (k
= 0; k
< 32; k
++)
273 arr
[i
* 1024 + j
* 32 + k
] += pt
[1] * k
;
278 for (i
= 0; i
< 32; i
++)
279 for (int j
= 0; j
< 32; j
++)
280 for (int k
= 0; k
< 32; k
++)
282 int idx
= i
* 1024 + j
* 32 + k
;
283 assert (arr
[idx
] == idx
+ (i
^ j
* 3) * k
+ (i
| j
* 5) * k
);
288 /* Test of gang-private variables declared on loop directive. */
292 int x
= 5, i
, arr
[32];
294 for (i
= 0; i
< 32; i
++)
297 #pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
299 #pragma acc loop gang private(x)
300 for (i
= 0; i
< 32; i
++)
307 for (i
= 0; i
< 32; i
++)
308 assert (arr
[i
] == i
* 3);
312 /* Test of gang-private variables declared on loop directive, with broadcasting
313 to partitioned workers. */
317 int x
= 5, i
, arr
[32 * 32];
319 for (i
= 0; i
< 32 * 32; i
++)
322 #pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
324 #pragma acc loop gang private(x)
325 for (i
= 0; i
< 32; i
++)
329 #pragma acc loop worker
330 for (int j
= 0; j
< 32; j
++)
331 arr
[i
* 32 + j
] += x
;
335 for (i
= 0; i
< 32 * 32; i
++)
336 assert (arr
[i
] == i
+ (i
/ 32) * 2);
340 /* Test of gang-private variables declared on loop directive, with broadcasting
341 to partitioned vectors. */
345 int x
= 5, i
, arr
[32 * 32];
347 for (i
= 0; i
< 32 * 32; i
++)
350 #pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
352 #pragma acc loop gang private(x)
353 for (i
= 0; i
< 32; i
++)
357 #pragma acc loop vector
358 for (int j
= 0; j
< 32; j
++)
359 arr
[i
* 32 + j
] += x
;
363 for (i
= 0; i
< 32 * 32; i
++)
364 assert (arr
[i
] == i
+ (i
/ 32) * 2);
368 /* Test of gang-private addressable variable declared on loop directive, with
369 broadcasting to partitioned workers. */
373 int x
= 5, i
, arr
[32 * 32];
375 for (i
= 0; i
< 32 * 32; i
++)
378 #pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
380 #pragma acc loop gang private(x)
381 for (i
= 0; i
< 32; i
++)
387 #pragma acc loop worker
388 for (int j
= 0; j
< 32; j
++)
389 arr
[i
* 32 + j
] += x
;
395 for (i
= 0; i
< 32 * 32; i
++)
396 assert (arr
[i
] == i
+ (i
/ 32) * 2);
400 /* Test of gang-private array variable declared on loop directive, with
401 broadcasting to partitioned workers. */
405 int x
[8], i
, arr
[32 * 32];
407 for (i
= 0; i
< 32 * 32; i
++)
410 #pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
412 #pragma acc loop gang private(x)
413 for (i
= 0; i
< 32; i
++)
415 for (int j
= 0; j
< 8; j
++)
418 #pragma acc loop worker
419 for (int j
= 0; j
< 32; j
++)
420 arr
[i
* 32 + j
] += x
[j
% 8];
424 for (i
= 0; i
< 32 * 32; i
++)
425 assert (arr
[i
] == i
+ (i
% 8) * 2);
429 /* Test of gang-private aggregate variable declared on loop directive, with
430 broadcasting to partitioned workers. */
437 for (i
= 0; i
< 32 * 32; i
++)
440 #pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
442 #pragma acc loop gang private(pt)
443 for (i
= 0; i
< 32; i
++)
450 #pragma acc loop worker
451 for (int j
= 0; j
< 32; j
++)
452 arr
[i
* 32 + j
] += pt
.x
+ pt
.y
+ pt
.z
+ pt
.attr
[5];
456 for (i
= 0; i
< 32 * 32; i
++)
457 assert (arr
[i
] == i
+ (i
/ 32) * 13);
461 /* Test of vector-private variables declared on loop directive. */
465 int x
, i
, arr
[32 * 32 * 32];
467 for (i
= 0; i
< 32 * 32 * 32; i
++)
470 #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
474 #pragma acc loop gang
475 for (i
= 0; i
< 32; i
++)
477 #pragma acc loop worker
478 for (j
= 0; j
< 32; j
++)
482 #pragma acc loop vector private(x)
483 for (k
= 0; k
< 32; k
++)
486 arr
[i
* 1024 + j
* 32 + k
] += x
* k
;
489 #pragma acc loop vector private(x)
490 for (k
= 0; k
< 32; k
++)
493 arr
[i
* 1024 + j
* 32 + k
] += x
* k
;
499 for (i
= 0; i
< 32; i
++)
500 for (int j
= 0; j
< 32; j
++)
501 for (int k
= 0; k
< 32; k
++)
503 int idx
= i
* 1024 + j
* 32 + k
;
504 assert (arr
[idx
] == idx
+ (i
^ j
* 3) * k
+ (i
| j
* 5) * k
);
509 /* Test of vector-private variables declared on loop directive. Array type. */
513 int pt
[2], i
, arr
[32 * 32 * 32];
515 for (i
= 0; i
< 32 * 32 * 32; i
++)
518 #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
522 #pragma acc loop gang
523 for (i
= 0; i
< 32; i
++)
525 #pragma acc loop worker
526 for (j
= 0; j
< 32; j
++)
530 #pragma acc loop vector private(pt)
531 for (k
= 0; k
< 32; k
++)
535 arr
[i
* 1024 + j
* 32 + k
] += pt
[0] * k
;
536 arr
[i
* 1024 + j
* 32 + k
] += pt
[1] * k
;
542 for (i
= 0; i
< 32; i
++)
543 for (int j
= 0; j
< 32; j
++)
544 for (int k
= 0; k
< 32; k
++)
546 int idx
= i
* 1024 + j
* 32 + k
;
547 assert (arr
[idx
] == idx
+ (i
^ j
* 3) * k
+ (i
| j
* 5) * k
);
552 /* Test of worker-private variables declared on a loop directive. */
556 int x
= 5, i
, arr
[32 * 32];
558 for (i
= 0; i
< 32 * 32; i
++)
561 #pragma acc parallel copy(arr) num_gangs(32) num_workers(8) vector_length(32)
565 #pragma acc loop gang
566 for (i
= 0; i
< 32; i
++)
568 #pragma acc loop worker private(x)
569 for (j
= 0; j
< 32; j
++)
572 /* Try to ensure 'x' accesses doesn't get optimized into a
574 __asm__
__volatile__ ("");
575 arr
[i
* 32 + j
] += x
;
580 for (i
= 0; i
< 32 * 32; i
++)
581 assert (arr
[i
] == i
+ ((i
/ 32) ^ (i
% 32) * 3));
585 /* Test of worker-private variables declared on a loop directive, broadcasting
586 to vector-partitioned mode. */
590 int x
= 5, i
, arr
[32 * 32 * 32];
592 for (i
= 0; i
< 32 * 32 * 32; i
++)
595 #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
599 #pragma acc loop gang
600 for (i
= 0; i
< 32; i
++)
602 #pragma acc loop worker private(x)
603 for (j
= 0; j
< 32; j
++)
608 #pragma acc loop vector
609 for (k
= 0; k
< 32; k
++)
610 arr
[i
* 1024 + j
* 32 + k
] += x
* k
;
615 for (i
= 0; i
< 32; i
++)
616 for (int j
= 0; j
< 32; j
++)
617 for (int k
= 0; k
< 32; k
++)
619 int idx
= i
* 1024 + j
* 32 + k
;
620 assert (arr
[idx
] == idx
+ (i
^ j
* 3) * k
);
625 /* Test of worker-private variables declared on a loop directive, broadcasting
626 to vector-partitioned mode. Back-to-back worker loops. */
630 int x
= 5, i
, arr
[32 * 32 * 32];
632 for (i
= 0; i
< 32 * 32 * 32; i
++)
635 #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
639 #pragma acc loop gang
640 for (i
= 0; i
< 32; i
++)
642 #pragma acc loop worker private(x)
643 for (j
= 0; j
< 32; j
++)
648 #pragma acc loop vector
649 for (k
= 0; k
< 32; k
++)
650 arr
[i
* 1024 + j
* 32 + k
] += x
* k
;
653 #pragma acc loop worker private(x)
654 for (j
= 0; j
< 32; j
++)
659 #pragma acc loop vector
660 for (k
= 0; k
< 32; k
++)
661 arr
[i
* 1024 + j
* 32 + k
] += x
* k
;
666 for (i
= 0; i
< 32; i
++)
667 for (int j
= 0; j
< 32; j
++)
668 for (int k
= 0; k
< 32; k
++)
670 int idx
= i
* 1024 + j
* 32 + k
;
671 assert (arr
[idx
] == idx
+ (i
^ j
* 3) * k
+ (i
| j
* 5) * k
);
676 /* Test of worker-private variables declared on a loop directive, broadcasting
677 to vector-partitioned mode. Successive vector loops. */
681 int x
= 5, i
, arr
[32 * 32 * 32];
683 for (i
= 0; i
< 32 * 32 * 32; i
++)
686 #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
690 #pragma acc loop gang
691 for (i
= 0; i
< 32; i
++)
693 #pragma acc loop worker private(x)
694 for (j
= 0; j
< 32; j
++)
699 #pragma acc loop vector
700 for (k
= 0; k
< 32; k
++)
701 arr
[i
* 1024 + j
* 32 + k
] += x
* k
;
705 #pragma acc loop vector
706 for (k
= 0; k
< 32; k
++)
707 arr
[i
* 1024 + j
* 32 + k
] += x
* k
;
712 for (i
= 0; i
< 32; i
++)
713 for (int j
= 0; j
< 32; j
++)
714 for (int k
= 0; k
< 32; k
++)
716 int idx
= i
* 1024 + j
* 32 + k
;
717 assert (arr
[idx
] == idx
+ (i
^ j
* 3) * k
+ (i
| j
* 5) * k
);
722 /* Test of worker-private variables declared on a loop directive, broadcasting
723 to vector-partitioned mode. Addressable worker variable. */
727 int x
= 5, i
, arr
[32 * 32 * 32];
729 for (i
= 0; i
< 32 * 32 * 32; i
++)
732 #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
736 #pragma acc loop gang
737 for (i
= 0; i
< 32; i
++)
739 #pragma acc loop worker private(x)
740 for (j
= 0; j
< 32; j
++)
747 #pragma acc loop vector
748 for (k
= 0; k
< 32; k
++)
749 arr
[i
* 1024 + j
* 32 + k
] += x
* k
;
753 #pragma acc loop vector
754 for (k
= 0; k
< 32; k
++)
755 arr
[i
* 1024 + j
* 32 + k
] += x
* k
;
760 for (i
= 0; i
< 32; i
++)
761 for (int j
= 0; j
< 32; j
++)
762 for (int k
= 0; k
< 32; k
++)
764 int idx
= i
* 1024 + j
* 32 + k
;
765 assert (arr
[idx
] == idx
+ (i
^ j
* 3) * k
+ (i
| j
* 5) * k
);
770 /* Test of worker-private variables declared on a loop directive, broadcasting
771 to vector-partitioned mode. Aggregate worker variable. */
775 int i
, arr
[32 * 32 * 32];
778 for (i
= 0; i
< 32 * 32 * 32; i
++)
781 #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
785 #pragma acc loop gang
786 for (i
= 0; i
< 32; i
++)
788 #pragma acc loop worker private(pt)
789 for (j
= 0; j
< 32; j
++)
796 #pragma acc loop vector
797 for (k
= 0; k
< 32; k
++)
798 arr
[i
* 1024 + j
* 32 + k
] += pt
.x
* k
;
800 #pragma acc loop vector
801 for (k
= 0; k
< 32; k
++)
802 arr
[i
* 1024 + j
* 32 + k
] += pt
.y
* k
;
807 for (i
= 0; i
< 32; i
++)
808 for (int j
= 0; j
< 32; j
++)
809 for (int k
= 0; k
< 32; k
++)
811 int idx
= i
* 1024 + j
* 32 + k
;
812 assert (arr
[idx
] == idx
+ (i
^ j
* 3) * k
+ (i
| j
* 5) * k
);
817 /* Test of worker-private variables declared on loop directive, broadcasting
818 to vector-partitioned mode. Array worker variable. */
822 int i
, arr
[32 * 32 * 32];
825 for (i
= 0; i
< 32 * 32 * 32; i
++)
828 /* "pt" is treated as "present_or_copy" on the parallel directive because it
829 is an array variable. */
830 #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
834 #pragma acc loop gang
835 for (i
= 0; i
< 32; i
++)
837 /* But here, it is made private per-worker. */
838 #pragma acc loop worker private(pt)
839 for (j
= 0; j
< 32; j
++)
845 #pragma acc loop vector
846 for (k
= 0; k
< 32; k
++)
847 arr
[i
* 1024 + j
* 32 + k
] += pt
[0] * k
;
851 #pragma acc loop vector
852 for (k
= 0; k
< 32; k
++)
853 arr
[i
* 1024 + j
* 32 + k
] += pt
[1] * k
;
858 for (i
= 0; i
< 32; i
++)
859 for (int j
= 0; j
< 32; j
++)
860 for (int k
= 0; k
< 32; k
++)
862 int idx
= i
* 1024 + j
* 32 + k
;
863 assert (arr
[idx
] == idx
+ (i
^ j
* 3) * k
+ (i
| j
* 5) * k
);
868 /* Test of gang-private variables declared on the parallel directive. */
872 int x
= 5, i
, arr
[32];
874 for (i
= 0; i
< 32; i
++)
877 #pragma acc parallel private(x) copy(arr) num_gangs(32) num_workers(8) vector_length(32)
879 #pragma acc loop gang(static:1)
880 for (i
= 0; i
< 32; i
++)
883 #pragma acc loop gang(static:1)
884 for (i
= 0; i
< 32; i
++)
886 if (acc_on_device (acc_device_host
))
892 for (i
= 0; i
< 32; i
++)
893 assert (arr
[i
] == 3 + i
* 2);
897 /* Test of gang-private array variable declared on the parallel directive. */
901 int x
[32], i
, arr
[32 * 32];
903 for (i
= 0; i
< 32 * 32; i
++)
906 #pragma acc parallel private(x) copy(arr) num_gangs(32) num_workers(2) vector_length(32)
908 #pragma acc loop gang
909 for (i
= 0; i
< 32; i
++)
912 for (j
= 0; j
< 32; j
++)
915 #pragma acc loop worker
916 for (j
= 0; j
< 32; j
++)
917 arr
[i
* 32 + j
] += x
[31 - j
];
921 for (i
= 0; i
< 32 * 32; i
++)
922 assert (arr
[i
] == i
+ (31 - (i
% 32)) * 2);