1 /* Miscellaneous test cases for gang/worker/vector mode transitions. */
10 /* Test basic vector-partitioned mode transitions. */
14 int n
= 0, arr
[32], i
;
16 for (i
= 0; i
< 32; i
++)
19 #pragma acc parallel copy(n, arr) \
20 num_gangs(1) num_workers(1) vector_length(32)
24 #pragma acc loop vector
25 for (j
= 0; j
< 32; j
++)
32 for (i
= 0; i
< 32; i
++)
37 /* Test vector-partitioned, gang-partitioned mode. */
41 int n
[32], arr
[1024], i
;
43 for (i
= 0; i
< 1024; i
++)
46 for (i
= 0; i
< 32; i
++)
49 #pragma acc parallel copy(n, arr) \
50 num_gangs(32) num_workers(1) vector_length(32)
54 #pragma acc loop gang(static:*)
55 for (j
= 0; j
< 32; j
++)
59 for (j
= 0; j
< 32; j
++)
60 #pragma acc loop vector
61 for (k
= 0; k
< 32; k
++)
64 #pragma acc loop gang(static:*)
65 for (j
= 0; j
< 32; j
++)
69 for (i
= 0; i
< 32; i
++)
72 for (i
= 0; i
< 1024; i
++)
77 /* Test conditions inside vector-partitioned loops. */
81 int n
[32], arr
[1024], i
;
83 for (i
= 0; i
< 1024; i
++)
86 for (i
= 0; i
< 32; i
++)
89 #pragma acc parallel copy(n, arr) \
90 num_gangs(32) num_workers(1) vector_length(32)
94 #pragma acc loop gang(static:*)
95 for (j
= 0; j
< 32; j
++)
99 for (j
= 0; j
< 32; j
++)
101 #pragma acc loop vector
102 for (k
= 0; k
< 32; k
++)
103 if ((arr
[j
* 32 + k
] % 2) != 0)
104 arr
[j
* 32 + k
] *= 2;
107 #pragma acc loop gang(static:*)
108 for (j
= 0; j
< 32; j
++)
112 for (i
= 0; i
< 32; i
++)
115 for (i
= 0; i
< 1024; i
++)
116 assert (arr
[i
] == ((i
% 2) == 0 ? i
: i
* 2));
120 /* Test conditions inside gang-partitioned/vector-partitioned loops. */
124 int n
[32], arr
[1024], i
;
126 for (i
= 0; i
< 1024; i
++)
129 for (i
= 0; i
< 32; i
++)
132 #pragma acc parallel copy(n, arr) \
133 num_gangs(32) num_workers(1) vector_length(32)
137 #pragma acc loop gang(static:*)
138 for (j
= 0; j
< 32; j
++)
141 #pragma acc loop gang vector
142 for (j
= 0; j
< 1024; j
++)
143 if ((arr
[j
] % 2) != 0)
146 #pragma acc loop gang(static:*)
147 for (j
= 0; j
< 32; j
++)
151 for (i
= 0; i
< 32; i
++)
154 for (i
= 0; i
< 1024; i
++)
155 assert (arr
[i
] == ((i
% 2) == 0 ? i
: i
* 2));
159 /* Test trivial operation of vector-single mode. */
164 #pragma acc parallel copy(n) \
165 num_gangs(1) num_workers(1) vector_length(32)
173 /* Test vector-single, gang-partitioned mode. */
180 for (gangs
= 1; gangs
<= 1024; gangs
<<= 1)
184 for (i
= 0; i
< 1024; i
++)
187 #pragma acc parallel copy(arr) \
188 num_gangs(gangs) num_workers(1) vector_length(32)
191 #pragma acc loop gang
192 for (j
= 0; j
< 1024; j
++)
196 for (i
= 0; i
< 1024; i
++)
197 assert (arr
[i
] == 1);
202 /* Test conditions in vector-single mode. */
209 for (gangs
= 1; gangs
<= 1024; gangs
<<= 1)
213 for (i
= 0; i
< 1024; i
++)
216 #pragma acc parallel copy(arr) \
217 num_gangs(gangs) num_workers(1) vector_length(32)
220 #pragma acc loop gang
221 for (j
= 0; j
< 1024; j
++)
228 for (i
= 0; i
< 1024; i
++)
229 assert (arr[i] == (((i % 3) == 0) ? 1 : 2)); /* ?: is parenthesised: `==' binds tighter than `?:', which would otherwise make this check vacuously true.  */
234 /* Test switch in vector-single mode. */
241 for (gangs
= 1; gangs
<= 1024; gangs
<<= 1)
245 for (i
= 0; i
< 1024; i
++)
248 #pragma acc parallel copy(arr) \
249 num_gangs(gangs) num_workers(1) vector_length(32)
252 #pragma acc loop gang
253 for (j
= 0; j
< 1024; j
++)
256 case 0: arr
[j
] += 1; break;
257 case 1: arr
[j
] += 2; break;
258 case 2: arr
[j
] += 3; break;
259 case 3: arr
[j
] += 4; break;
260 case 4: arr
[j
] += 5; break;
261 default: arr
[j
] += 99;
265 for (i
= 0; i
< 1024; i
++)
266 assert (arr
[i
] == (i
% 5) + 1);
271 /* Test switch in vector-single mode, initialise array on device. */
278 for (i
= 0; i
< 1024; i
++)
281 #pragma acc parallel copy(arr) \
282 num_gangs(1024) num_workers(1) vector_length(32)
286 /* This loop and the one following must be distributed to available gangs
287 in the same way to ensure data dependencies are not violated (hence the
288 "static" clauses). */
289 #pragma acc loop gang(static:*)
290 for (j
= 0; j
< 1024; j
++)
293 #pragma acc loop gang(static:*)
294 for (j
= 0; j
< 1024; j
++)
297 case 0: arr
[j
] += 1; break;
298 case 1: arr
[j
] += 2; break;
299 case 2: arr
[j
] += 3; break;
300 case 3: arr
[j
] += 4; break;
301 case 4: arr
[j
] += 5; break;
302 default: arr
[j
] += 99;
306 for (i
= 0; i
< 1024; i
++)
307 assert (arr
[i
] == (i
% 5) + 1);
311 /* Test multiple conditions in vector-single mode. */
313 #define NUM_GANGS 4096
316 bool fizz
[NUM_GANGS
], buzz
[NUM_GANGS
], fizzbuzz
[NUM_GANGS
];
319 #pragma acc parallel copyout(fizz, buzz, fizzbuzz) \
320 num_gangs(NUM_GANGS) num_workers(1) vector_length(32)
324 /* This loop and the one following must be distributed to available gangs
325 in the same way to ensure data dependencies are not violated (hence the
326 "static" clauses). */
327 #pragma acc loop gang(static:*)
328 for (j
= 0; j
< NUM_GANGS
; j
++)
329 fizz
[j
] = buzz
[j
] = fizzbuzz
[j
] = 0;
331 #pragma acc loop gang(static:*)
332 for (j
= 0; j
< NUM_GANGS
; j
++)
334 if ((j
% 3) == 0 && (j
% 5) == 0)
340 else if ((j
% 5) == 0)
346 for (i
= 0; i
< NUM_GANGS
; i
++)
348 assert (fizzbuzz
[i
] == ((i
% 3) == 0 && (i
% 5) == 0));
349 assert (fizz
[i
] == ((i
% 3) == 0 && (i
% 5) != 0));
350 assert (buzz
[i
] == ((i
% 3) != 0 && (i
% 5) == 0));
356 /* Test worker-partitioned/vector-single mode. */
362 for (i
= 0; i
< 32 * 8; i
++)
365 #pragma acc parallel copy(arr) \
366 num_gangs(8) num_workers(8) vector_length(32)
369 #pragma acc loop gang
370 for (j
= 0; j
< 32; j
++)
373 #pragma acc loop worker
374 for (k
= 0; k
< 8; k
++)
375 arr
[j
* 8 + k
] += j
* 8 + k
;
379 for (i
= 0; i
< 32 * 8; i
++)
380 assert (arr
[i
] == i
);
384 /* Test worker-single/worker-partitioned transitions. */
388 int n
[32], arr
[32 * 32], i
;
390 for (i
= 0; i
< 32 * 32; i
++)
393 for (i
= 0; i
< 32; i
++)
396 #pragma acc parallel copy(n, arr) \
397 num_gangs(8) num_workers(16) vector_length(32)
400 #pragma acc loop gang
401 for (j
= 0; j
< 32; j
++)
407 #pragma acc loop worker
408 for (k
= 0; k
< 32; k
++)
413 #pragma acc loop worker
414 for (k
= 0; k
< 32; k
++)
419 #pragma acc loop worker
420 for (k
= 0; k
< 32; k
++)
427 for (i
= 0; i
< 32; i
++)
430 for (i
= 0; i
< 32 * 32; i
++)
431 assert (arr
[i
] == 3);
435 /* Test correct synchronisation between worker-partitioned loops. */
439 int arr_a
[32 * 32], arr_b
[32 * 32], i
;
440 int num_workers
, num_gangs
;
442 for (num_workers
= 1; num_workers
<= 32; num_workers
<<= 1)
443 for (num_gangs
= 1; num_gangs
<= 32; num_gangs
<<= 1)
445 for (i
= 0; i
< 32 * 32; i
++)
448 #pragma acc parallel copyin(arr_a) copyout(arr_b) \
449 num_gangs(num_gangs) num_workers(num_workers) vector_length(32)
452 #pragma acc loop gang
453 for (j
= 0; j
< 32; j
++)
457 #pragma acc loop worker
458 for (k
= 0; k
< 32; k
++)
459 arr_b
[j
* 32 + (31 - k
)] = arr_a
[j
* 32 + k
] * 2;
461 #pragma acc loop worker
462 for (k
= 0; k
< 32; k
++)
463 arr_a
[j
* 32 + (31 - k
)] = arr_b
[j
* 32 + k
] * 2;
465 #pragma acc loop worker
466 for (k
= 0; k
< 32; k
++)
467 arr_b
[j
* 32 + (31 - k
)] = arr_a
[j
* 32 + k
] * 2;
471 for (i
= 0; i
< 32 * 32; i
++)
472 assert (arr_b
[i
] == (i
^ 31) * 8);
477 /* Test correct synchronisation between worker+vector-partitioned loops. */
481 int arr_a
[32 * 32 * 32], arr_b
[32 * 32 * 32], i
;
482 int num_workers
, num_gangs
;
484 for (num_workers
= 1; num_workers
<= 32; num_workers
<<= 1)
485 for (num_gangs
= 1; num_gangs
<= 32; num_gangs
<<= 1)
487 for (i
= 0; i
< 32 * 32 * 32; i
++)
490 #pragma acc parallel copyin(arr_a) copyout(arr_b) \
491 num_gangs(num_gangs) num_workers(num_workers) vector_length(32)
494 #pragma acc loop gang
495 for (j
= 0; j
< 32; j
++)
499 #pragma acc loop worker vector
500 for (k
= 0; k
< 32 * 32; k
++)
501 arr_b
[j
* 32 * 32 + (1023 - k
)] = arr_a
[j
* 32 * 32 + k
] * 2;
503 #pragma acc loop worker vector
504 for (k
= 0; k
< 32 * 32; k
++)
505 arr_a
[j
* 32 * 32 + (1023 - k
)] = arr_b
[j
* 32 * 32 + k
] * 2;
507 #pragma acc loop worker vector
508 for (k
= 0; k
< 32 * 32; k
++)
509 arr_b
[j
* 32 * 32 + (1023 - k
)] = arr_a
[j
* 32 * 32 + k
] * 2;
513 for (i
= 0; i
< 32 * 32 * 32; i
++)
514 assert (arr_b
[i
] == (i
^ 1023) * 8);
519 /* Test correct synchronisation between vector-partitioned loops in
520 worker-partitioned mode. */
524 int n
[32 * 32], arr_a
[32 * 32 * 32], arr_b
[32 * 32 * 32], i
;
525 int num_workers
, num_gangs
;
527 for (num_workers
= 1; num_workers
<= 32; num_workers
<<= 1)
528 for (num_gangs
= 1; num_gangs
<= 32; num_gangs
<<= 1)
530 for (i
= 0; i
< 32 * 32 * 32; i
++)
533 for (i
= 0; i
< 32 * 32; i
++)
536 #pragma acc parallel copy (n) copyin(arr_a) copyout(arr_b) \
537 num_gangs(num_gangs) num_workers(num_workers) vector_length(32)
540 #pragma acc loop gang
541 for (j
= 0; j
< 32; j
++)
545 #pragma acc loop worker
546 for (k
= 0; k
< 32; k
++)
552 #pragma acc loop vector
553 for (m
= 0; m
< 32; m
++)
555 if (((j
* 1024 + k
* 32 + m
) % 2) == 0)
556 arr_b
[j
* 1024 + k
* 32 + (31 - m
)]
557 = arr_a
[j
* 1024 + k
* 32 + m
] * 2;
559 arr_b
[j
* 1024 + k
* 32 + (31 - m
)]
560 = arr_a
[j
* 1024 + k
* 32 + m
] * 3;
563 /* Test returning to vector-single mode... */
566 #pragma acc loop vector
567 for (m
= 0; m
< 32; m
++)
569 if (((j
* 1024 + k
* 32 + m
) % 3) == 0)
570 arr_a
[j
* 1024 + k
* 32 + (31 - m
)]
571 = arr_b
[j
* 1024 + k
* 32 + m
] * 5;
573 arr_a
[j
* 1024 + k
* 32 + (31 - m
)]
574 = arr_b
[j
* 1024 + k
* 32 + m
] * 7;
577 /* ...and back-to-back vector loops. */
579 #pragma acc loop vector
580 for (m
= 0; m
< 32; m
++)
582 if (((j
* 1024 + k
* 32 + m
) % 2) == 0)
583 arr_b
[j
* 1024 + k
* 32 + (31 - m
)]
584 = arr_a
[j
* 1024 + k
* 32 + m
] * 3;
586 arr_b
[j
* 1024 + k
* 32 + (31 - m
)]
587 = arr_a
[j
* 1024 + k
* 32 + m
] * 2;
593 for (i
= 0; i
< 32 * 32; i
++)
596 for (i
= 0; i
< 32 * 32 * 32; i
++)
598 int m
= 6 * ((i
% 3) == 0 ? 5 : 7);
599 assert (arr_b
[i
] == (i
^ 31) * m
);
605 /* With -O0, variables are on the stack, not in registers. Check that worker
606 state propagation handles the stack frame. */
621 #pragma acc parallel copy (w0, w1, w2, w3, w4, w5, w6, w7) \
622 num_gangs (1) num_workers (8)
626 #pragma acc loop worker
627 for (i
= 0; i
< 8; i
++)
631 case 0: w0
= internal
; break;
632 case 1: w1
= internal
; break;
633 case 2: w2
= internal
; break;
634 case 3: w3
= internal
; break;
635 case 4: w4
= internal
; break;
636 case 5: w5
= internal
; break;
637 case 6: w6
= internal
; break;
638 case 7: w7
= internal
; break;
656 /* Test worker-single/vector-single mode. */
662 for (i
= 0; i
< 32; i
++)
665 #pragma acc parallel copy(arr) \
666 num_gangs(8) num_workers(8) vector_length(32)
669 #pragma acc loop gang
670 for (j
= 0; j
< 32; j
++)
674 for (i
= 0; i
< 32; i
++)
675 assert (arr
[i
] == 1);
679 /* Test worker-single/vector-single mode. */
685 for (i
= 0; i
< 32; i
++)
688 #pragma acc parallel copy(arr) \
689 num_gangs(8) num_workers(8) vector_length(32)
692 #pragma acc loop gang
693 for (j
= 0; j
< 32; j
++)
700 for (i
= 0; i
< 32; i
++)
701 assert (arr
[i
] == 1);
705 /* Test condition in worker-single/vector-single mode. */
711 for (i
= 0; i
< 32; i
++)
714 #pragma acc parallel copy(arr) \
715 num_gangs(8) num_workers(8) vector_length(32)
718 #pragma acc loop gang
719 for (j
= 0; j
< 32; j
++)
720 if ((arr
[j
] % 2) != 0)
726 for (i
= 0; i
< 32; i
++)
727 assert (arr[i] == (((i % 2) != 0) ? i + 1 : i + 2)); /* ?: is parenthesised: `==' binds tighter than `?:', which would otherwise make this check vacuously true.  */
731 /* Test switch in worker-single/vector-single mode. */
737 for (i
= 0; i
< 32; i
++)
740 #pragma acc parallel copy(arr) \
741 num_gangs(8) num_workers(8) vector_length(32)
744 #pragma acc loop gang
745 for (j
= 0; j
< 32; j
++)
748 case 0: arr
[j
] += 1; break;
749 case 1: arr
[j
] += 2; break;
750 case 2: arr
[j
] += 3; break;
751 case 3: arr
[j
] += 4; break;
752 case 4: arr
[j
] += 5; break;
753 default: arr
[j
] += 99;
757 for (i
= 0; i
< 32; i
++)
758 assert (arr
[i
] == i
+ (i
% 5) + 1);
762 /* Test worker-single/vector-partitioned mode. */
768 for (i
= 0; i
< 32 * 32; i
++)
771 #pragma acc parallel copy(arr) \
772 num_gangs(8) num_workers(8) vector_length(32)
775 #pragma acc loop gang
776 for (j
= 0; j
< 32; j
++)
779 #pragma acc loop vector
780 for (k
= 0; k
< 32; k
++)
788 for (i
= 0; i
< 32 * 32; i
++)
789 assert (arr
[i
] == i
+ 1);
793 /* Test worker-single, vector-partitioned, gang-redundant mode. */
795 #define ACTUAL_GANGS 8
801 for (i
= 0; i
< 32; i
++)
806 #pragma acc parallel copy(n, arr) copyout(ondev) \
807 num_gangs(ACTUAL_GANGS) num_workers(8) vector_length(32)
811 ondev
= acc_on_device (acc_device_not_host
);
816 #pragma acc loop vector
817 for (j
= 0; j
< 32; j
++)
827 int m
= ondev
? ACTUAL_GANGS
: 1;
831 for (i
= 0; i
< 32; i
++)
832 assert (arr
[i
] == m
);
837 /* Check if worker-single variables get broadcast to vectors. */
848 float threads
[N
], v1
= 3.14;
850 for (int i
= 0; i
< N
; i
++)
853 #pragma acc parallel num_gangs (1) vector_length (32) copy (v1)
855 float val
= t28_routine ();
857 #pragma acc loop vector
858 for (int i
= 0; i
< N
; i
++)
859 threads
[i
] = val
+ v1
*i
;
862 for (int i
= 0; i
< N
; i
++)
863 assert (fabs (threads
[i
] - (t28_routine () + v1
*i
)) < 0.0001);