* c-c++-common/Wrestrict.c (test_strcpy_range): Revert latest change.
[official-gcc.git] / libgomp / testsuite / libgomp.oacc-c-c++-common / mode-transitions.c
blob2394ac8cbd60076814222b8fb3e6c4cb99fb8ee1
/* Miscellaneous test cases for gang/worker/vector mode transitions.  */
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <math.h>
#include <openacc.h>
/* Test basic vector-partitioned mode transitions.

   A scalar counter N is incremented once before and once after a
   vector-partitioned loop that increments each of the 32 elements of ARR.
   Afterwards N must be 2 and every element of ARR must be 1.  */

void t1()
{
  int n = 0, arr[32], i;

  for (i = 0; i < 32; i++)
    arr[i] = 0;

  #pragma acc parallel copy(n, arr) \
                       num_gangs(1) num_workers(1) vector_length(32)
  {
    int j;
    n++;
    #pragma acc loop vector
    for (j = 0; j < 32; j++)
      arr[j]++;
    n++;
  }

  assert (n == 2);

  for (i = 0; i < 32; i++)
    assert (arr[i] == 1);
}
/* Test vector-partitioned, gang-partitioned mode.

   N holds one counter per gang-loop iteration; each counter is incremented
   by a gang loop before and after a gang loop whose body is a
   vector-partitioned loop incrementing all 1024 elements of ARR.
   Afterwards every N[i] must be 2 and every ARR[i] must be 1.  */

void t2()
{
  int n[32], arr[1024], i;

  for (i = 0; i < 1024; i++)
    arr[i] = 0;

  for (i = 0; i < 32; i++)
    n[i] = 0;

  #pragma acc parallel copy(n, arr) \
                       num_gangs(32) num_workers(1) vector_length(32)
  {
    int j, k;

    #pragma acc loop gang(static:*)
    for (j = 0; j < 32; j++)
      n[j]++;

    #pragma acc loop gang
    for (j = 0; j < 32; j++)
      {
        #pragma acc loop vector
        for (k = 0; k < 32; k++)
          arr[j * 32 + k]++;
      }

    #pragma acc loop gang(static:*)
    for (j = 0; j < 32; j++)
      n[j]++;
  }

  for (i = 0; i < 32; i++)
    assert (n[i] == 2);

  for (i = 0; i < 1024; i++)
    assert (arr[i] == 1);
}
/* Test conditions inside vector-partitioned loops.

   ARR starts as ARR[i] == i.  Inside a gang loop, a vector-partitioned loop
   doubles only the odd-valued elements.  The per-iteration counters in N
   are incremented by a gang loop before and after, so each must end at 2.  */

void t4()
{
  int n[32], arr[1024], i;

  for (i = 0; i < 1024; i++)
    arr[i] = i;

  for (i = 0; i < 32; i++)
    n[i] = 0;

  #pragma acc parallel copy(n, arr) \
                       num_gangs(32) num_workers(1) vector_length(32)
  {
    int j, k;

    #pragma acc loop gang(static:*)
    for (j = 0; j < 32; j++)
      n[j]++;

    #pragma acc loop gang
    for (j = 0; j < 32; j++)
      {
        #pragma acc loop vector
        for (k = 0; k < 32; k++)
          if ((arr[j * 32 + k] % 2) != 0)
            arr[j * 32 + k] *= 2;
      }

    #pragma acc loop gang(static:*)
    for (j = 0; j < 32; j++)
      n[j]++;
  }

  for (i = 0; i < 32; i++)
    assert (n[i] == 2);

  /* Even values are untouched, odd values are doubled.  */
  for (i = 0; i < 1024; i++)
    assert (arr[i] == ((i % 2) == 0 ? i : i * 2));
}
/* Test conditions inside gang-partitioned/vector-partitioned loops.

   Same check as t4, but the conditional doubling of odd-valued elements is
   done by a single loop over all 1024 elements that is marked both gang
   and vector.  */

void t5()
{
  int n[32], arr[1024], i;

  for (i = 0; i < 1024; i++)
    arr[i] = i;

  for (i = 0; i < 32; i++)
    n[i] = 0;

  #pragma acc parallel copy(n, arr) \
                       num_gangs(32) num_workers(1) vector_length(32)
  {
    int j;

    #pragma acc loop gang(static:*)
    for (j = 0; j < 32; j++)
      n[j]++;

    #pragma acc loop gang vector
    for (j = 0; j < 1024; j++)
      if ((arr[j] % 2) != 0)
        arr[j] *= 2;

    #pragma acc loop gang(static:*)
    for (j = 0; j < 32; j++)
      n[j]++;
  }

  for (i = 0; i < 32; i++)
    assert (n[i] == 2);

  /* Even values are untouched, odd values are doubled.  */
  for (i = 0; i < 1024; i++)
    assert (arr[i] == ((i % 2) == 0 ? i : i * 2));
}
/* Test trivial operation of vector-single mode.

   A single increment of N inside a one-gang, one-worker parallel region
   with no loop directive; N must be exactly 1 afterwards.  */

void t7()
{
  int n = 0;
  #pragma acc parallel copy(n) \
                       num_gangs(1) num_workers(1) vector_length(32)
  {
    n++;
  }
  assert (n == 1);
}
/* Test vector-single, gang-partitioned mode.

   For every power-of-two gang count from 1 to 1024, a gang-partitioned loop
   increments each of the 1024 elements of ARR once; every element must then
   be 1.  */

void t8()
{
  int arr[1024];
  int gangs;

  for (gangs = 1; gangs <= 1024; gangs <<= 1)
    {
      int i;

      /* Reset the array for each gang count.  */
      for (i = 0; i < 1024; i++)
        arr[i] = 0;

      #pragma acc parallel copy(arr) \
                           num_gangs(gangs) num_workers(1) vector_length(32)
      {
        int j;
        #pragma acc loop gang
        for (j = 0; j < 1024; j++)
          arr[j]++;
      }

      for (i = 0; i < 1024; i++)
        assert (arr[i] == 1);
    }
}
/* Test conditions in vector-single mode.

   For every power-of-two gang count from 1 to 1024, a gang-partitioned loop
   adds 1 to elements whose index is a multiple of 3 and 2 to all others.
   ARR starts zeroed, so the expected values are 1 and 2 respectively.  */

void t9()
{
  int arr[1024];
  int gangs;

  for (gangs = 1; gangs <= 1024; gangs <<= 1)
    {
      int i;

      /* Reset the array for each gang count.  */
      for (i = 0; i < 1024; i++)
        arr[i] = 0;

      #pragma acc parallel copy(arr) \
                           num_gangs(gangs) num_workers(1) vector_length(32)
      {
        int j;
        #pragma acc loop gang
        for (j = 0; j < 1024; j++)
          if ((j % 3) == 0)
            arr[j]++;
          else
            arr[j] += 2;
      }

      /* The conditional expression must be parenthesised as a whole:
         "a == c ? 1 : 2" parses as "(a == c) ? 1 : 2", which is always
         nonzero and would make the assertion vacuous.  */
      for (i = 0; i < 1024; i++)
        assert (arr[i] == (((i % 3) == 0) ? 1 : 2));
    }
}
/* Test switch in vector-single mode.

   For every power-of-two gang count from 1 to 1024, a gang-partitioned loop
   adds (j % 5) + 1 to the zeroed element ARR[j] via a switch statement.
   The default case adds 99 and is not expected to be taken.  */

void t10()
{
  int arr[1024];
  int gangs;

  for (gangs = 1; gangs <= 1024; gangs <<= 1)
    {
      int i;

      /* Reset the array for each gang count.  */
      for (i = 0; i < 1024; i++)
        arr[i] = 0;

      #pragma acc parallel copy(arr) \
                           num_gangs(gangs) num_workers(1) vector_length(32)
      {
        int j;
        #pragma acc loop gang
        for (j = 0; j < 1024; j++)
          switch (j % 5)
            {
            case 0: arr[j] += 1; break;
            case 1: arr[j] += 2; break;
            case 2: arr[j] += 3; break;
            case 3: arr[j] += 4; break;
            case 4: arr[j] += 5; break;
            default: arr[j] += 99;
            }
      }

      for (i = 0; i < 1024; i++)
        assert (arr[i] == (i % 5) + 1);
    }
}
/* Test switch in vector-single mode, initialise array on device.

   ARR is filled with 99 on the host, then zeroed inside the parallel region
   by one gang loop and updated by a second gang loop that adds (j % 5) + 1
   via a switch.  The final values must not depend on the host-side 99.  */

void t11()
{
  int arr[1024];
  int i;

  for (i = 0; i < 1024; i++)
    arr[i] = 99;

  #pragma acc parallel copy(arr) \
                       num_gangs(1024) num_workers(1) vector_length(32)
  {
    int j;

    /* This loop and the one following must be distributed to available gangs
       in the same way to ensure data dependencies are not violated (hence the
       "static" clauses).  */
    #pragma acc loop gang(static:*)
    for (j = 0; j < 1024; j++)
      arr[j] = 0;

    #pragma acc loop gang(static:*)
    for (j = 0; j < 1024; j++)
      switch (j % 5)
        {
        case 0: arr[j] += 1; break;
        case 1: arr[j] += 2; break;
        case 2: arr[j] += 3; break;
        case 3: arr[j] += 4; break;
        case 4: arr[j] += 5; break;
        default: arr[j] += 99;
        }
  }

  for (i = 0; i < 1024; i++)
    assert (arr[i] == (i % 5) + 1);
}
/* Test multiple conditions in vector-single mode.

   Three flag arrays are cleared inside the parallel region and then set by
   nested conditions: FIZZBUZZ for indices divisible by both 3 and 5, FIZZ
   for those divisible by 3 only, BUZZ for those divisible by 5 only.  */

#define NUM_GANGS 4096
void t12()
{
  bool fizz[NUM_GANGS], buzz[NUM_GANGS], fizzbuzz[NUM_GANGS];
  int i;

  #pragma acc parallel copyout(fizz, buzz, fizzbuzz) \
                       num_gangs(NUM_GANGS) num_workers(1) vector_length(32)
  {
    int j;

    /* This loop and the one following must be distributed to available gangs
       in the same way to ensure data dependencies are not violated (hence the
       "static" clauses).  */
    #pragma acc loop gang(static:*)
    for (j = 0; j < NUM_GANGS; j++)
      fizz[j] = buzz[j] = fizzbuzz[j] = 0;

    #pragma acc loop gang(static:*)
    for (j = 0; j < NUM_GANGS; j++)
      {
        if ((j % 3) == 0 && (j % 5) == 0)
          fizzbuzz[j] = 1;
        else
          {
            if ((j % 3) == 0)
              fizz[j] = 1;
            else if ((j % 5) == 0)
              buzz[j] = 1;
          }
      }
  }

  for (i = 0; i < NUM_GANGS; i++)
    {
      assert (fizzbuzz[i] == ((i % 3) == 0 && (i % 5) == 0));
      assert (fizz[i] == ((i % 3) == 0 && (i % 5) != 0));
      assert (buzz[i] == ((i % 3) != 0 && (i % 5) == 0));
    }
}
#undef NUM_GANGS
/* Test worker-partitioned/vector-single mode.

   A gang loop of 32 iterations contains a worker loop of 8 iterations; each
   inner iteration adds its flat index j * 8 + k to the matching zeroed
   element, so ARR[i] must equal i afterwards.  */

void t13()
{
  int arr[32 * 8], i;

  for (i = 0; i < 32 * 8; i++)
    arr[i] = 0;

  #pragma acc parallel copy(arr) \
                       num_gangs(8) num_workers(8) vector_length(32)
  {
    int j;
    #pragma acc loop gang
    for (j = 0; j < 32; j++)
      {
        int k;
        #pragma acc loop worker
        for (k = 0; k < 8; k++)
          arr[j * 8 + k] += j * 8 + k;
      }
  }

  for (i = 0; i < 32 * 8; i++)
    assert (arr[i] == i);
}
/* Test worker-single/worker-partitioned transitions.

   Inside each gang-loop iteration, the counter N[j] is incremented four
   times, interleaved with three worker-partitioned loops that each
   increment the 32 elements of row j of ARR.  Expected result: N[i] == 4
   and ARR[i] == 3.  */

void t16()
{
  int n[32], arr[32 * 32], i;

  for (i = 0; i < 32 * 32; i++)
    arr[i] = 0;

  for (i = 0; i < 32; i++)
    n[i] = 0;

  #pragma acc parallel copy(n, arr) \
                       num_gangs(8) num_workers(16) vector_length(32)
  {
    int j;
    #pragma acc loop gang
    for (j = 0; j < 32; j++)
      {
        int k;

        n[j]++;

        #pragma acc loop worker
        for (k = 0; k < 32; k++)
          arr[j * 32 + k]++;

        n[j]++;

        #pragma acc loop worker
        for (k = 0; k < 32; k++)
          arr[j * 32 + k]++;

        n[j]++;

        #pragma acc loop worker
        for (k = 0; k < 32; k++)
          arr[j * 32 + k]++;

        n[j]++;
      }
  }

  for (i = 0; i < 32; i++)
    assert (n[i] == 4);

  for (i = 0; i < 32 * 32; i++)
    assert (arr[i] == 3);
}
/* Test correct synchronisation between worker-partitioned loops.

   For every power-of-two combination of worker and gang counts up to 32,
   three consecutive worker loops copy a 32-element row back and forth
   between ARR_A and ARR_B, each time reversing the row and doubling the
   values.  Each loop reads what the previous one wrote.  */

void t17()
{
  int arr_a[32 * 32], arr_b[32 * 32], i;
  int num_workers, num_gangs;

  for (num_workers = 1; num_workers <= 32; num_workers <<= 1)
    for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1)
      {
        for (i = 0; i < 32 * 32; i++)
          arr_a[i] = i;

        #pragma acc parallel copyin(arr_a) copyout(arr_b) \
                num_gangs(num_gangs) num_workers(num_workers) vector_length(32)
        {
          int j;
          #pragma acc loop gang
          for (j = 0; j < 32; j++)
            {
              int k;

              #pragma acc loop worker
              for (k = 0; k < 32; k++)
                arr_b[j * 32 + (31 - k)] = arr_a[j * 32 + k] * 2;

              #pragma acc loop worker
              for (k = 0; k < 32; k++)
                arr_a[j * 32 + (31 - k)] = arr_b[j * 32 + k] * 2;

              #pragma acc loop worker
              for (k = 0; k < 32; k++)
                arr_b[j * 32 + (31 - k)] = arr_a[j * 32 + k] * 2;
            }
        }

        /* Three reversals within a row leave the row reversed once
           (index i ^ 31); three doublings multiply by 8.  */
        for (i = 0; i < 32 * 32; i++)
          assert (arr_b[i] == (i ^ 31) * 8);
      }
}
/* Test correct synchronisation between worker+vector-partitioned loops.

   Same scheme as t17, but each of the three consecutive loops is marked
   both worker and vector and covers a 1024-element block, reversing the
   block and doubling the values on each pass.  */

void t18()
{
  int arr_a[32 * 32 * 32], arr_b[32 * 32 * 32], i;
  int num_workers, num_gangs;

  for (num_workers = 1; num_workers <= 32; num_workers <<= 1)
    for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1)
      {
        for (i = 0; i < 32 * 32 * 32; i++)
          arr_a[i] = i;

        #pragma acc parallel copyin(arr_a) copyout(arr_b) \
                num_gangs(num_gangs) num_workers(num_workers) vector_length(32)
        {
          int j;
          #pragma acc loop gang
          for (j = 0; j < 32; j++)
            {
              int k;

              #pragma acc loop worker vector
              for (k = 0; k < 32 * 32; k++)
                arr_b[j * 32 * 32 + (1023 - k)] = arr_a[j * 32 * 32 + k] * 2;

              #pragma acc loop worker vector
              for (k = 0; k < 32 * 32; k++)
                arr_a[j * 32 * 32 + (1023 - k)] = arr_b[j * 32 * 32 + k] * 2;

              #pragma acc loop worker vector
              for (k = 0; k < 32 * 32; k++)
                arr_b[j * 32 * 32 + (1023 - k)] = arr_a[j * 32 * 32 + k] * 2;
            }
        }

        /* Three reversals within a block leave the block reversed once
           (index i ^ 1023); three doublings multiply by 8.  */
        for (i = 0; i < 32 * 32 * 32; i++)
          assert (arr_b[i] == (i ^ 1023) * 8);
      }
}
/* Test correct synchronisation between vector-partitioned loops in
   worker-partitioned mode.

   For every power-of-two combination of worker and gang counts up to 32,
   each (gang, worker) iteration bumps its counter in N twice and runs three
   vector loops over its own 32-element row.  Every vector loop reverses the
   row while copying between ARR_A and ARR_B and scales each value by a
   factor chosen from the source index.  */

void t19()
{
  int n[32 * 32], arr_a[32 * 32 * 32], arr_b[32 * 32 * 32], i;
  int num_workers, num_gangs;

  for (num_workers = 1; num_workers <= 32; num_workers <<= 1)
    for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1)
      {
        for (i = 0; i < 32 * 32 * 32; i++)
          arr_a[i] = i;

        for (i = 0; i < 32 * 32; i++)
          n[i] = 0;

        #pragma acc parallel copy (n) copyin(arr_a) copyout(arr_b) \
                num_gangs(num_gangs) num_workers(num_workers) vector_length(32)
        {
          int j;
          #pragma acc loop gang
          for (j = 0; j < 32; j++)
            {
              int k;

              #pragma acc loop worker
              for (k = 0; k < 32; k++)
                {
                  int m;

                  n[j * 32 + k]++;

                  #pragma acc loop vector
                  for (m = 0; m < 32; m++)
                    {
                      if (((j * 1024 + k * 32 + m) % 2) == 0)
                        arr_b[j * 1024 + k * 32 + (31 - m)]
                          = arr_a[j * 1024 + k * 32 + m] * 2;
                      else
                        arr_b[j * 1024 + k * 32 + (31 - m)]
                          = arr_a[j * 1024 + k * 32 + m] * 3;
                    }

                  /* Test returning to vector-single mode...  */
                  n[j * 32 + k]++;

                  #pragma acc loop vector
                  for (m = 0; m < 32; m++)
                    {
                      if (((j * 1024 + k * 32 + m) % 3) == 0)
                        arr_a[j * 1024 + k * 32 + (31 - m)]
                          = arr_b[j * 1024 + k * 32 + m] * 5;
                      else
                        arr_a[j * 1024 + k * 32 + (31 - m)]
                          = arr_b[j * 1024 + k * 32 + m] * 7;
                    }

                  /* ...and back-to-back vector loops.  */

                  #pragma acc loop vector
                  for (m = 0; m < 32; m++)
                    {
                      if (((j * 1024 + k * 32 + m) % 2) == 0)
                        arr_b[j * 1024 + k * 32 + (31 - m)]
                          = arr_a[j * 1024 + k * 32 + m] * 3;
                      else
                        arr_b[j * 1024 + k * 32 + (31 - m)]
                          = arr_a[j * 1024 + k * 32 + m] * 2;
                    }
                }
            }
        }

        for (i = 0; i < 32 * 32; i++)
          assert (n[i] == 2);

        /* The first and third passes together always contribute 2 * 3 = 6
           (their parity tests see opposite parities of the same value); the
           second pass contributes 5 or 7 depending on i % 3.  The odd number
           of reversals leaves each row reversed (index i ^ 31).  */
        for (i = 0; i < 32 * 32 * 32; i++)
          {
            int m = 6 * ((i % 3) == 0 ? 5 : 7);
            assert (arr_b[i] == (i ^ 31) * m);
          }
      }
}
/* With -O0, variables are on the stack, not in registers.  Check that worker
   state propagation handles the stack frame.

   INTERNAL is set to 100 before the worker loop; each of the eight loop
   iterations stores it into a different one of W0..W7.  The function aborts
   unless all eight end up as 100.  */

void t20()
{
  int w0 = 0;
  int w1 = 0;
  int w2 = 0;
  int w3 = 0;
  int w4 = 0;
  int w5 = 0;
  int w6 = 0;
  int w7 = 0;

  int i;

  #pragma acc parallel copy (w0, w1, w2, w3, w4, w5, w6, w7) \
                       num_gangs (1) num_workers (8)
  {
    int internal = 100;

    #pragma acc loop worker
    for (i = 0; i < 8; i++)
      {
        switch (i)
          {
          case 0: w0 = internal; break;
          case 1: w1 = internal; break;
          case 2: w2 = internal; break;
          case 3: w3 = internal; break;
          case 4: w4 = internal; break;
          case 5: w5 = internal; break;
          case 6: w6 = internal; break;
          case 7: w7 = internal; break;
          default: break;
          }
      }
  }

  if (w0 != 100
      || w1 != 100
      || w2 != 100
      || w3 != 100
      || w4 != 100
      || w5 != 100
      || w6 != 100
      || w7 != 100)
    __builtin_abort ();
}
/* Test worker-single/vector-single mode.

   A gang-partitioned loop increments each of the 32 zeroed elements of ARR
   once, in a region that requests 8 workers and a vector length of 32 but
   uses no worker or vector loop.  */

void t21()
{
  int arr[32], i;

  for (i = 0; i < 32; i++)
    arr[i] = 0;

  #pragma acc parallel copy(arr) \
                       num_gangs(8) num_workers(8) vector_length(32)
  {
    int j;
    #pragma acc loop gang
    for (j = 0; j < 32; j++)
      arr[j]++;
  }

  for (i = 0; i < 32; i++)
    assert (arr[i] == 1);
}
/* Test worker-single/vector-single mode with an atomic update.

   Same shape as t21, except that the increment inside the gang loop is
   marked with "#pragma acc atomic".  */

void t22()
{
  int arr[32], i;

  for (i = 0; i < 32; i++)
    arr[i] = 0;

  #pragma acc parallel copy(arr) \
                       num_gangs(8) num_workers(8) vector_length(32)
  {
    int j;
    #pragma acc loop gang
    for (j = 0; j < 32; j++)
      {
        #pragma acc atomic
        arr[j]++;
      }
  }

  for (i = 0; i < 32; i++)
    assert (arr[i] == 1);
}
/* Test condition in worker-single/vector-single mode.

   ARR starts as ARR[i] == i.  A gang-partitioned loop adds 1 to odd-valued
   elements and 2 to even-valued ones, so the expected result is i + 1 for
   odd i and i + 2 for even i.  */

void t23()
{
  int arr[32], i;

  for (i = 0; i < 32; i++)
    arr[i] = i;

  #pragma acc parallel copy(arr) \
                       num_gangs(8) num_workers(8) vector_length(32)
  {
    int j;
    #pragma acc loop gang
    for (j = 0; j < 32; j++)
      if ((arr[j] % 2) != 0)
        arr[j]++;
      else
        arr[j] += 2;
  }

  /* The conditional expression must be parenthesised as a whole:
     "a == c ? x : y" parses as "(a == c) ? x : y", which here is always
     nonzero and would make the assertion vacuous.  */
  for (i = 0; i < 32; i++)
    assert (arr[i] == (((i % 2) != 0) ? i + 1 : i + 2));
}
/* Test switch in worker-single/vector-single mode.

   ARR starts as ARR[i] == i.  A gang-partitioned loop switches on
   ARR[j] % 5 and adds that remainder plus one, so the expected result is
   i + (i % 5) + 1.  The default case adds 99 and is not expected to be
   taken.  */

void t24()
{
  int arr[32], i;

  for (i = 0; i < 32; i++)
    arr[i] = i;

  #pragma acc parallel copy(arr) \
                       num_gangs(8) num_workers(8) vector_length(32)
  {
    int j;
    #pragma acc loop gang
    for (j = 0; j < 32; j++)
      switch (arr[j] % 5)
        {
        case 0: arr[j] += 1; break;
        case 1: arr[j] += 2; break;
        case 2: arr[j] += 3; break;
        case 3: arr[j] += 4; break;
        case 4: arr[j] += 5; break;
        default: arr[j] += 99;
        }
  }

  for (i = 0; i < 32; i++)
    assert (arr[i] == i + (i % 5) + 1);
}
/* Test worker-single/vector-partitioned mode.

   ARR starts as ARR[i] == i.  A gang loop contains a vector-partitioned
   loop that atomically increments each element of its 32-element row, so
   ARR[i] must equal i + 1 afterwards.  */

void t25()
{
  int arr[32 * 32], i;

  for (i = 0; i < 32 * 32; i++)
    arr[i] = i;

  #pragma acc parallel copy(arr) \
                       num_gangs(8) num_workers(8) vector_length(32)
  {
    int j;
    #pragma acc loop gang
    for (j = 0; j < 32; j++)
      {
        int k;
        #pragma acc loop vector
        for (k = 0; k < 32; k++)
          {
            #pragma acc atomic
            arr[j * 32 + k]++;
          }
      }
  }

  for (i = 0; i < 32 * 32; i++)
    assert (arr[i] == i + 1);
}
793 /* Test worker-single, vector-partitioned, gang-redundant mode. */
795 #define ACTUAL_GANGS 8
796 void t27()
798 int n, arr[32], i;
799 int ondev;
801 for (i = 0; i < 32; i++)
802 arr[i] = 0;
804 n = 0;
806 #pragma acc parallel copy(n, arr) copyout(ondev) \
807 num_gangs(ACTUAL_GANGS) num_workers(8) vector_length(32)
809 int j;
811 ondev = acc_on_device (acc_device_not_host);
813 #pragma acc atomic
814 n++;
816 #pragma acc loop vector
817 for (j = 0; j < 32; j++)
819 #pragma acc atomic
820 arr[j] += 1;
823 #pragma acc atomic
824 n++;
827 int m = ondev ? ACTUAL_GANGS : 1;
829 assert (n == m * 2);
831 for (i = 0; i < 32; i++)
832 assert (arr[i] == m);
834 #undef ACTUAL_GANGS
/* Check if worker-single variables get broadcast to vectors.

   VAL is computed once by calling t28_routine before the vector loop, and
   V1 is copied into the region; both are then read by every iteration of
   the vector loop.  The host recomputes the same expression and compares
   with a tolerance of 0.0001.  */

#pragma acc routine
float t28_routine ()
{
  return 2.71;
}

#define N 32
void t28()
{
  float threads[N], v1 = 3.14;

  /* Sentinel values that the parallel region is expected to overwrite.  */
  for (int i = 0; i < N; i++)
    threads[i] = -1;

  #pragma acc parallel num_gangs (1) vector_length (32) copy (v1)
  {
    float val = t28_routine ();

    #pragma acc loop vector
    for (int i = 0; i < N; i++)
      threads[i] = val + v1*i;
  }

  for (int i = 0; i < N; i++)
    assert (fabs (threads[i] - (t28_routine () + v1*i)) < 0.0001);
}
#undef N
/* Run every test case in order.  Each test reports failure through assert
   or __builtin_abort, so reaching the return means all of them passed.  */

int main()
{
  t1();
  t2();
  t4();
  t5();
  t7();
  t8();
  t9();
  t10();
  t11();
  t12();
  t13();
  t16();
  t17();
  t18();
  t19();
  t20();
  t21();
  t22();
  t23();
  t24();
  t25();
  t27();
  t28();

  return 0;
}