2 /* { dg-additional-options "-lcuda" { target openacc_nvidia_accel_selected } } */
6 #if defined ACC_DEVICE_TYPE_nvidia
14 main (int argc
, char **argv
)
16 #if defined ACC_DEVICE_TYPE_nvidia
20 int N
= 128; //1024 * 1024;
21 float *a
, *b
, *c
, *d
, *e
;
25 #if defined ACC_DEVICE_TYPE_nvidia
26 acc_init (acc_device_nvidia
);
29 nbytes
= N
* sizeof (float);
31 a
= (float *) malloc (nbytes
);
32 b
= (float *) malloc (nbytes
);
33 c
= (float *) malloc (nbytes
);
34 d
= (float *) malloc (nbytes
);
35 e
= (float *) malloc (nbytes
);
37 for (i
= 0; i
< N
; i
++)
43 #pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
46 #pragma acc parallel async
50 for (ii
= 0; ii
< N
; ii
++)
58 for (i
= 0; i
< N
; i
++)
67 for (i
= 0; i
< N
; i
++)
73 #pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
76 #pragma acc parallel async (1)
80 for (ii
= 0; ii
< N
; ii
++)
88 for (i
= 0; i
< N
; i
++)
97 for (i
= 0; i
< N
; i
++)
105 #pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
108 #pragma acc parallel async (1)
112 for (ii
= 0; ii
< N
; ii
++)
113 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
116 #pragma acc parallel async (1)
120 for (ii
= 0; ii
< N
; ii
++)
121 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
125 #pragma acc parallel async (1)
129 for (ii
= 0; ii
< N
; ii
++)
130 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
137 for (i
= 0; i
< N
; i
++)
152 for (i
= 0; i
< N
; i
++)
161 #pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
164 #pragma acc parallel async (1)
168 for (ii
= 0; ii
< N
; ii
++)
169 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
172 #pragma acc parallel async (1)
176 for (ii
= 0; ii
< N
; ii
++)
177 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
180 #pragma acc parallel async (1)
184 for (ii
= 0; ii
< N
; ii
++)
185 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
188 #pragma acc parallel wait (1) async (1)
192 for (ii
= 0; ii
< N
; ii
++)
193 e
[ii
] = a
[ii
] + b
[ii
] + c
[ii
] + d
[ii
];
200 for (i
= 0; i
< N
; i
++)
219 #if defined ACC_DEVICE_TYPE_nvidia
220 r
= cuStreamCreate (&stream1
, CU_STREAM_NON_BLOCKING
);
221 if (r
!= CUDA_SUCCESS
)
223 fprintf (stderr
, "cuStreamCreate failed: %d\n", r
);
227 acc_set_cuda_stream (1, stream1
);
230 for (i
= 0; i
< N
; i
++)
236 #pragma acc data copy (a[0:N], b[0:N]) copyin (N)
239 #pragma acc parallel async (1)
243 for (ii
= 0; ii
< N
; ii
++)
251 for (i
= 0; i
< N
; i
++)
260 for (i
= 0; i
< N
; i
++)
268 #pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
271 #pragma acc parallel async (1)
275 for (ii
= 0; ii
< N
; ii
++)
276 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
279 #pragma acc parallel async (1)
283 for (ii
= 0; ii
< N
; ii
++)
284 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
287 #pragma acc parallel async (1)
291 for (ii
= 0; ii
< N
; ii
++)
292 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
299 for (i
= 0; i
< N
; i
++)
314 for (i
= 0; i
< N
; i
++)
323 #pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
326 #pragma acc parallel async (1)
330 for (ii
= 0; ii
< N
; ii
++)
331 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
334 #pragma acc parallel async (1)
338 for (ii
= 0; ii
< N
; ii
++)
339 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
342 #pragma acc parallel async (1)
346 for (ii
= 0; ii
< N
; ii
++)
347 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
350 #pragma acc parallel wait (1) async (1)
354 for (ii
= 0; ii
< N
; ii
++)
355 e
[ii
] = a
[ii
] + b
[ii
] + c
[ii
] + d
[ii
];
362 for (i
= 0; i
< N
; i
++)
380 for (i
= 0; i
< N
; i
++)
389 #pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
392 #pragma acc parallel async (1)
396 for (ii
= 0; ii
< N
; ii
++)
397 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
400 #pragma acc parallel async (1)
404 for (ii
= 0; ii
< N
; ii
++)
405 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
408 #pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (1)
412 for (i
= 0; i
< N
; i
++)
425 for (i
= 0; i
< N
; i
++)
434 #pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
437 #pragma acc parallel async (1)
441 for (ii
= 0; ii
< N
; ii
++)
442 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
445 #pragma acc parallel async (1)
449 for (ii
= 0; ii
< N
; ii
++)
450 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
453 #pragma acc update host (a[0:N], b[0:N], c[0:N]) async (1)
459 for (i
= 0; i
< N
; i
++)
471 for (i
= 0; i
< N
; i
++)
477 #pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
480 #pragma acc kernels async
484 for (ii
= 0; ii
< N
; ii
++)
492 for (i
= 0; i
< N
; i
++)
501 for (i
= 0; i
< N
; i
++)
507 #pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
510 #pragma acc kernels async (1)
514 for (ii
= 0; ii
< N
; ii
++)
522 for (i
= 0; i
< N
; i
++)
531 for (i
= 0; i
< N
; i
++)
539 #pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
542 #pragma acc kernels async (1)
546 for (ii
= 0; ii
< N
; ii
++)
547 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
550 #pragma acc kernels async (1)
554 for (ii
= 0; ii
< N
; ii
++)
555 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
559 #pragma acc kernels async (1)
563 for (ii
= 0; ii
< N
; ii
++)
564 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
571 for (i
= 0; i
< N
; i
++)
586 for (i
= 0; i
< N
; i
++)
595 #pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
598 #pragma acc kernels async (1)
602 for (ii
= 0; ii
< N
; ii
++)
603 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
606 #pragma acc kernels async (1)
610 for (ii
= 0; ii
< N
; ii
++)
611 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
614 #pragma acc kernels async (1)
618 for (ii
= 0; ii
< N
; ii
++)
619 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
622 #pragma acc kernels wait (1) async (1)
626 for (ii
= 0; ii
< N
; ii
++)
627 e
[ii
] = a
[ii
] + b
[ii
] + c
[ii
] + d
[ii
];
634 for (i
= 0; i
< N
; i
++)
653 #if defined ACC_DEVICE_TYPE_nvidia
654 r
= cuStreamCreate (&stream1
, CU_STREAM_NON_BLOCKING
);
655 if (r
!= CUDA_SUCCESS
)
657 fprintf (stderr
, "cuStreamCreate failed: %d\n", r
);
661 acc_set_cuda_stream (1, stream1
);
664 for (i
= 0; i
< N
; i
++)
670 #pragma acc data copy (a[0:N], b[0:N]) copyin (N)
673 #pragma acc kernels async (1)
677 for (ii
= 0; ii
< N
; ii
++)
685 for (i
= 0; i
< N
; i
++)
694 for (i
= 0; i
< N
; i
++)
702 #pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
705 #pragma acc kernels async (1)
709 for (ii
= 0; ii
< N
; ii
++)
710 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
713 #pragma acc kernels async (1)
717 for (ii
= 0; ii
< N
; ii
++)
718 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
721 #pragma acc kernels async (1)
725 for (ii
= 0; ii
< N
; ii
++)
726 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
733 for (i
= 0; i
< N
; i
++)
748 for (i
= 0; i
< N
; i
++)
757 #pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
760 #pragma acc kernels async (1)
764 for (ii
= 0; ii
< N
; ii
++)
765 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
768 #pragma acc kernels async (1)
772 for (ii
= 0; ii
< N
; ii
++)
773 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
776 #pragma acc kernels async (1)
780 for (ii
= 0; ii
< N
; ii
++)
781 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
784 #pragma acc kernels wait (1) async (1)
788 for (ii
= 0; ii
< N
; ii
++)
789 e
[ii
] = a
[ii
] + b
[ii
] + c
[ii
] + d
[ii
];
796 for (i
= 0; i
< N
; i
++)
814 for (i
= 0; i
< N
; i
++)
823 #pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
826 #pragma acc kernels async (1)
830 for (ii
= 0; ii
< N
; ii
++)
831 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
834 #pragma acc kernels async (1)
838 for (ii
= 0; ii
< N
; ii
++)
839 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
842 #pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (1)
846 for (i
= 0; i
< N
; i
++)
859 for (i
= 0; i
< N
; i
++)
868 #pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
871 #pragma acc kernels async (1)
875 for (ii
= 0; ii
< N
; ii
++)
876 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
879 #pragma acc kernels async (1)
883 for (ii
= 0; ii
< N
; ii
++)
884 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
887 #pragma acc update host (a[0:N], b[0:N], c[0:N]) async (1)
893 for (i
= 0; i
< N
; i
++)
905 #if defined ACC_DEVICE_TYPE_nvidia
906 acc_shutdown (acc_device_nvidia
);