1 /* { dg-do run { target openacc_nvidia_accel_selected } } */
2 /* { dg-additional-options "-lcuda" } */
12 main (int argc
, char **argv
)
16 int N
= 128; //1024 * 1024;
17 float *a
, *b
, *c
, *d
, *e
;
21 acc_init (acc_device_nvidia
);
23 nbytes
= N
* sizeof (float);
25 a
= (float *) malloc (nbytes
);
26 b
= (float *) malloc (nbytes
);
27 c
= (float *) malloc (nbytes
);
28 d
= (float *) malloc (nbytes
);
29 e
= (float *) malloc (nbytes
);
31 for (i
= 0; i
< N
; i
++)
37 #pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
40 #pragma acc parallel async
44 for (ii
= 0; ii
< N
; ii
++)
52 for (i
= 0; i
< N
; i
++)
61 for (i
= 0; i
< N
; i
++)
67 #pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
70 #pragma acc parallel async (1)
74 for (ii
= 0; ii
< N
; ii
++)
82 for (i
= 0; i
< N
; i
++)
91 for (i
= 0; i
< N
; i
++)
99 #pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
102 #pragma acc parallel async (1)
106 for (ii
= 0; ii
< N
; ii
++)
107 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
110 #pragma acc parallel async (1)
114 for (ii
= 0; ii
< N
; ii
++)
115 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
119 #pragma acc parallel async (1)
123 for (ii
= 0; ii
< N
; ii
++)
124 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
131 for (i
= 0; i
< N
; i
++)
146 for (i
= 0; i
< N
; i
++)
155 #pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
158 #pragma acc parallel async (1)
162 for (ii
= 0; ii
< N
; ii
++)
163 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
166 #pragma acc parallel async (1)
170 for (ii
= 0; ii
< N
; ii
++)
171 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
174 #pragma acc parallel async (1)
178 for (ii
= 0; ii
< N
; ii
++)
179 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
182 #pragma acc parallel wait (1) async (1)
186 for (ii
= 0; ii
< N
; ii
++)
187 e
[ii
] = a
[ii
] + b
[ii
] + c
[ii
] + d
[ii
];
194 for (i
= 0; i
< N
; i
++)
213 r
= cuStreamCreate (&stream1
, CU_STREAM_NON_BLOCKING
);
214 if (r
!= CUDA_SUCCESS
)
216 fprintf (stderr
, "cuStreamCreate failed: %d\n", r
);
220 acc_set_cuda_stream (1, stream1
);
222 for (i
= 0; i
< N
; i
++)
228 #pragma acc data copy (a[0:N], b[0:N]) copyin (N)
231 #pragma acc parallel async (1)
235 for (ii
= 0; ii
< N
; ii
++)
243 for (i
= 0; i
< N
; i
++)
252 for (i
= 0; i
< N
; i
++)
260 #pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
263 #pragma acc parallel async (1)
267 for (ii
= 0; ii
< N
; ii
++)
268 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
271 #pragma acc parallel async (1)
275 for (ii
= 0; ii
< N
; ii
++)
276 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
279 #pragma acc parallel async (1)
283 for (ii
= 0; ii
< N
; ii
++)
284 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
291 for (i
= 0; i
< N
; i
++)
306 for (i
= 0; i
< N
; i
++)
315 #pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
318 #pragma acc parallel async (1)
322 for (ii
= 0; ii
< N
; ii
++)
323 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
326 #pragma acc parallel async (1)
330 for (ii
= 0; ii
< N
; ii
++)
331 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
334 #pragma acc parallel async (1)
338 for (ii
= 0; ii
< N
; ii
++)
339 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
342 #pragma acc parallel wait (1) async (1)
346 for (ii
= 0; ii
< N
; ii
++)
347 e
[ii
] = a
[ii
] + b
[ii
] + c
[ii
] + d
[ii
];
354 for (i
= 0; i
< N
; i
++)
372 for (i
= 0; i
< N
; i
++)
381 #pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
384 #pragma acc parallel async (1)
388 for (ii
= 0; ii
< N
; ii
++)
389 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
392 #pragma acc parallel async (1)
396 for (ii
= 0; ii
< N
; ii
++)
397 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
400 #pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (1)
404 for (i
= 0; i
< N
; i
++)
417 for (i
= 0; i
< N
; i
++)
426 #pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
429 #pragma acc parallel async (1)
433 for (ii
= 0; ii
< N
; ii
++)
434 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
437 #pragma acc parallel async (1)
441 for (ii
= 0; ii
< N
; ii
++)
442 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
445 #pragma acc update host (a[0:N], b[0:N], c[0:N]) async (1)
451 for (i
= 0; i
< N
; i
++)
463 for (i
= 0; i
< N
; i
++)
469 #pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
472 #pragma acc kernels async
476 for (ii
= 0; ii
< N
; ii
++)
484 for (i
= 0; i
< N
; i
++)
493 for (i
= 0; i
< N
; i
++)
499 #pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N)
502 #pragma acc kernels async (1)
506 for (ii
= 0; ii
< N
; ii
++)
514 for (i
= 0; i
< N
; i
++)
523 for (i
= 0; i
< N
; i
++)
531 #pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
534 #pragma acc kernels async (1)
538 for (ii
= 0; ii
< N
; ii
++)
539 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
542 #pragma acc kernels async (1)
546 for (ii
= 0; ii
< N
; ii
++)
547 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
551 #pragma acc kernels async (1)
555 for (ii
= 0; ii
< N
; ii
++)
556 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
563 for (i
= 0; i
< N
; i
++)
578 for (i
= 0; i
< N
; i
++)
587 #pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
590 #pragma acc kernels async (1)
594 for (ii
= 0; ii
< N
; ii
++)
595 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
598 #pragma acc kernels async (1)
602 for (ii
= 0; ii
< N
; ii
++)
603 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
606 #pragma acc kernels async (1)
610 for (ii
= 0; ii
< N
; ii
++)
611 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
614 #pragma acc kernels wait (1) async (1)
618 for (ii
= 0; ii
< N
; ii
++)
619 e
[ii
] = a
[ii
] + b
[ii
] + c
[ii
] + d
[ii
];
626 for (i
= 0; i
< N
; i
++)
645 r
= cuStreamCreate (&stream1
, CU_STREAM_NON_BLOCKING
);
646 if (r
!= CUDA_SUCCESS
)
648 fprintf (stderr
, "cuStreamCreate failed: %d\n", r
);
652 acc_set_cuda_stream (1, stream1
);
654 for (i
= 0; i
< N
; i
++)
660 #pragma acc data copy (a[0:N], b[0:N]) copyin (N)
663 #pragma acc kernels async (1)
667 for (ii
= 0; ii
< N
; ii
++)
675 for (i
= 0; i
< N
; i
++)
684 for (i
= 0; i
< N
; i
++)
692 #pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N)
695 #pragma acc kernels async (1)
699 for (ii
= 0; ii
< N
; ii
++)
700 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
703 #pragma acc kernels async (1)
707 for (ii
= 0; ii
< N
; ii
++)
708 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
711 #pragma acc kernels async (1)
715 for (ii
= 0; ii
< N
; ii
++)
716 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
723 for (i
= 0; i
< N
; i
++)
738 for (i
= 0; i
< N
; i
++)
747 #pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N)
750 #pragma acc kernels async (1)
754 for (ii
= 0; ii
< N
; ii
++)
755 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
758 #pragma acc kernels async (1)
762 for (ii
= 0; ii
< N
; ii
++)
763 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
766 #pragma acc kernels async (1)
770 for (ii
= 0; ii
< N
; ii
++)
771 d
[ii
] = ((a
[ii
] * a
[ii
] + a
[ii
]) / a
[ii
]) - a
[ii
];
774 #pragma acc kernels wait (1) async (1)
778 for (ii
= 0; ii
< N
; ii
++)
779 e
[ii
] = a
[ii
] + b
[ii
] + c
[ii
] + d
[ii
];
786 for (i
= 0; i
< N
; i
++)
804 for (i
= 0; i
< N
; i
++)
813 #pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
816 #pragma acc kernels async (1)
820 for (ii
= 0; ii
< N
; ii
++)
821 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
824 #pragma acc kernels async (1)
828 for (ii
= 0; ii
< N
; ii
++)
829 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
832 #pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (1)
836 for (i
= 0; i
< N
; i
++)
849 for (i
= 0; i
< N
; i
++)
858 #pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N)
861 #pragma acc kernels async (1)
865 for (ii
= 0; ii
< N
; ii
++)
866 b
[ii
] = (a
[ii
] * a
[ii
] * a
[ii
]) / a
[ii
];
869 #pragma acc kernels async (1)
873 for (ii
= 0; ii
< N
; ii
++)
874 c
[ii
] = (a
[ii
] + a
[ii
] + a
[ii
] + a
[ii
]) / a
[ii
];
877 #pragma acc update host (a[0:N], b[0:N], c[0:N]) async (1)
883 for (i
= 0; i
< N
; i
++)
895 acc_shutdown (acc_device_nvidia
);