1 /* { dg-do run { target openacc_nvidia_accel_selected } } */
2 /* { dg-additional-options "-lcuda -lcublas -lcudart" } */
7 #include <cuda_runtime_api.h>
12 saxpy (int n
, float a
, float *x
, float *y
)
16 for (i
= 0; i
< n
; i
++)
18 y
[i
] = a
* x
[i
] + y
[i
];
23 context_check (CUcontext ctx1
)
28 r
= cuCtxGetCurrent (&ctx2
);
29 if (r
!= CUDA_SUCCESS
)
31 fprintf (stderr
, "cuCtxGetCurrent failed: %d\n", r
);
37 fprintf (stderr
, "new context established\n");
41 ctx3
= (CUcontext
) acc_get_current_cuda_context ();
45 fprintf (stderr
, "acc_get_current_cuda_context returned wrong value\n");
53 main (int argc
, char **argv
)
63 float *h_X
, *h_Y1
, *h_Y2
;
69 /* Test 2 - cuBLAS creates, OpenACC shares. */
71 s
= cublasCreate (&h
);
72 if (s
!= CUBLAS_STATUS_SUCCESS
)
74 fprintf (stderr
, "cublasCreate failed: %d\n", s
);
78 r
= cuCtxGetCurrent (&pctx
);
79 if (r
!= CUDA_SUCCESS
)
81 fprintf (stderr
, "cuCtxGetCurrent failed: %d\n", r
);
85 e
= cudaGetDevice (&dev
);
88 fprintf (stderr
, "cudaGetDevice failed: %d\n", e
);
92 acc_set_device_num (dev
, acc_device_nvidia
);
94 h_X
= (float *) malloc (N
* sizeof (float));
97 fprintf (stderr
, "malloc failed: for h_X\n");
101 h_Y1
= (float *) malloc (N
* sizeof (float));
104 fprintf (stderr
, "malloc failed: for h_Y1\n");
108 h_Y2
= (float *) malloc (N
* sizeof (float));
111 fprintf (stderr
, "malloc failed: for h_Y2\n");
115 for (i
= 0; i
< N
; i
++)
117 h_X
[i
] = rand () / (float) RAND_MAX
;
118 h_Y2
[i
] = h_Y1
[i
] = rand () / (float) RAND_MAX
;
121 d_X
= (float *) acc_copyin (&h_X
[0], N
* sizeof (float));
124 fprintf (stderr
, "copyin error h_X\n");
128 context_check (pctx
);
130 d_Y
= (float *) acc_copyin (&h_Y1
[0], N
* sizeof (float));
133 fprintf (stderr
, "copyin error h_Y1\n");
137 context_check (pctx
);
139 s
= cublasSaxpy (h
, N
, &alpha
, d_X
, 1, d_Y
, 1);
140 if (s
!= CUBLAS_STATUS_SUCCESS
)
142 fprintf (stderr
, "cublasSaxpy failed: %d\n", s
);
146 context_check (pctx
);
148 acc_memcpy_from_device (&h_Y1
[0], d_Y
, N
* sizeof (float));
150 context_check (pctx
);
152 #pragma acc parallel copyin (h_X[0:N]), copy (h_Y2[0:N]) copyin (alpha)
156 for (i
= 0; i
< N
; i
++)
158 h_Y2
[i
] = alpha
* h_X
[i
] + h_Y2
[i
];
162 context_check (pctx
);
167 for (i
= 0; i
< N
; ++i
)
171 diff
= h_Y1
[i
] - h_Y2
[i
];
172 error_norm
+= diff
* diff
;
173 ref_norm
+= h_Y2
[i
] * h_Y2
[i
];
176 error_norm
= (float) sqrt ((double) error_norm
);
177 ref_norm
= (float) sqrt ((double) ref_norm
);
179 if ((fabs (ref_norm
) < 1e-7) || ((error_norm
/ ref_norm
) >= 1e-6f
))
181 fprintf (stderr
, "math error\n");
192 context_check (pctx
);
194 s
= cublasDestroy (h
);
195 if (s
!= CUBLAS_STATUS_SUCCESS
)
197 fprintf (stderr
, "cublasDestroy failed: %d\n", s
);
201 acc_shutdown (acc_device_nvidia
);
203 r
= cuCtxGetCurrent (&ctx
);
204 if (r
!= CUDA_SUCCESS
)
206 fprintf (stderr
, "cuCtxGetCurrent failed: %d\n", r
);
212 fprintf (stderr
, "Expected context\n");
218 fprintf (stderr
, "Unexpected new context\n");