1 /* { dg-do run { target openacc_nvidia_accel_selected } } */
2 /* { dg-additional-options "-lcuda -lcublas -lcudart" } */
3 /* { dg-require-effective-target openacc_cublas } */
4 /* { dg-require-effective-target openacc_cudart } */
9 #include <cuda_runtime_api.h>
10 #include <cublas_v2.h>
/* Host-side SAXPY: for every index i in [0, n) store a * x[i] + y[i]
   back into y[i].  x is only read; y is updated in place.
   NOTE(review): the return type, the braces and the declaration of i are
   not visible in this excerpt.  */
14 saxpy (int n
, float a
, float *x
, float *y
)
/* Visit each element index from 0 up to, but not including, n.  */
18 for (i
= 0; i
< n
; i
++)
/* The old value of y[i] is an input to the new one.  */
20 y
[i
] = a
* x
[i
] + y
[i
];
/* Context sanity check against the caller-supplied ctx1.
   Visible steps: query the current CUDA context into ctx2 with
   cuCtxGetCurrent, and fetch acc_get_current_cuda_context () into ctx3;
   all diagnostics go to stderr.
   NOTE(review): the conditions guarding the "new context established" and
   "acc_get_current_cuda_context returned wrong value" messages, and what
   happens after a diagnostic is printed, are not visible in this excerpt --
   presumably ctx2 and ctx3 are each compared with ctx1; confirm against the
   full file.  */
25 context_check (CUcontext ctx1
)
/* Ask the CUDA driver API which context is current; r holds its status.  */
30 r
= cuCtxGetCurrent (&ctx2
);
31 if (r
!= CUDA_SUCCESS
)
33 fprintf (stderr
, "cuCtxGetCurrent failed: %d\n", r
);
/* NOTE(review): guard for this message is not visible here.  */
39 fprintf (stderr
, "new context established\n");
/* Ask the OpenACC runtime for its notion of the current CUDA context; the
   result is cast to CUcontext.  */
43 ctx3
= (CUcontext
) acc_get_current_cuda_context ();
/* NOTE(review): guard for this message is not visible here.  */
47 fprintf (stderr
, "acc_get_current_cuda_context returned wrong value\n");
/* Test driver.  Visible flow: create a cuBLAS handle, remember the CUDA
   context that is current afterwards (pctx), point OpenACC at the device
   reported by cudaGetDevice, run the same a*x+y update once through
   cublasSaxpy and once through an OpenACC parallel region, compare the two
   results, then tear everything down.  context_check (pctx) is called after
   each OpenACC/cuBLAS step.
   NOTE(review): declarations other than h_X/h_Y1/h_Y2, several if-conditions
   and all exit paths are not visible in this excerpt.  */
55 main (int argc
, char **argv
)
/* Host arrays: h_X is the x input; h_Y1 and h_Y2 are the y operands of the
   cuBLAS and OpenACC computations respectively (see their uses below).  */
65 float *h_X
, *h_Y1
, *h_Y2
;
71 /* Test 2 - cuBLAS creates, OpenACC shares. */
/* Create the cuBLAS handle h before any OpenACC call is made; a status other
   than CUBLAS_STATUS_SUCCESS is reported on stderr.  */
73 s
= cublasCreate (&h
);
74 if (s
!= CUBLAS_STATUS_SUCCESS
)
76 fprintf (stderr
, "cublasCreate failed: %d\n", s
);
/* Record the context that is current at this point in pctx; it is the value
   handed to every context_check call below.  */
80 r
= cuCtxGetCurrent (&pctx
);
81 if (r
!= CUDA_SUCCESS
)
83 fprintf (stderr
, "cuCtxGetCurrent failed: %d\n", r
);
/* Read the CUDA runtime's device number into dev.
   NOTE(review): the check on e guarding the message is not visible here.  */
87 e
= cudaGetDevice (&dev
);
90 fprintf (stderr
, "cudaGetDevice failed: %d\n", e
);
/* Select that same device number for OpenACC, as an acc_device_nvidia
   device.  */
94 acc_set_device_num (dev
, acc_device_nvidia
);
/* Allocate the three host arrays, N floats each.
   NOTE(review): the null checks guarding the "malloc failed" messages are
   not visible here.  */
96 h_X
= (float *) malloc (N
* sizeof (float));
99 fprintf (stderr
, "malloc failed: for h_X\n");
103 h_Y1
= (float *) malloc (N
* sizeof (float));
106 fprintf (stderr
, "malloc failed: for h_Y1\n");
110 h_Y2
= (float *) malloc (N
* sizeof (float));
113 fprintf (stderr
, "malloc failed: for h_Y2\n");
/* Fill the inputs with rand () / (float) RAND_MAX.  h_Y1[i] and h_Y2[i]
   receive the same value, so both computations start from identical y.  */
117 for (i
= 0; i
< N
; i
++)
119 h_X
[i
] = rand () / (float) RAND_MAX
;
120 h_Y2
[i
] = h_Y1
[i
] = rand () / (float) RAND_MAX
;
/* Copy h_X in through the OpenACC runtime; the returned pointer is kept in
   d_X and later passed to cublasSaxpy.
   NOTE(review): the check guarding "copyin error" is not visible here.  */
123 d_X
= (float *) acc_copyin (&h_X
[0], N
* sizeof (float));
126 fprintf (stderr
, "copyin error h_X\n");
130 context_check (pctx
);
/* Same for h_Y1; the returned pointer is kept in d_Y.  */
132 d_Y
= (float *) acc_copyin (&h_Y1
[0], N
* sizeof (float));
135 fprintf (stderr
, "copyin error h_Y1\n");
139 context_check (pctx
);
/* cuBLAS path: SAXPY over N elements with unit increments, operating on the
   pointers returned by acc_copyin.  */
141 s
= cublasSaxpy (h
, N
, &alpha
, d_X
, 1, d_Y
, 1);
142 if (s
!= CUBLAS_STATUS_SUCCESS
)
144 fprintf (stderr
, "cublasSaxpy failed: %d\n", s
);
148 context_check (pctx
);
/* Bring the cuBLAS result back from d_Y into h_Y1.  */
150 acc_memcpy_from_device (&h_Y1
[0], d_Y
, N
* sizeof (float));
152 context_check (pctx
);
/* OpenACC path: the same update applied to h_Y2.  h_X is named in a present
   clause (it was copied in above); h_Y2 is named in a copy clause and alpha
   in a copyin clause.  */
154 #pragma acc parallel present (h_X[0:N]), copy (h_Y2[0:N]) copyin (alpha)
158 for (i
= 0; i
< N
; i
++)
160 h_Y2
[i
] = alpha
* h_X
[i
] + h_Y2
[i
];
164 context_check (pctx
);
/* Compare the two results: accumulate the squared element-wise difference
   in error_norm and the squared OpenACC result in ref_norm.
   NOTE(review): the initialisation of both accumulators is not visible
   here.  */
169 for (i
= 0; i
< N
; ++i
)
173 diff
= h_Y1
[i
] - h_Y2
[i
];
174 error_norm
+= diff
* diff
;
175 ref_norm
+= h_Y2
[i
] * h_Y2
[i
];
/* Turn the sums of squares into norms.  */
178 error_norm
= (float) sqrt ((double) error_norm
);
179 ref_norm
= (float) sqrt ((double) ref_norm
);
/* Report "math error" when the reference norm is below 1e-7 in magnitude or
   when error_norm / ref_norm is at least 1e-6f.  */
181 if ((fabs (ref_norm
) < 1e-7) || ((error_norm
/ ref_norm
) >= 1e-6f
))
183 fprintf (stderr
, "math error\n");
/* Release the two arrays that were passed to acc_copyin above.  */
187 acc_delete (&h_X
[0], N
* sizeof (float));
188 acc_delete (&h_Y1
[0], N
* sizeof (float));
194 context_check (pctx
);
/* Teardown: destroy the cuBLAS handle, then shut down the OpenACC runtime
   for acc_device_nvidia.  */
196 s
= cublasDestroy (h
);
197 if (s
!= CUBLAS_STATUS_SUCCESS
)
199 fprintf (stderr
, "cublasDestroy failed: %d\n", s
);
203 acc_shutdown (acc_device_nvidia
);
/* Query the current context once more, now into ctx, after the shutdown.
   NOTE(review): the conditions selecting between the two messages below are
   not visible in this excerpt; presumably ctx is checked against NULL and
   against pctx -- confirm against the full file.  */
205 r
= cuCtxGetCurrent (&ctx
);
206 if (r
!= CUDA_SUCCESS
)
208 fprintf (stderr
, "cuCtxGetCurrent failed: %d\n", r
);
214 fprintf (stderr
, "Expected context\n");
220 fprintf (stderr
, "Unexpected new context\n");