/* { dg-do compile } */
/* { dg-require-effective-target offload_hsa } */
/* { dg-options "-fopenmp -fdump-tree-omplower-details" } */
/* Edge length of the square tiles handled by one team.
   NOTE(review): the file's own definition of BLOCK_SIZE is not visible
   here; 16 is only a fallback so this function is self-contained --
   confirm the intended value.  */
#ifndef BLOCK_SIZE
#define BLOCK_SIZE 16
#endif

/* Tiled single-precision matrix multiply-accumulate:

     C = alpha * (A x B) + beta * C

   A is read as an M x K matrix stored row by row with LDA elements
   between consecutive rows, B as a K x N matrix with row stride LDB,
   and C as an M x N matrix with row stride LDC.

   The output is covered by BLOCK_SIZE x BLOCK_SIZE tiles.  For each
   output tile the K dimension is walked in steps of BLOCK_SIZE: the
   matching tiles of A and B are copied into the local arrays As and Bs,
   multiplied, and the partial products are accumulated in Cs.  Once all
   K steps are done the tile is scaled and written back to C.

   NOTE(review): C is listed in map(from:) although its previous contents
   are read for the beta * C term; when actually offloaded this probably
   wants map(tofrom:).  The pragma is left untouched here because this is
   a compile-only test that scans the omplower dump.  */
void
tiled_sgemm_tt (const int M, const int N, const int K, const float alpha,
                const float *A, const int LDA,
                const float *B, const int LDB, const float beta,
                float *C, const int LDC)
{
#pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N])
#pragma omp distribute collapse(2)
  for (int C_row_start = 0; C_row_start < M; C_row_start += BLOCK_SIZE)
    for (int C_col_start = 0; C_col_start < N; C_col_start += BLOCK_SIZE)
      {
        /* Scratch tiles for the current block of A, B and C.  */
        float As[BLOCK_SIZE][BLOCK_SIZE];
        float Bs[BLOCK_SIZE][BLOCK_SIZE];
        float Cs[BLOCK_SIZE][BLOCK_SIZE];

        /* The accumulator tile is only ever updated with +=, so it has
           to start out as zero.  */
#pragma omp parallel for collapse(2)
        for (int row = 0; row < BLOCK_SIZE; row++)
          for (int col = 0; col < BLOCK_SIZE; col++)
            Cs[row][col] = 0.0f;

        for (int kblock = 0; kblock < K; kblock += BLOCK_SIZE)
          {
            /* Load one tile of A and one tile of B.  Cells that fall
               outside the matrices are zero-filled so that the product
               loop below can always run over the full tile.  */
#pragma omp parallel for collapse(2)
            for (int row = 0; row < BLOCK_SIZE; row++)
              for (int col = 0; col < BLOCK_SIZE; col++)
                {
                  /* Declared per iteration so that concurrent iterations
                     do not share these indices.  */
                  const int C_row = C_row_start + row;
                  const int C_col = C_col_start + col;

                  if ((C_row < M) && (kblock + col < K))
                    As[row][col] = A[(C_row * LDA) + kblock + col];
                  else
                    As[row][col] = 0.0f;

                  if ((kblock + row < K) && C_col < N)
                    Bs[row][col] = B[((kblock + row) * LDB) + C_col];
                  else
                    Bs[row][col] = 0.0f;
                }

            /* Cs += As x Bs for this K step.  */
#pragma omp parallel for collapse(2)
            for (int row = 0; row < BLOCK_SIZE; row++)
              for (int col = 0; col < BLOCK_SIZE; col++)
                for (int e = 0; e < BLOCK_SIZE; ++e)
                  Cs[row][col] += As[row][e] * Bs[e][col];
          } /* End for kblock .. */

        /* Scale the finished tile and merge it into C, skipping the
           cells of an edge tile that lie outside the M x N output.  */
#pragma omp parallel for collapse(2)
        for (int row = 0; row < BLOCK_SIZE; row++)
          for (int col = 0; col < BLOCK_SIZE; col++)
            {
              const int C_row = C_row_start + row;
              const int C_col = C_col_start + col;

              if ((C_row < M) && (C_col < N))
                C[(C_row * LDC) + C_col]
                  = alpha * Cs[row][col] + beta * C[(C_row * LDC) + C_col];
            }
      } /* End distribute */
}
/* { dg-final { scan-tree-dump "Target construct will be turned into a gridified HSA kernel" "omplower" } } */