[AArch64] Merge stores of D-register values with different modes
[official-gcc.git] / gcc / testsuite / c-c++-common / gomp / gridify-3.c
blob 8dbeaef74cd39551e9cdfff53ea626e8d67e5264
/* { dg-do compile } */
/* { dg-require-effective-target offload_hsa } */
/* { dg-options "-fopenmp -fdump-tree-omplower-details" } */
/* Edge length, in elements, of the square tiles processed below.  */
#define BLOCK_SIZE 16

/* Tiled single-precision matrix multiply-accumulate.

   For every 0 <= r < M and 0 <= c < N this stores

     C[r*LDC + c] = alpha * sum_k (A[r*LDA + k] * B[k*LDB + c])
		    + beta * C[r*LDC + c]

   with k running over 0 .. K-1.  The output is processed in
   BLOCK_SIZE x BLOCK_SIZE tiles; tile elements that fall outside the
   M/N/K bounds are loaded as zero and are never written back, so the
   dimensions do not have to be multiples of BLOCK_SIZE.

   M, N, K        extents used for the bounds checks
   alpha, beta    scale factors for the product and for the old C value
   A, B           input arrays, indexed as shown above
   LDA, LDB, LDC  row strides of A, B and C
   C              array that is both read and overwritten

   NOTE(review): C_row and C_col are declared outside the parallel
   regions that assign them, and C is read in the final update although
   it is mapped with map(from:...).  Both are left exactly as found
   because this file is a compile-only test of this construct shape;
   confirm before reusing the code as a runtime kernel.  */
void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA,
   const float*B, const int LDB, const float beta, float*C, const int LDC)
{
#pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N])
#pragma omp distribute collapse(2)
  for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE)
    for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE)
      {
	/* Scratch tiles for the current output block.  */
	float As[BLOCK_SIZE][BLOCK_SIZE];
	float Bs[BLOCK_SIZE][BLOCK_SIZE];
	float Cs[BLOCK_SIZE][BLOCK_SIZE];
	int C_row, C_col;

	/* Clear the accumulator tile.  */
#pragma omp parallel for collapse(2)
	for (int row=0 ; row < BLOCK_SIZE ; row++)
	  for (int col=0 ; col < BLOCK_SIZE ; col++)
	    {
	      Cs[row][col] = 0.0;
	    }

	for (int kblock = 0; kblock < K ; kblock += BLOCK_SIZE )
	  {
	    /* Load one tile of A and one tile of B, zero-filling every
	       element that lies outside the matrix bounds.  */
#pragma omp parallel for collapse(2)
	    for (int row=0 ; row < BLOCK_SIZE ; row++)
	      for (int col=0 ; col < BLOCK_SIZE ; col++)
		{
		  C_row = C_row_start + row;
		  C_col = C_col_start + col;
		  if ((C_row < M) && (kblock + col < K))
		    As[row][col] = A[(C_row*LDA)+ kblock + col];
		  else
		    As[row][col] = 0;
		  if ((kblock + row < K) && C_col < N)
		    Bs[row][col] = B[((kblock+row)*LDB)+ C_col];
		  else
		    Bs[row][col] = 0;
		}

	    /* Accumulate the product of the two loaded tiles.  */
#pragma omp parallel for collapse(2)
	    for (int row=0 ; row < BLOCK_SIZE ; row++)
	      for (int col=0 ; col < BLOCK_SIZE ; col++)
		{
		  for (int e = 0; e < BLOCK_SIZE; ++e)
		    Cs[row][col] += As[row][e] * Bs[e][col];
		}
	  } /* End for kblock .. */

	/* Scale and write back only the in-bounds part of the tile.  */
#pragma omp parallel for collapse(2)
	for (int row=0 ; row < BLOCK_SIZE ; row++)
	  for (int col=0 ; col < BLOCK_SIZE ; col++)
	    {
	      C_row = C_row_start + row;
	      C_col = C_col_start + col;
	      if ((C_row < M) && (C_col < N))
		C[(C_row*LDC)+C_col] = alpha*Cs[row][col] + beta*C[(C_row*LDC)+C_col];
	    }
      } /* End distribute */
}
/* { dg-final { scan-tree-dump "Target construct will be turned into a gridified HSA kernel" "omplower" } } */