gcc/testsuite/c-c++-common/gomp/gridify-3.c

   1 /* { dg-do compile } */
   2 /* { dg-require-effective-target offload_hsa } */
   3 /* { dg-options "-fopenmp -fdump-tree-omplower-details" } */
   4
   5 #define BLOCK_SIZE 16
   6
   7 void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA,
   8    const float*B, const int LDB, const float beta, float*C, const int LDC)
   9 {
  10 #pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N])
  11 #pragma omp distribute collapse(2)
  12    for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE)
  13       for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE)
  14         {
  15           float As[BLOCK_SIZE][BLOCK_SIZE];
  16           float Bs[BLOCK_SIZE][BLOCK_SIZE];
  17           float Cs[BLOCK_SIZE][BLOCK_SIZE];
  18           int C_row, C_col;
  19
  20 #pragma omp parallel for collapse(2)
  21          for (int row=0 ; row < BLOCK_SIZE ; row++)
  22            for (int col=0 ; col < BLOCK_SIZE ; col++)
  23              {
  24                Cs[row][col] = 0.0;
  25              }
  26
  27
  28          for (int kblock = 0; kblock  < K ; kblock += BLOCK_SIZE )
  29            {
  30 #pragma omp parallel for collapse(2)
  31              for (int row=0 ; row < BLOCK_SIZE ; row++)
  32                for (int col=0 ; col < BLOCK_SIZE ; col++)
  33                  {
  34                    C_row = C_row_start + row;
  35                    C_col = C_col_start + col;
  36                    if ((C_row < M) && (kblock + col < K))
  37                      As[row][col] = A[(C_row*LDA)+ kblock + col];
  38                    else
  39                      As[row][col] = 0;
  40                    if ((kblock + row < K) && C_col < N)
  41                      Bs[row][col] = B[((kblock+row)*LDB)+ C_col];
  42                    else
  43                      Bs[row][col] = 0;
  44                  }
  45
  46 #pragma omp parallel for collapse(2)
  47              for (int row=0 ; row < BLOCK_SIZE ; row++)
  48                for (int col=0 ; col < BLOCK_SIZE ; col++)
  49                  {
  50                    for (int e = 0; e < BLOCK_SIZE; ++e)
  51                      Cs[row][col] += As[row][e] * Bs[e][col];
  52                  }
  53          }  /* End for kblock .. */
  54
  55
  56 #pragma omp parallel for collapse(2)
  57          for (int row=0 ; row < BLOCK_SIZE ; row++)
  58            for (int col=0 ; col < BLOCK_SIZE ; col++)
  59              {
  60                C_row = C_row_start + row;
  61                C_col = C_col_start + col;
  62                if ((C_row < M) && (C_col < N))
  63                  C[(C_row*LDC)+C_col] = alpha*Cs[row][col] + beta*C[(C_row*LDC)+C_col];
  64              }
  65       } /* End distribute */
  66 }
  67
  68 /* { dg-final { scan-tree-dump "Target construct will be turned into a gridified HSA kernel" "omplower" } } */