/* Implementation of the MATMUL intrinsic
   Copyright (C) 2002-2017 Free Software Foundation, Inc.
   Contributed by Paul Brook <paul@nowt.org>

This file is part of the GNU Fortran runtime library (libgfortran).

Libgfortran is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

Libgfortran is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
#include "libgfortran.h"
#include <string.h>
#include <assert.h>


#if defined (HAVE_GFC_REAL_4)

/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we call it for large
   matrices.  */

typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
                          const int *, const GFC_REAL_4 *, const int *,
                          const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
                          int, int);
/* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.

   The equivalent Fortran pseudo-code is:

   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
   IF (.NOT.IS_TRANSPOSED(A)) THEN
     C = 0
     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
         ENDDO
       ENDDO
     ENDDO
   ELSE
     DO J=1,N
       DO I=1,M
         S = 0
         DO K=1,COUNT
           S = S+A(I,K)*B(K,J)
         ENDDO
         C(I,J) = S
       ENDDO
     ENDDO
   ENDIF
*/
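
/* Added illustration (not part of the original matmul.m4 output): a minimal,
   unoptimized sketch of the two loop orders described in the comment above,
   written against plain column-major buffers with explicit leading
   dimensions.  The helper names are hypothetical and nothing in this file
   calls them.  */

static void __attribute__((unused))
matmul_sketch_plain (GFC_REAL_4 *c, const GFC_REAL_4 *a, const GFC_REAL_4 *b,
                     index_type m, index_type n, index_type count,
                     index_type lda, index_type ldb, index_type ldc)
{
  /* A is stored normally: the unit-stride direction of A and C is the row
     index i, so i is innermost and C is accumulated in place.  */
  index_type i, j, k;

  for (j = 0; j < n; j++)
    for (i = 0; i < m; i++)
      c[i + j * ldc] = 0;

  for (j = 0; j < n; j++)
    for (k = 0; k < count; k++)
      for (i = 0; i < m; i++)
        c[i + j * ldc] += a[i + k * lda] * b[k + j * ldb];
}

static void __attribute__((unused))
matmul_sketch_transposed_a (GFC_REAL_4 *c, const GFC_REAL_4 *at,
                            const GFC_REAL_4 *b, index_type m, index_type n,
                            index_type count, index_type lda, index_type ldb,
                            index_type ldc)
{
  /* A is a TRANSPOSE temporary, i.e. element A(i,k) lives at at[k + i*lda]:
     k is now the unit-stride direction of both operands, so each C(i,j) is
     best computed as one dot product.  */
  index_type i, j, k;

  for (j = 0; j < n; j++)
    for (i = 0; i < m; i++)
      {
        GFC_REAL_4 s = 0;
        for (k = 0; k < count; k++)
          s += at[k + i * lda] * b[k + j * ldb];
        c[i + j * ldc] = s;
      }
}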
/* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */

extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
        gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
        int blas_limit, blas_call gemm);
export_proto(matmul_r4);
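
/* Added illustration (not in the original file): one way a caller could
   supply the "gemm" argument above.  The gfortran front-end passes the
   address of an sgemm-compatible routine when external BLAS support is
   requested (e.g. -fexternal-blas); the wrapper name and the parameter
   choices below are hypothetical.  */

static void __attribute__((unused))
matmul_r4_blas_example (gfc_array_r4 * const restrict c,
                        gfc_array_r4 * const restrict a,
                        gfc_array_r4 * const restrict b,
                        blas_call user_gemm)
{
  /* try_blas = 1, blas_limit = 30: user_gemm is only invoked once the
     product involves more than 30*30*30 scalar multiply-adds and the
     operand strides permit a direct gemm call.  */
  matmul_r4 (c, a, b, 1, 30, user_gemm);
}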
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
/* REAL types generate identical code for AVX and AVX2.  Only generate
   an AVX2 function if we are dealing with integer.  */
#undef HAVE_AVX2
#endif


/* Put exhaustive list of possible architectures here, ORed together.  */

#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
#ifdef HAVE_AVX
static void
matmul_r4_avx (gfc_array_r4 * const restrict retarray,
        gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
        int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static void
matmul_r4_avx (gfc_array_r4 * const restrict retarray,
        gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
        int blas_limit, blas_call gemm)
{
  const GFC_REAL_4 * restrict abase;
  const GFC_REAL_4 * restrict bbase;
  GFC_REAL_4 * restrict dest;

  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  index_type x, y, n, count, xcount, ycount;

  assert (GFC_DESCRIPTOR_RANK (a) == 2
          || GFC_DESCRIPTOR_RANK (b) == 2);

/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]

   Either A or B (but not both) can be rank 1:

   o One-dimensional argument A is implicitly treated as a row matrix
     dimensioned [1,count], so xcount=1.

   o One-dimensional argument B is implicitly treated as a column matrix
     dimensioned [count, 1], so ycount=1.
  */
  if (retarray->base_addr == NULL)
    {
      if (GFC_DESCRIPTOR_RANK (a) == 1)
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
        }
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
        }
      else
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);

          GFC_DIMENSION_SET(retarray->dim[1], 0,
                            GFC_DESCRIPTOR_EXTENT(b,1) - 1,
                            GFC_DESCRIPTOR_EXTENT(retarray,0));
        }

      retarray->base_addr
        = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
      retarray->offset = 0;
    }
  else if (unlikely (compile_options.bounds_check))
    {
      index_type ret_extent, arg_extent;

      if (GFC_DESCRIPTOR_RANK (a) == 1)
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Incorrect extent in return array in"
                           " MATMUL intrinsic: is %ld, should be %ld",
                           (long int) ret_extent, (long int) arg_extent);
        }
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Incorrect extent in return array in"
                           " MATMUL intrinsic: is %ld, should be %ld",
                           (long int) ret_extent, (long int) arg_extent);
        }
      else
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Incorrect extent in return array in"
                           " MATMUL intrinsic for dimension 1:"
                           " is %ld, should be %ld",
                           (long int) ret_extent, (long int) arg_extent);

          arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
          if (arg_extent != ret_extent)
            runtime_error ("Incorrect extent in return array in"
                           " MATMUL intrinsic for dimension 2:"
                           " is %ld, should be %ld",
                           (long int) ret_extent, (long int) arg_extent);
        }
    }
  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    {
      /* One-dimensional result may be addressed in the code below
         either as a row or a column matrix.  We want both cases to
         work.  */
      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    }
  else
    {
      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
    }


  if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      /* Treat it as a row matrix A[1,count].  */
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = 1;

      xcount = 1;
      count = GFC_DESCRIPTOR_EXTENT(a,0);
    }
  else
    {
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = GFC_DESCRIPTOR_STRIDE(a,1);

      count = GFC_DESCRIPTOR_EXTENT(a,1);
      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
    }

  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
    {
      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
        runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
    }

  if (GFC_DESCRIPTOR_RANK (b) == 1)
    {
      /* Treat it as a column matrix B[count,1].  */
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);

      /* bystride should never be used for 1-dimensional b.
         In case it is, we want it to cause a segfault rather than
         an incorrect result.  */
      bystride = 0xDEADBEEF;
      ycount = 1;
    }
  else
    {
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
    }

  abase = a->base_addr;
  bbase = b->base_addr;
  dest = retarray->base_addr;
  /* Now that everything is set up, we perform the multiplication
     itself.  */

#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
#define min(a,b) ((a) <= (b) ? (a) : (b))
#define max(a,b) ((a) >= (b) ? (a) : (b))

  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
    {
      const int m = xcount, n = ycount, k = count, ldc = rystride;
      const GFC_REAL_4 one = 1, zero = 0;
      const int lda = (axstride == 1) ? aystride : axstride,
                ldb = (bxstride == 1) ? bystride : bxstride;

      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
        {
          assert (gemm != NULL);
          gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
                &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
                &ldc, 1, 1);
          return;
        }
    }
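
  /* Worked example of the threshold above (added comment, numbers are
     illustrative only): with blas_limit == 30 the cutoff is POW3(30)
     = 27000 scalar multiply-adds, so multiplying two 31x31 matrices
     (31*31*31 = 29791 operations) goes to gemm, while a 30x30 product
     (exactly 27000) keeps using the inline code below.  */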
  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      /* This block of code implements a tuned matmul, derived from
         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1

               Bo Kagstrom and Per Ling
           Department of Computing Science
                Umea University
           S-901 87 Umea, Sweden

         from netlib.org, translated to C, and modified for matmul.m4.  */

      const GFC_REAL_4 *a, *b;
      GFC_REAL_4 *c;
      const index_type m = xcount, n = ycount, k = count;

      /* System generated locals */
      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
                 i1, i2, i3, i4, i5, i6;

      /* Local variables */
      GFC_REAL_4 t1[65536], /* was [256][256] */
                 f11, f12, f21, f22, f31, f32, f41, f42,
                 f13, f14, f23, f24, f33, f34, f43, f44;
      index_type i, j, l, ii, jj, ll;
      index_type isec, jsec, lsec, uisec, ujsec, ulsec;
      a = abase;
      b = bbase;
      c = retarray->base_addr;

      /* Parameter adjustments */
      c_dim1 = rystride;
      c_offset = 1 + c_dim1;
      c -= c_offset;
      a_dim1 = aystride;
      a_offset = 1 + a_dim1;
      a -= a_offset;
      b_dim1 = bystride;
      b_offset = 1 + b_dim1;
      b -= b_offset;

      /* Early exit if possible */
      if (m == 0 || n == 0 || k == 0)
        return;

      /* Empty c first.  */
      for (j=1; j<=n; j++)
        for (i=1; i<=m; i++)
          c[i + j * c_dim1] = (GFC_REAL_4)0;

      /* Start turning the crank. */
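      /* Reading aid (added, not in the original matmul.m4 output): t1 is a
         flattened 256x256 column-major scratch tile holding a transposed
         copy of the current block of a.  An index expression such as
         t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] is simply the
         Fortran-style element T1(l-ll+1, i-ii+1) of a 1-based 256x256
         array: row plus 256 times the column, minus 257, with << 8
         performing the multiplication by 256.  A hypothetical helper
         macro (defined only for illustration, never used below) would be:  */
#define T1_ELEM(row, col) (t1[(row) + ((col) << 8) - 257])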
328 for (jj
= 1; jj
<= i1
; jj
+= 512)
334 ujsec
= jsec
- jsec
% 4;
336 for (ll
= 1; ll
<= i2
; ll
+= 256)
342 ulsec
= lsec
- lsec
% 2;
345 for (ii
= 1; ii
<= i3
; ii
+= 256)
351 uisec
= isec
- isec
% 2;
353 for (l
= ll
; l
<= i4
; l
+= 2)
356 for (i
= ii
; i
<= i5
; i
+= 2)
358 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
360 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
361 a
[i
+ (l
+ 1) * a_dim1
];
362 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
363 a
[i
+ 1 + l
* a_dim1
];
364 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
365 a
[i
+ 1 + (l
+ 1) * a_dim1
];
369 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
370 a
[ii
+ isec
- 1 + l
* a_dim1
];
371 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
372 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
378 for (i
= ii
; i
<= i4
; ++i
)
380 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
381 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
385 uisec
= isec
- isec
% 4;
387 for (j
= jj
; j
<= i4
; j
+= 4)
390 for (i
= ii
; i
<= i5
; i
+= 4)
392 f11
= c
[i
+ j
* c_dim1
];
393 f21
= c
[i
+ 1 + j
* c_dim1
];
394 f12
= c
[i
+ (j
+ 1) * c_dim1
];
395 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
396 f13
= c
[i
+ (j
+ 2) * c_dim1
];
397 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
398 f14
= c
[i
+ (j
+ 3) * c_dim1
];
399 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
400 f31
= c
[i
+ 2 + j
* c_dim1
];
401 f41
= c
[i
+ 3 + j
* c_dim1
];
402 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
403 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
404 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
405 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
406 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
407 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
409 for (l
= ll
; l
<= i6
; ++l
)
411 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
413 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
415 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
416 * b
[l
+ (j
+ 1) * b_dim1
];
417 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
418 * b
[l
+ (j
+ 1) * b_dim1
];
419 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
420 * b
[l
+ (j
+ 2) * b_dim1
];
421 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
422 * b
[l
+ (j
+ 2) * b_dim1
];
423 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
424 * b
[l
+ (j
+ 3) * b_dim1
];
425 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
426 * b
[l
+ (j
+ 3) * b_dim1
];
427 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
429 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
431 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
432 * b
[l
+ (j
+ 1) * b_dim1
];
433 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
434 * b
[l
+ (j
+ 1) * b_dim1
];
435 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
436 * b
[l
+ (j
+ 2) * b_dim1
];
437 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
438 * b
[l
+ (j
+ 2) * b_dim1
];
439 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
440 * b
[l
+ (j
+ 3) * b_dim1
];
441 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
442 * b
[l
+ (j
+ 3) * b_dim1
];
444 c
[i
+ j
* c_dim1
] = f11
;
445 c
[i
+ 1 + j
* c_dim1
] = f21
;
446 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
447 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
448 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
449 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
450 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
451 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
452 c
[i
+ 2 + j
* c_dim1
] = f31
;
453 c
[i
+ 3 + j
* c_dim1
] = f41
;
454 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
455 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
456 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
457 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
458 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
459 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
464 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
466 f11
= c
[i
+ j
* c_dim1
];
467 f12
= c
[i
+ (j
+ 1) * c_dim1
];
468 f13
= c
[i
+ (j
+ 2) * c_dim1
];
469 f14
= c
[i
+ (j
+ 3) * c_dim1
];
471 for (l
= ll
; l
<= i6
; ++l
)
473 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
474 257] * b
[l
+ j
* b_dim1
];
475 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
476 257] * b
[l
+ (j
+ 1) * b_dim1
];
477 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
478 257] * b
[l
+ (j
+ 2) * b_dim1
];
479 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
480 257] * b
[l
+ (j
+ 3) * b_dim1
];
482 c
[i
+ j
* c_dim1
] = f11
;
483 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
484 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
485 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
492 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
495 for (i
= ii
; i
<= i5
; i
+= 4)
497 f11
= c
[i
+ j
* c_dim1
];
498 f21
= c
[i
+ 1 + j
* c_dim1
];
499 f31
= c
[i
+ 2 + j
* c_dim1
];
500 f41
= c
[i
+ 3 + j
* c_dim1
];
502 for (l
= ll
; l
<= i6
; ++l
)
504 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
505 257] * b
[l
+ j
* b_dim1
];
506 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
507 257] * b
[l
+ j
* b_dim1
];
508 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
509 257] * b
[l
+ j
* b_dim1
];
510 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
511 257] * b
[l
+ j
* b_dim1
];
513 c
[i
+ j
* c_dim1
] = f11
;
514 c
[i
+ 1 + j
* c_dim1
] = f21
;
515 c
[i
+ 2 + j
* c_dim1
] = f31
;
516 c
[i
+ 3 + j
* c_dim1
] = f41
;
519 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
521 f11
= c
[i
+ j
* c_dim1
];
523 for (l
= ll
; l
<= i6
; ++l
)
525 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
526 257] * b
[l
+ j
* b_dim1
];
528 c
[i
+ j
* c_dim1
] = f11
;
537 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
539 if (GFC_DESCRIPTOR_RANK (a
) != 1)
541 const GFC_REAL_4
*restrict abase_x
;
542 const GFC_REAL_4
*restrict bbase_y
;
543 GFC_REAL_4
*restrict dest_y
;
546 for (y
= 0; y
< ycount
; y
++)
548 bbase_y
= &bbase
[y
*bystride
];
549 dest_y
= &dest
[y
*rystride
];
550 for (x
= 0; x
< xcount
; x
++)
552 abase_x
= &abase
[x
*axstride
];
554 for (n
= 0; n
< count
; n
++)
555 s
+= abase_x
[n
] * bbase_y
[n
];
562 const GFC_REAL_4
*restrict bbase_y
;
565 for (y
= 0; y
< ycount
; y
++)
567 bbase_y
= &bbase
[y
*bystride
];
569 for (n
= 0; n
< count
; n
++)
570 s
+= abase
[n
*axstride
] * bbase_y
[n
];
571 dest
[y
*rystride
] = s
;
575 else if (axstride
< aystride
)
577 for (y
= 0; y
< ycount
; y
++)
578 for (x
= 0; x
< xcount
; x
++)
579 dest
[x
*rxstride
+ y
*rystride
] = (GFC_REAL_4
)0;
581 for (y
= 0; y
< ycount
; y
++)
582 for (n
= 0; n
< count
; n
++)
583 for (x
= 0; x
< xcount
; x
++)
584 /* dest[x,y] += a[x,n] * b[n,y] */
585 dest
[x
*rxstride
+ y
*rystride
] +=
586 abase
[x
*axstride
+ n
*aystride
] *
587 bbase
[n
*bxstride
+ y
*bystride
];
589 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
591 const GFC_REAL_4
*restrict bbase_y
;
594 for (y
= 0; y
< ycount
; y
++)
596 bbase_y
= &bbase
[y
*bystride
];
598 for (n
= 0; n
< count
; n
++)
599 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
600 dest
[y
*rxstride
] = s
;
605 const GFC_REAL_4
*restrict abase_x
;
606 const GFC_REAL_4
*restrict bbase_y
;
607 GFC_REAL_4
*restrict dest_y
;
610 for (y
= 0; y
< ycount
; y
++)
612 bbase_y
= &bbase
[y
*bystride
];
613 dest_y
= &dest
[y
*rystride
];
614 for (x
= 0; x
< xcount
; x
++)
616 abase_x
= &abase
[x
*axstride
];
618 for (n
= 0; n
< count
; n
++)
619 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
620 dest_y
[x
*rxstride
] = s
;
629 #endif /* HAVE_AVX */
633 matmul_r4_avx2 (gfc_array_r4
* const restrict retarray
,
634 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
635 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx2")));
637 matmul_r4_avx2 (gfc_array_r4
* const restrict retarray
,
638 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
639 int blas_limit
, blas_call gemm
)
641 const GFC_REAL_4
* restrict abase
;
642 const GFC_REAL_4
* restrict bbase
;
643 GFC_REAL_4
* restrict dest
;
645 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
646 index_type x
, y
, n
, count
, xcount
, ycount
;
648 assert (GFC_DESCRIPTOR_RANK (a
) == 2
649 || GFC_DESCRIPTOR_RANK (b
) == 2);
651 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
653 Either A or B (but not both) can be rank 1:
655 o One-dimensional argument A is implicitly treated as a row matrix
656 dimensioned [1,count], so xcount=1.
658 o One-dimensional argument B is implicitly treated as a column matrix
659 dimensioned [count, 1], so ycount=1.
662 if (retarray
->base_addr
== NULL
)
664 if (GFC_DESCRIPTOR_RANK (a
) == 1)
666 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
667 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
669 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
671 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
672 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
676 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
677 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
679 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
680 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
681 GFC_DESCRIPTOR_EXTENT(retarray
,0));
685 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_REAL_4
));
686 retarray
->offset
= 0;
688 else if (unlikely (compile_options
.bounds_check
))
690 index_type ret_extent
, arg_extent
;
692 if (GFC_DESCRIPTOR_RANK (a
) == 1)
694 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
695 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
696 if (arg_extent
!= ret_extent
)
697 runtime_error ("Incorrect extent in return array in"
698 " MATMUL intrinsic: is %ld, should be %ld",
699 (long int) ret_extent
, (long int) arg_extent
);
701 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
703 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
704 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
705 if (arg_extent
!= ret_extent
)
706 runtime_error ("Incorrect extent in return array in"
707 " MATMUL intrinsic: is %ld, should be %ld",
708 (long int) ret_extent
, (long int) arg_extent
);
712 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
713 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
714 if (arg_extent
!= ret_extent
)
715 runtime_error ("Incorrect extent in return array in"
716 " MATMUL intrinsic for dimension 1:"
717 " is %ld, should be %ld",
718 (long int) ret_extent
, (long int) arg_extent
);
720 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
721 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
722 if (arg_extent
!= ret_extent
)
723 runtime_error ("Incorrect extent in return array in"
724 " MATMUL intrinsic for dimension 2:"
725 " is %ld, should be %ld",
726 (long int) ret_extent
, (long int) arg_extent
);
731 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
      /* One-dimensional result may be addressed in the code below
         either as a row or a column matrix.  We want both cases to
         work.  */
736 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
740 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
741 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
745 if (GFC_DESCRIPTOR_RANK (a
) == 1)
      /* Treat it as a row matrix A[1,count].  */
748 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
752 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
756 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
757 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
759 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
760 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
763 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
765 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
766 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
769 if (GFC_DESCRIPTOR_RANK (b
) == 1)
771 /* Treat it as a column matrix B[count,1] */
772 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
      /* bystride should never be used for 1-dimensional b.
         In case it is, we want it to cause a segfault rather than
         an incorrect result.  */
777 bystride
= 0xDEADBEEF;
782 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
783 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
784 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
787 abase
= a
->base_addr
;
788 bbase
= b
->base_addr
;
789 dest
= retarray
->base_addr
;
  /* Now that everything is set up, we perform the multiplication
     itself.  */
794 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
795 #define min(a,b) ((a) <= (b) ? (a) : (b))
796 #define max(a,b) ((a) >= (b) ? (a) : (b))
798 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
799 && (bxstride
== 1 || bystride
== 1)
800 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
803 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
804 const GFC_REAL_4 one
= 1, zero
= 0;
805 const int lda
= (axstride
== 1) ? aystride
: axstride
,
806 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
808 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
810 assert (gemm
!= NULL
);
811 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
812 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
818 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
820 /* This block of code implements a tuned matmul, derived from
821 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
823 Bo Kagstrom and Per Ling
824 Department of Computing Science
826 S-901 87 Umea, Sweden
828 from netlib.org, translated to C, and modified for matmul.m4. */
830 const GFC_REAL_4
*a
, *b
;
832 const index_type m
= xcount
, n
= ycount
, k
= count
;
834 /* System generated locals */
835 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
836 i1
, i2
, i3
, i4
, i5
, i6
;
838 /* Local variables */
839 GFC_REAL_4 t1
[65536], /* was [256][256] */
840 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
841 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
842 index_type i
, j
, l
, ii
, jj
, ll
;
843 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
847 c
= retarray
->base_addr
;
849 /* Parameter adjustments */
851 c_offset
= 1 + c_dim1
;
854 a_offset
= 1 + a_dim1
;
857 b_offset
= 1 + b_dim1
;
860 /* Early exit if possible */
861 if (m
== 0 || n
== 0 || k
== 0)
867 c
[i
+ j
* c_dim1
] = (GFC_REAL_4
)0;
869 /* Start turning the crank. */
871 for (jj
= 1; jj
<= i1
; jj
+= 512)
877 ujsec
= jsec
- jsec
% 4;
879 for (ll
= 1; ll
<= i2
; ll
+= 256)
885 ulsec
= lsec
- lsec
% 2;
888 for (ii
= 1; ii
<= i3
; ii
+= 256)
894 uisec
= isec
- isec
% 2;
896 for (l
= ll
; l
<= i4
; l
+= 2)
899 for (i
= ii
; i
<= i5
; i
+= 2)
901 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
903 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
904 a
[i
+ (l
+ 1) * a_dim1
];
905 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
906 a
[i
+ 1 + l
* a_dim1
];
907 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
908 a
[i
+ 1 + (l
+ 1) * a_dim1
];
912 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
913 a
[ii
+ isec
- 1 + l
* a_dim1
];
914 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
915 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
921 for (i
= ii
; i
<= i4
; ++i
)
923 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
924 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
928 uisec
= isec
- isec
% 4;
930 for (j
= jj
; j
<= i4
; j
+= 4)
933 for (i
= ii
; i
<= i5
; i
+= 4)
935 f11
= c
[i
+ j
* c_dim1
];
936 f21
= c
[i
+ 1 + j
* c_dim1
];
937 f12
= c
[i
+ (j
+ 1) * c_dim1
];
938 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
939 f13
= c
[i
+ (j
+ 2) * c_dim1
];
940 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
941 f14
= c
[i
+ (j
+ 3) * c_dim1
];
942 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
943 f31
= c
[i
+ 2 + j
* c_dim1
];
944 f41
= c
[i
+ 3 + j
* c_dim1
];
945 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
946 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
947 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
948 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
949 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
950 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
952 for (l
= ll
; l
<= i6
; ++l
)
954 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
956 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
958 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
959 * b
[l
+ (j
+ 1) * b_dim1
];
960 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
961 * b
[l
+ (j
+ 1) * b_dim1
];
962 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
963 * b
[l
+ (j
+ 2) * b_dim1
];
964 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
965 * b
[l
+ (j
+ 2) * b_dim1
];
966 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
967 * b
[l
+ (j
+ 3) * b_dim1
];
968 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
969 * b
[l
+ (j
+ 3) * b_dim1
];
970 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
972 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
974 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
975 * b
[l
+ (j
+ 1) * b_dim1
];
976 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
977 * b
[l
+ (j
+ 1) * b_dim1
];
978 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
979 * b
[l
+ (j
+ 2) * b_dim1
];
980 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
981 * b
[l
+ (j
+ 2) * b_dim1
];
982 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
983 * b
[l
+ (j
+ 3) * b_dim1
];
984 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
985 * b
[l
+ (j
+ 3) * b_dim1
];
987 c
[i
+ j
* c_dim1
] = f11
;
988 c
[i
+ 1 + j
* c_dim1
] = f21
;
989 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
990 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
991 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
992 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
993 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
994 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
995 c
[i
+ 2 + j
* c_dim1
] = f31
;
996 c
[i
+ 3 + j
* c_dim1
] = f41
;
997 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
998 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
999 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
1000 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
1001 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
1002 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
1007 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1009 f11
= c
[i
+ j
* c_dim1
];
1010 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1011 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1012 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1014 for (l
= ll
; l
<= i6
; ++l
)
1016 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1017 257] * b
[l
+ j
* b_dim1
];
1018 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1019 257] * b
[l
+ (j
+ 1) * b_dim1
];
1020 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1021 257] * b
[l
+ (j
+ 2) * b_dim1
];
1022 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1023 257] * b
[l
+ (j
+ 3) * b_dim1
];
1025 c
[i
+ j
* c_dim1
] = f11
;
1026 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1027 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1028 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1035 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
1037 i5
= ii
+ uisec
- 1;
1038 for (i
= ii
; i
<= i5
; i
+= 4)
1040 f11
= c
[i
+ j
* c_dim1
];
1041 f21
= c
[i
+ 1 + j
* c_dim1
];
1042 f31
= c
[i
+ 2 + j
* c_dim1
];
1043 f41
= c
[i
+ 3 + j
* c_dim1
];
1045 for (l
= ll
; l
<= i6
; ++l
)
1047 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1048 257] * b
[l
+ j
* b_dim1
];
1049 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
1050 257] * b
[l
+ j
* b_dim1
];
1051 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
1052 257] * b
[l
+ j
* b_dim1
];
1053 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
1054 257] * b
[l
+ j
* b_dim1
];
1056 c
[i
+ j
* c_dim1
] = f11
;
1057 c
[i
+ 1 + j
* c_dim1
] = f21
;
1058 c
[i
+ 2 + j
* c_dim1
] = f31
;
1059 c
[i
+ 3 + j
* c_dim1
] = f41
;
1062 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1064 f11
= c
[i
+ j
* c_dim1
];
1066 for (l
= ll
; l
<= i6
; ++l
)
1068 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1069 257] * b
[l
+ j
* b_dim1
];
1071 c
[i
+ j
* c_dim1
] = f11
;
1080 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
1082 if (GFC_DESCRIPTOR_RANK (a
) != 1)
1084 const GFC_REAL_4
*restrict abase_x
;
1085 const GFC_REAL_4
*restrict bbase_y
;
1086 GFC_REAL_4
*restrict dest_y
;
1089 for (y
= 0; y
< ycount
; y
++)
1091 bbase_y
= &bbase
[y
*bystride
];
1092 dest_y
= &dest
[y
*rystride
];
1093 for (x
= 0; x
< xcount
; x
++)
1095 abase_x
= &abase
[x
*axstride
];
1097 for (n
= 0; n
< count
; n
++)
1098 s
+= abase_x
[n
] * bbase_y
[n
];
1105 const GFC_REAL_4
*restrict bbase_y
;
1108 for (y
= 0; y
< ycount
; y
++)
1110 bbase_y
= &bbase
[y
*bystride
];
1112 for (n
= 0; n
< count
; n
++)
1113 s
+= abase
[n
*axstride
] * bbase_y
[n
];
1114 dest
[y
*rystride
] = s
;
1118 else if (axstride
< aystride
)
1120 for (y
= 0; y
< ycount
; y
++)
1121 for (x
= 0; x
< xcount
; x
++)
1122 dest
[x
*rxstride
+ y
*rystride
] = (GFC_REAL_4
)0;
1124 for (y
= 0; y
< ycount
; y
++)
1125 for (n
= 0; n
< count
; n
++)
1126 for (x
= 0; x
< xcount
; x
++)
1127 /* dest[x,y] += a[x,n] * b[n,y] */
1128 dest
[x
*rxstride
+ y
*rystride
] +=
1129 abase
[x
*axstride
+ n
*aystride
] *
1130 bbase
[n
*bxstride
+ y
*bystride
];
1132 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
1134 const GFC_REAL_4
*restrict bbase_y
;
1137 for (y
= 0; y
< ycount
; y
++)
1139 bbase_y
= &bbase
[y
*bystride
];
1141 for (n
= 0; n
< count
; n
++)
1142 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
1143 dest
[y
*rxstride
] = s
;
1148 const GFC_REAL_4
*restrict abase_x
;
1149 const GFC_REAL_4
*restrict bbase_y
;
1150 GFC_REAL_4
*restrict dest_y
;
1153 for (y
= 0; y
< ycount
; y
++)
1155 bbase_y
= &bbase
[y
*bystride
];
1156 dest_y
= &dest
[y
*rystride
];
1157 for (x
= 0; x
< xcount
; x
++)
1159 abase_x
= &abase
[x
*axstride
];
1161 for (n
= 0; n
< count
; n
++)
1162 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
1163 dest_y
[x
*rxstride
] = s
;
1172 #endif /* HAVE_AVX2 */
1176 matmul_r4_avx512f (gfc_array_r4
* const restrict retarray
,
1177 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
1178 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx512f")));
1180 matmul_r4_avx512f (gfc_array_r4
* const restrict retarray
,
1181 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
1182 int blas_limit
, blas_call gemm
)
1184 const GFC_REAL_4
* restrict abase
;
1185 const GFC_REAL_4
* restrict bbase
;
1186 GFC_REAL_4
* restrict dest
;
1188 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
1189 index_type x
, y
, n
, count
, xcount
, ycount
;
1191 assert (GFC_DESCRIPTOR_RANK (a
) == 2
1192 || GFC_DESCRIPTOR_RANK (b
) == 2);
1194 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1196 Either A or B (but not both) can be rank 1:
1198 o One-dimensional argument A is implicitly treated as a row matrix
1199 dimensioned [1,count], so xcount=1.
1201 o One-dimensional argument B is implicitly treated as a column matrix
1202 dimensioned [count, 1], so ycount=1.
1205 if (retarray
->base_addr
== NULL
)
1207 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1209 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1210 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
1212 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1214 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1215 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1219 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1220 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1222 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
1223 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
1224 GFC_DESCRIPTOR_EXTENT(retarray
,0));
1228 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_REAL_4
));
1229 retarray
->offset
= 0;
1231 else if (unlikely (compile_options
.bounds_check
))
1233 index_type ret_extent
, arg_extent
;
1235 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1237 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1238 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1239 if (arg_extent
!= ret_extent
)
1240 runtime_error ("Incorrect extent in return array in"
1241 " MATMUL intrinsic: is %ld, should be %ld",
1242 (long int) ret_extent
, (long int) arg_extent
);
1244 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1246 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1247 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1248 if (arg_extent
!= ret_extent
)
1249 runtime_error ("Incorrect extent in return array in"
1250 " MATMUL intrinsic: is %ld, should be %ld",
1251 (long int) ret_extent
, (long int) arg_extent
);
1255 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1256 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1257 if (arg_extent
!= ret_extent
)
1258 runtime_error ("Incorrect extent in return array in"
1259 " MATMUL intrinsic for dimension 1:"
1260 " is %ld, should be %ld",
1261 (long int) ret_extent
, (long int) arg_extent
);
1263 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1264 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
1265 if (arg_extent
!= ret_extent
)
1266 runtime_error ("Incorrect extent in return array in"
1267 " MATMUL intrinsic for dimension 2:"
1268 " is %ld, should be %ld",
1269 (long int) ret_extent
, (long int) arg_extent
);
1274 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
      /* One-dimensional result may be addressed in the code below
         either as a row or a column matrix.  We want both cases to
         work.  */
1279 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1283 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1284 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
1288 if (GFC_DESCRIPTOR_RANK (a
) == 1)
      /* Treat it as a row matrix A[1,count].  */
1291 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1295 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
1299 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1300 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
1302 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
1303 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
1306 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
1308 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
1309 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1312 if (GFC_DESCRIPTOR_RANK (b
) == 1)
1314 /* Treat it as a column matrix B[count,1] */
1315 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
      /* bystride should never be used for 1-dimensional b.
         In case it is, we want it to cause a segfault rather than
         an incorrect result.  */
1320 bystride
= 0xDEADBEEF;
1325 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1326 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
1327 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
1330 abase
= a
->base_addr
;
1331 bbase
= b
->base_addr
;
1332 dest
= retarray
->base_addr
;
  /* Now that everything is set up, we perform the multiplication
     itself.  */
1337 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1338 #define min(a,b) ((a) <= (b) ? (a) : (b))
1339 #define max(a,b) ((a) >= (b) ? (a) : (b))
1341 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
1342 && (bxstride
== 1 || bystride
== 1)
1343 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
1344 > POW3(blas_limit
)))
1346 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
1347 const GFC_REAL_4 one
= 1, zero
= 0;
1348 const int lda
= (axstride
== 1) ? aystride
: axstride
,
1349 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
1351 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
1353 assert (gemm
!= NULL
);
1354 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
1355 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
1361 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
1363 /* This block of code implements a tuned matmul, derived from
1364 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1366 Bo Kagstrom and Per Ling
1367 Department of Computing Science
1369 S-901 87 Umea, Sweden
1371 from netlib.org, translated to C, and modified for matmul.m4. */
1373 const GFC_REAL_4
*a
, *b
;
1375 const index_type m
= xcount
, n
= ycount
, k
= count
;
1377 /* System generated locals */
1378 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
1379 i1
, i2
, i3
, i4
, i5
, i6
;
1381 /* Local variables */
1382 GFC_REAL_4 t1
[65536], /* was [256][256] */
1383 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
1384 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
1385 index_type i
, j
, l
, ii
, jj
, ll
;
1386 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
1390 c
= retarray
->base_addr
;
1392 /* Parameter adjustments */
1394 c_offset
= 1 + c_dim1
;
1397 a_offset
= 1 + a_dim1
;
1400 b_offset
= 1 + b_dim1
;
1403 /* Early exit if possible */
1404 if (m
== 0 || n
== 0 || k
== 0)
1407 /* Empty c first. */
1408 for (j
=1; j
<=n
; j
++)
1409 for (i
=1; i
<=m
; i
++)
1410 c
[i
+ j
* c_dim1
] = (GFC_REAL_4
)0;
1412 /* Start turning the crank. */
1414 for (jj
= 1; jj
<= i1
; jj
+= 512)
1420 ujsec
= jsec
- jsec
% 4;
1422 for (ll
= 1; ll
<= i2
; ll
+= 256)
1428 ulsec
= lsec
- lsec
% 2;
1431 for (ii
= 1; ii
<= i3
; ii
+= 256)
1437 uisec
= isec
- isec
% 2;
1438 i4
= ll
+ ulsec
- 1;
1439 for (l
= ll
; l
<= i4
; l
+= 2)
1441 i5
= ii
+ uisec
- 1;
1442 for (i
= ii
; i
<= i5
; i
+= 2)
1444 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
1446 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
1447 a
[i
+ (l
+ 1) * a_dim1
];
1448 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
1449 a
[i
+ 1 + l
* a_dim1
];
1450 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
1451 a
[i
+ 1 + (l
+ 1) * a_dim1
];
1455 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
1456 a
[ii
+ isec
- 1 + l
* a_dim1
];
1457 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
1458 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
1464 for (i
= ii
; i
<= i4
; ++i
)
1466 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
1467 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
1471 uisec
= isec
- isec
% 4;
1472 i4
= jj
+ ujsec
- 1;
1473 for (j
= jj
; j
<= i4
; j
+= 4)
1475 i5
= ii
+ uisec
- 1;
1476 for (i
= ii
; i
<= i5
; i
+= 4)
1478 f11
= c
[i
+ j
* c_dim1
];
1479 f21
= c
[i
+ 1 + j
* c_dim1
];
1480 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1481 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
1482 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1483 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
1484 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1485 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
1486 f31
= c
[i
+ 2 + j
* c_dim1
];
1487 f41
= c
[i
+ 3 + j
* c_dim1
];
1488 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
1489 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
1490 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
1491 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
1492 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
1493 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
1495 for (l
= ll
; l
<= i6
; ++l
)
1497 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1498 * b
[l
+ j
* b_dim1
];
1499 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1500 * b
[l
+ j
* b_dim1
];
1501 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1502 * b
[l
+ (j
+ 1) * b_dim1
];
1503 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1504 * b
[l
+ (j
+ 1) * b_dim1
];
1505 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1506 * b
[l
+ (j
+ 2) * b_dim1
];
1507 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1508 * b
[l
+ (j
+ 2) * b_dim1
];
1509 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1510 * b
[l
+ (j
+ 3) * b_dim1
];
1511 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1512 * b
[l
+ (j
+ 3) * b_dim1
];
1513 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1514 * b
[l
+ j
* b_dim1
];
1515 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1516 * b
[l
+ j
* b_dim1
];
1517 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1518 * b
[l
+ (j
+ 1) * b_dim1
];
1519 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1520 * b
[l
+ (j
+ 1) * b_dim1
];
1521 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1522 * b
[l
+ (j
+ 2) * b_dim1
];
1523 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1524 * b
[l
+ (j
+ 2) * b_dim1
];
1525 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1526 * b
[l
+ (j
+ 3) * b_dim1
];
1527 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1528 * b
[l
+ (j
+ 3) * b_dim1
];
1530 c
[i
+ j
* c_dim1
] = f11
;
1531 c
[i
+ 1 + j
* c_dim1
] = f21
;
1532 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1533 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
1534 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1535 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
1536 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1537 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
1538 c
[i
+ 2 + j
* c_dim1
] = f31
;
1539 c
[i
+ 3 + j
* c_dim1
] = f41
;
1540 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
1541 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
1542 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
1543 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
1544 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
1545 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
1550 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1552 f11
= c
[i
+ j
* c_dim1
];
1553 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1554 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1555 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1557 for (l
= ll
; l
<= i6
; ++l
)
1559 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1560 257] * b
[l
+ j
* b_dim1
];
1561 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1562 257] * b
[l
+ (j
+ 1) * b_dim1
];
1563 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1564 257] * b
[l
+ (j
+ 2) * b_dim1
];
1565 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1566 257] * b
[l
+ (j
+ 3) * b_dim1
];
1568 c
[i
+ j
* c_dim1
] = f11
;
1569 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1570 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1571 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1578 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
1580 i5
= ii
+ uisec
- 1;
1581 for (i
= ii
; i
<= i5
; i
+= 4)
1583 f11
= c
[i
+ j
* c_dim1
];
1584 f21
= c
[i
+ 1 + j
* c_dim1
];
1585 f31
= c
[i
+ 2 + j
* c_dim1
];
1586 f41
= c
[i
+ 3 + j
* c_dim1
];
1588 for (l
= ll
; l
<= i6
; ++l
)
1590 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1591 257] * b
[l
+ j
* b_dim1
];
1592 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
1593 257] * b
[l
+ j
* b_dim1
];
1594 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
1595 257] * b
[l
+ j
* b_dim1
];
1596 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
1597 257] * b
[l
+ j
* b_dim1
];
1599 c
[i
+ j
* c_dim1
] = f11
;
1600 c
[i
+ 1 + j
* c_dim1
] = f21
;
1601 c
[i
+ 2 + j
* c_dim1
] = f31
;
1602 c
[i
+ 3 + j
* c_dim1
] = f41
;
1605 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1607 f11
= c
[i
+ j
* c_dim1
];
1609 for (l
= ll
; l
<= i6
; ++l
)
1611 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1612 257] * b
[l
+ j
* b_dim1
];
1614 c
[i
+ j
* c_dim1
] = f11
;
1623 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
1625 if (GFC_DESCRIPTOR_RANK (a
) != 1)
1627 const GFC_REAL_4
*restrict abase_x
;
1628 const GFC_REAL_4
*restrict bbase_y
;
1629 GFC_REAL_4
*restrict dest_y
;
1632 for (y
= 0; y
< ycount
; y
++)
1634 bbase_y
= &bbase
[y
*bystride
];
1635 dest_y
= &dest
[y
*rystride
];
1636 for (x
= 0; x
< xcount
; x
++)
1638 abase_x
= &abase
[x
*axstride
];
1640 for (n
= 0; n
< count
; n
++)
1641 s
+= abase_x
[n
] * bbase_y
[n
];
1648 const GFC_REAL_4
*restrict bbase_y
;
1651 for (y
= 0; y
< ycount
; y
++)
1653 bbase_y
= &bbase
[y
*bystride
];
1655 for (n
= 0; n
< count
; n
++)
1656 s
+= abase
[n
*axstride
] * bbase_y
[n
];
1657 dest
[y
*rystride
] = s
;
1661 else if (axstride
< aystride
)
1663 for (y
= 0; y
< ycount
; y
++)
1664 for (x
= 0; x
< xcount
; x
++)
1665 dest
[x
*rxstride
+ y
*rystride
] = (GFC_REAL_4
)0;
1667 for (y
= 0; y
< ycount
; y
++)
1668 for (n
= 0; n
< count
; n
++)
1669 for (x
= 0; x
< xcount
; x
++)
1670 /* dest[x,y] += a[x,n] * b[n,y] */
1671 dest
[x
*rxstride
+ y
*rystride
] +=
1672 abase
[x
*axstride
+ n
*aystride
] *
1673 bbase
[n
*bxstride
+ y
*bystride
];
1675 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
1677 const GFC_REAL_4
*restrict bbase_y
;
1680 for (y
= 0; y
< ycount
; y
++)
1682 bbase_y
= &bbase
[y
*bystride
];
1684 for (n
= 0; n
< count
; n
++)
1685 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
1686 dest
[y
*rxstride
] = s
;
1691 const GFC_REAL_4
*restrict abase_x
;
1692 const GFC_REAL_4
*restrict bbase_y
;
1693 GFC_REAL_4
*restrict dest_y
;
1696 for (y
= 0; y
< ycount
; y
++)
1698 bbase_y
= &bbase
[y
*bystride
];
1699 dest_y
= &dest
[y
*rystride
];
1700 for (x
= 0; x
< xcount
; x
++)
1702 abase_x
= &abase
[x
*axstride
];
1704 for (n
= 0; n
< count
; n
++)
1705 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
1706 dest_y
[x
*rxstride
] = s
;
1715 #endif /* HAVE_AVX512F */
1717 /* Function to fall back to if there is no special processor-specific version. */
1719 matmul_r4_vanilla (gfc_array_r4
* const restrict retarray
,
1720 gfc_array_r4
* const restrict a
, gfc_array_r4
* const restrict b
, int try_blas
,
1721 int blas_limit
, blas_call gemm
)
1723 const GFC_REAL_4
* restrict abase
;
1724 const GFC_REAL_4
* restrict bbase
;
1725 GFC_REAL_4
* restrict dest
;
1727 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
1728 index_type x
, y
, n
, count
, xcount
, ycount
;
1730 assert (GFC_DESCRIPTOR_RANK (a
) == 2
1731 || GFC_DESCRIPTOR_RANK (b
) == 2);
1733 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1735 Either A or B (but not both) can be rank 1:
1737 o One-dimensional argument A is implicitly treated as a row matrix
1738 dimensioned [1,count], so xcount=1.
1740 o One-dimensional argument B is implicitly treated as a column matrix
1741 dimensioned [count, 1], so ycount=1.
1744 if (retarray
->base_addr
== NULL
)
1746 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1748 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1749 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
1751 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1753 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1754 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1758 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1759 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1761 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
1762 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
1763 GFC_DESCRIPTOR_EXTENT(retarray
,0));
1767 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_REAL_4
));
1768 retarray
->offset
= 0;
1770 else if (unlikely (compile_options
.bounds_check
))
1772 index_type ret_extent
, arg_extent
;
1774 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1776 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1777 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1778 if (arg_extent
!= ret_extent
)
1779 runtime_error ("Incorrect extent in return array in"
1780 " MATMUL intrinsic: is %ld, should be %ld",
1781 (long int) ret_extent
, (long int) arg_extent
);
1783 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1785 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1786 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1787 if (arg_extent
!= ret_extent
)
1788 runtime_error ("Incorrect extent in return array in"
1789 " MATMUL intrinsic: is %ld, should be %ld",
1790 (long int) ret_extent
, (long int) arg_extent
);
1794 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1795 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1796 if (arg_extent
!= ret_extent
)
1797 runtime_error ("Incorrect extent in return array in"
1798 " MATMUL intrinsic for dimension 1:"
1799 " is %ld, should be %ld",
1800 (long int) ret_extent
, (long int) arg_extent
);
1802 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1803 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
1804 if (arg_extent
!= ret_extent
)
1805 runtime_error ("Incorrect extent in return array in"
1806 " MATMUL intrinsic for dimension 2:"
1807 " is %ld, should be %ld",
1808 (long int) ret_extent
, (long int) arg_extent
);
1813 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
      /* One-dimensional result may be addressed in the code below
         either as a row or a column matrix.  We want both cases to
         work.  */
1818 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1822 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1823 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
1827 if (GFC_DESCRIPTOR_RANK (a
) == 1)
      /* Treat it as a row matrix A[1,count].  */
1830 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1834 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
1838 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1839 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
1841 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
1842 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
1845 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
1847 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
1848 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1851 if (GFC_DESCRIPTOR_RANK (b
) == 1)
1853 /* Treat it as a column matrix B[count,1] */
1854 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
      /* bystride should never be used for 1-dimensional b.
         In case it is, we want it to cause a segfault rather than
         an incorrect result.  */
1859 bystride
= 0xDEADBEEF;
1864 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1865 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
1866 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
1869 abase
= a
->base_addr
;
1870 bbase
= b
->base_addr
;
1871 dest
= retarray
->base_addr
;
  /* Now that everything is set up, we perform the multiplication
     itself.  */
1876 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1877 #define min(a,b) ((a) <= (b) ? (a) : (b))
1878 #define max(a,b) ((a) >= (b) ? (a) : (b))
1880 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
1881 && (bxstride
== 1 || bystride
== 1)
1882 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
1883 > POW3(blas_limit
)))
1885 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
1886 const GFC_REAL_4 one
= 1, zero
= 0;
1887 const int lda
= (axstride
== 1) ? aystride
: axstride
,
1888 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
1890 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
1892 assert (gemm
!= NULL
);
1893 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
1894 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
1900 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
1902 /* This block of code implements a tuned matmul, derived from
1903 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1905 Bo Kagstrom and Per Ling
1906 Department of Computing Science
1908 S-901 87 Umea, Sweden
1910 from netlib.org, translated to C, and modified for matmul.m4. */
1912 const GFC_REAL_4
*a
, *b
;
1914 const index_type m
= xcount
, n
= ycount
, k
= count
;
1916 /* System generated locals */
1917 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
1918 i1
, i2
, i3
, i4
, i5
, i6
;
1920 /* Local variables */
1921 GFC_REAL_4 t1
[65536], /* was [256][256] */
1922 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
1923 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
1924 index_type i
, j
, l
, ii
, jj
, ll
;
1925 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
1929 c
= retarray
->base_addr
;
1931 /* Parameter adjustments */
1933 c_offset
= 1 + c_dim1
;
1936 a_offset
= 1 + a_dim1
;
1939 b_offset
= 1 + b_dim1
;
1942 /* Early exit if possible */
1943 if (m
== 0 || n
== 0 || k
== 0)
1946 /* Empty c first. */
1947 for (j
=1; j
<=n
; j
++)
1948 for (i
=1; i
<=m
; i
++)
1949 c
[i
+ j
* c_dim1
] = (GFC_REAL_4
)0;
1951 /* Start turning the crank. */
1953 for (jj
= 1; jj
<= i1
; jj
+= 512)
1959 ujsec
= jsec
- jsec
% 4;
1961 for (ll
= 1; ll
<= i2
; ll
+= 256)
1967 ulsec
= lsec
- lsec
% 2;
1970 for (ii
= 1; ii
<= i3
; ii
+= 256)
1976 uisec
= isec
- isec
% 2;
1977 i4
= ll
+ ulsec
- 1;
1978 for (l
= ll
; l
<= i4
; l
+= 2)
1980 i5
= ii
+ uisec
- 1;
1981 for (i
= ii
; i
<= i5
; i
+= 2)
1983 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
1985 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
1986 a
[i
+ (l
+ 1) * a_dim1
];
1987 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
1988 a
[i
+ 1 + l
* a_dim1
];
1989 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
1990 a
[i
+ 1 + (l
+ 1) * a_dim1
];
1994 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
1995 a
[ii
+ isec
- 1 + l
* a_dim1
];
1996 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
1997 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
2003 for (i
= ii
; i
<= i4
; ++i
)
2005 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
2006 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
2010 uisec
= isec
- isec
% 4;
2011 i4
= jj
+ ujsec
- 1;
2012 for (j
= jj
; j
<= i4
; j
+= 4)
2014 i5
= ii
+ uisec
- 1;
2015 for (i
= ii
; i
<= i5
; i
+= 4)
2017 f11
= c
[i
+ j
* c_dim1
];
2018 f21
= c
[i
+ 1 + j
* c_dim1
];
2019 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2020 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
2021 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2022 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
2023 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2024 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
2025 f31
= c
[i
+ 2 + j
* c_dim1
];
2026 f41
= c
[i
+ 3 + j
* c_dim1
];
2027 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
2028 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
2029 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
2030 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
2031 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
2032 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
2034 for (l
= ll
; l
<= i6
; ++l
)
2036 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2037 * b
[l
+ j
* b_dim1
];
2038 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2039 * b
[l
+ j
* b_dim1
];
2040 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2041 * b
[l
+ (j
+ 1) * b_dim1
];
2042 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2043 * b
[l
+ (j
+ 1) * b_dim1
];
2044 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2045 * b
[l
+ (j
+ 2) * b_dim1
];
2046 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2047 * b
[l
+ (j
+ 2) * b_dim1
];
2048 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2049 * b
[l
+ (j
+ 3) * b_dim1
];
2050 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2051 * b
[l
+ (j
+ 3) * b_dim1
];
2052 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2053 * b
[l
+ j
* b_dim1
];
2054 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2055 * b
[l
+ j
* b_dim1
];
2056 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2057 * b
[l
+ (j
+ 1) * b_dim1
];
2058 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2059 * b
[l
+ (j
+ 1) * b_dim1
];
2060 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2061 * b
[l
+ (j
+ 2) * b_dim1
];
2062 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2063 * b
[l
+ (j
+ 2) * b_dim1
];
2064 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2065 * b
[l
+ (j
+ 3) * b_dim1
];
2066 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2067 * b
[l
+ (j
+ 3) * b_dim1
];
2069 c
[i
+ j
* c_dim1
] = f11
;
2070 c
[i
+ 1 + j
* c_dim1
] = f21
;
2071 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2072 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
2073 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2074 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
2075 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2076 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
2077 c
[i
+ 2 + j
* c_dim1
] = f31
;
2078 c
[i
+ 3 + j
* c_dim1
] = f41
;
2079 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
2080 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
2081 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
2082 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
2083 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
2084 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
2089 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2091 f11
= c
[i
+ j
* c_dim1
];
2092 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2093 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2094 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2096 for (l
= ll
; l
<= i6
; ++l
)
2098 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2099 257] * b
[l
+ j
* b_dim1
];
2100 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2101 257] * b
[l
+ (j
+ 1) * b_dim1
];
2102 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2103 257] * b
[l
+ (j
+ 2) * b_dim1
];
2104 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2105 257] * b
[l
+ (j
+ 3) * b_dim1
];
2107 c
[i
+ j
* c_dim1
] = f11
;
2108 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2109 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2110 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2117 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
2119 i5
= ii
+ uisec
- 1;
2120 for (i
= ii
; i
<= i5
; i
+= 4)
2122 f11
= c
[i
+ j
* c_dim1
];
2123 f21
= c
[i
+ 1 + j
* c_dim1
];
2124 f31
= c
[i
+ 2 + j
* c_dim1
];
2125 f41
= c
[i
+ 3 + j
* c_dim1
];
2127 for (l
= ll
; l
<= i6
; ++l
)
2129 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2130 257] * b
[l
+ j
* b_dim1
];
2131 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
2132 257] * b
[l
+ j
* b_dim1
];
2133 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
2134 257] * b
[l
+ j
* b_dim1
];
2135 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
2136 257] * b
[l
+ j
* b_dim1
];
2138 c
[i
+ j
* c_dim1
] = f11
;
2139 c
[i
+ 1 + j
* c_dim1
] = f21
;
2140 c
[i
+ 2 + j
* c_dim1
] = f31
;
2141 c
[i
+ 3 + j
* c_dim1
] = f41
;
2144 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2146 f11
= c
[i
+ j
* c_dim1
];
2148 for (l
= ll
; l
<= i6
; ++l
)
2150 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2151 257] * b
[l
+ j
* b_dim1
];
2153 c
[i
+ j
* c_dim1
] = f11
;
  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      if (GFC_DESCRIPTOR_RANK (a) != 1)
        {
          const GFC_REAL_4 *restrict abase_x;
          const GFC_REAL_4 *restrict bbase_y;
          GFC_REAL_4 *restrict dest_y;
          GFC_REAL_4 s;

          for (y = 0; y < ycount; y++)
            {
              bbase_y = &bbase[y*bystride];
              dest_y = &dest[y*rystride];
              for (x = 0; x < xcount; x++)
                {
                  abase_x = &abase[x*axstride];
                  s = (GFC_REAL_4) 0;
                  for (n = 0; n < count; n++)
                    s += abase_x[n] * bbase_y[n];
                  dest_y[x] = s;
                }
            }
        }
      else
        {
          const GFC_REAL_4 *restrict bbase_y;
          GFC_REAL_4 s;

          for (y = 0; y < ycount; y++)
            {
              bbase_y = &bbase[y*bystride];
              s = (GFC_REAL_4) 0;
              for (n = 0; n < count; n++)
                s += abase[n*axstride] * bbase_y[n];
              dest[y*rystride] = s;
            }
        }
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
        for (x = 0; x < xcount; x++)
          dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;

      for (y = 0; y < ycount; y++)
        for (n = 0; n < count; n++)
          for (x = 0; x < xcount; x++)
            /* dest[x,y] += a[x,n] * b[n,y] */
            dest[x*rxstride + y*rystride] +=
                                        abase[x*axstride + n*aystride] *
                                        bbase[n*bxstride + y*bystride];
    }
  else if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      const GFC_REAL_4 *restrict bbase_y;
      GFC_REAL_4 s;

      for (y = 0; y < ycount; y++)
        {
          bbase_y = &bbase[y*bystride];
          s = (GFC_REAL_4) 0;
          for (n = 0; n < count; n++)
            s += abase[n*axstride] * bbase_y[n*bxstride];
          dest[y*rxstride] = s;
        }
    }
  else
    {
      const GFC_REAL_4 *restrict abase_x;
      const GFC_REAL_4 *restrict bbase_y;
      GFC_REAL_4 *restrict dest_y;
      GFC_REAL_4 s;

      for (y = 0; y < ycount; y++)
        {
          bbase_y = &bbase[y*bystride];
          dest_y = &dest[y*rystride];
          for (x = 0; x < xcount; x++)
            {
              abase_x = &abase[x*axstride];
              s = (GFC_REAL_4) 0;
              for (n = 0; n < count; n++)
                s += abase_x[n*aystride] * bbase_y[n*bxstride];
              dest_y[x*rxstride] = s;
            }
        }
    }
}
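
/* The processor-specific variants above and matmul_r4_vanilla share the same
   algorithm body; they differ only in the __attribute__((__target__(...)))
   they are compiled with, so the dispatcher below can simply pick whichever
   routine the running CPU supports.  */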

/* Compiling main function, with selection code for the processor.  */

/* Currently, this is i386 only.  Adjust for other architectures.  */

#include <config/i386/cpuinfo.h>
void matmul_r4 (gfc_array_r4 * const restrict retarray,
        gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
        int blas_limit, blas_call gemm)
{
  static void (*matmul_p) (gfc_array_r4 * const restrict retarray,
        gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
        int blas_limit, blas_call gemm) = NULL;

  if (matmul_p == NULL)
    {
      matmul_p = matmul_r4_vanilla;
      if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
        {
          /* Run down the available processors in order of preference.  */
#ifdef HAVE_AVX512F
          if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
            {
              matmul_p = matmul_r4_avx512f;
              goto tailcall;
            }

#endif  /* HAVE_AVX512F */

#ifdef HAVE_AVX2
          if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
            {
              matmul_p = matmul_r4_avx2;
              goto tailcall;
            }

#endif  /* HAVE_AVX2 */

#ifdef HAVE_AVX
          if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
            {
              matmul_p = matmul_r4_avx;
              goto tailcall;
            }
#endif  /* HAVE_AVX */
        }
    }

tailcall:
  (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
}
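
/* Note: the selection above runs only on the first call; the chosen routine
   is cached in the static matmul_p pointer, so later calls dispatch directly.
   __cpu_model is the processor identification data provided by libgcc's i386
   cpuinfo support included above.  */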

#else  /* Just the vanilla function.  */

void
matmul_r4 (gfc_array_r4 * const restrict retarray,
        gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
        int blas_limit, blas_call gemm)
{
  const GFC_REAL_4 * restrict abase;
  const GFC_REAL_4 * restrict bbase;
  GFC_REAL_4 * restrict dest;

  index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  index_type x, y, n, count, xcount, ycount;

  assert (GFC_DESCRIPTOR_RANK (a) == 2
          || GFC_DESCRIPTOR_RANK (b) == 2);

/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]

   Either A or B (but not both) can be rank 1:

   o One-dimensional argument A is implicitly treated as a row matrix
     dimensioned [1,count], so xcount=1.

   o One-dimensional argument B is implicitly treated as a column matrix
     dimensioned [count, 1], so ycount=1.
*/

  if (retarray->base_addr == NULL)
    {
      if (GFC_DESCRIPTOR_RANK (a) == 1)
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
        }
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
        }
      else
        {
          GFC_DIMENSION_SET(retarray->dim[0], 0,
                            GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);

          GFC_DIMENSION_SET(retarray->dim[1], 0,
                            GFC_DESCRIPTOR_EXTENT(b,1) - 1,
                            GFC_DESCRIPTOR_EXTENT(retarray,0));
        }

      retarray->base_addr
        = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
      retarray->offset = 0;
    }
  else if (unlikely (compile_options.bounds_check))
    {
      index_type ret_extent, arg_extent;

      if (GFC_DESCRIPTOR_RANK (a) == 1)
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Incorrect extent in return array in"
                           " MATMUL intrinsic: is %ld, should be %ld",
                           (long int) ret_extent, (long int) arg_extent);
        }
      else if (GFC_DESCRIPTOR_RANK (b) == 1)
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Incorrect extent in return array in"
                           " MATMUL intrinsic: is %ld, should be %ld",
                           (long int) ret_extent, (long int) arg_extent);
        }
      else
        {
          arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
          if (arg_extent != ret_extent)
            runtime_error ("Incorrect extent in return array in"
                           " MATMUL intrinsic for dimension 1:"
                           " is %ld, should be %ld",
                           (long int) ret_extent, (long int) arg_extent);

          arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
          ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
          if (arg_extent != ret_extent)
            runtime_error ("Incorrect extent in return array in"
                           " MATMUL intrinsic for dimension 2:"
                           " is %ld, should be %ld",
                           (long int) ret_extent, (long int) arg_extent);
        }
    }

  if (GFC_DESCRIPTOR_RANK (retarray) == 1)
    {
      /* One-dimensional result may be addressed in the code below
         either as a row or a column matrix.  We want both cases to
         work.  */
      rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
    }
  else
    {
      rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
      rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
    }

  if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      /* Treat it as a row matrix A[1,count].  */
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = 1;

      xcount = 1;
      count = GFC_DESCRIPTOR_EXTENT(a,0);
    }
  else
    {
      axstride = GFC_DESCRIPTOR_STRIDE(a,0);
      aystride = GFC_DESCRIPTOR_STRIDE(a,1);

      count = GFC_DESCRIPTOR_EXTENT(a,1);
      xcount = GFC_DESCRIPTOR_EXTENT(a,0);
    }

  if (count != GFC_DESCRIPTOR_EXTENT(b,0))
    {
      if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
        runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
    }

  if (GFC_DESCRIPTOR_RANK (b) == 1)
    {
      /* Treat it as a column matrix B[count,1].  */
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);

      /* bystride should never be used for 1-dimensional b.
         In case it is, we want it to cause a segfault rather than
         an incorrect result.  */
      bystride = 0xDEADBEEF;
      ycount = 1;
    }
  else
    {
      bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
      bystride = GFC_DESCRIPTOR_STRIDE(b,1);
      ycount = GFC_DESCRIPTOR_EXTENT(b,1);
    }

  abase = a->base_addr;
  bbase = b->base_addr;
  dest = retarray->base_addr;
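
  /* The strides computed above are in units of array elements.  For a
     contiguous, column-major 10x20 argument A, for example, axstride == 1
     and aystride == 10, while a transposed temporary of the same shape has
     axstride == 20 and aystride == 1.  */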

  /* Now that everything is set up, we perform the multiplication
     itself.  */

#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
#define min(a,b) ((a) <= (b) ? (a) : (b))
#define max(a,b) ((a) >= (b) ? (a) : (b))

  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
    {
      const int m = xcount, n = ycount, k = count, ldc = rystride;
      const GFC_REAL_4 one = 1, zero = 0;
      const int lda = (axstride == 1) ? aystride : axstride,
                ldb = (bxstride == 1) ? bystride : bxstride;

      if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
        {
          assert (gemm != NULL);
          gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
                &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
                &ldc, 1, 1);
          return;
        }
    }
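
  /* For example, with -fexternal-blas and the documented default of 30 for
     -fblas-matmul-limit, the external *GEMM call above is made only when
     xcount * ycount * count exceeds POW3 (30) = 27000; smaller products fall
     through to the inlined algorithm below.  */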

  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      /* This block of code implements a tuned matmul, derived from
         Superscalar GEMM-based level 3 BLAS,  Beta version 0.1

               Bo Kagstrom and Per Ling
               Department of Computing Science
               Umea University
               S-901 87 Umea, Sweden

         from netlib.org, translated to C, and modified for matmul.m4.  */

      const GFC_REAL_4 *a, *b;
      GFC_REAL_4 *c;
      const index_type m = xcount, n = ycount, k = count;

      /* System generated locals */
      index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
                 i1, i2, i3, i4, i5, i6;

      /* Local variables */
      GFC_REAL_4 t1[65536], /* was [256][256] */
                 f11, f12, f21, f22, f31, f32, f41, f42,
                 f13, f14, f23, f24, f33, f34, f43, f44;
      index_type i, j, l, ii, jj, ll;
      index_type isec, jsec, lsec, uisec, ujsec, ulsec;

      a = abase;
      b = bbase;
      c = retarray->base_addr;

      /* Parameter adjustments */
      c_dim1 = rystride;
      c_offset = 1 + c_dim1;
      c -= c_offset;
      a_dim1 = aystride;
      a_offset = 1 + a_dim1;
      a -= a_offset;
      b_dim1 = bystride;
      b_offset = 1 + b_dim1;
      b -= b_offset;

      /* Early exit if possible */
      if (m == 0 || n == 0 || k == 0)
        return;

      /* Empty c first.  */
      for (j=1; j<=n; j++)
        for (i=1; i<=m; i++)
          c[i + j * c_dim1] = (GFC_REAL_4)0;

      /* Start turning the crank.  */
      i1 = n;
      for (jj = 1; jj <= i1; jj += 512)
        {
          /* Computing MIN */
          i2 = n - jj + 1;
          jsec = min(i2,512);
          ujsec = jsec - jsec % 4;
          i2 = k;
          for (ll = 1; ll <= i2; ll += 256)
            {
              /* Computing MIN */
              i3 = k - ll + 1;
              lsec = min(i3,256);
              ulsec = lsec - lsec % 2;
              i3 = m;
              for (ii = 1; ii <= i3; ii += 256)
                {
                  /* Computing MIN */
                  i4 = m - ii + 1;
                  isec = min(i4,256);
                  uisec = isec - isec % 2;
                  i4 = ll + ulsec - 1;
                  for (l = ll; l <= i4; l += 2)
                    {
                      i5 = ii + uisec - 1;
                      for (i = ii; i <= i5; i += 2)
                        {
                          t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
                                        a[i + l * a_dim1];
                          t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
                                        a[i + (l + 1) * a_dim1];
                          t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
                                        a[i + 1 + l * a_dim1];
                          t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
                                        a[i + 1 + (l + 1) * a_dim1];
                        }
                      if (uisec < isec)
                        {
                          t1[l - ll + 1 + (isec << 8) - 257] =
                                        a[ii + isec - 1 + l * a_dim1];
                          t1[l - ll + 2 + (isec << 8) - 257] =
                                        a[ii + isec - 1 + (l + 1) * a_dim1];
                        }
                    }
                  if (ulsec < lsec)
                    {
                      i4 = ii + isec - 1;
                      for (i = ii; i <= i4; ++i)
                        {
                          t1[lsec + ((i - ii + 1) << 8) - 257] =
                                        a[i + (ll + lsec - 1) * a_dim1];
                        }
                    }
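
                  /* Note on the indexing above: the expression
                     l - ll + 1 + ((i - ii + 1) << 8) - 257 simplifies to
                     (l - ll) + (i - ii) * 256, so t1 holds the current block
                     of A with the l (reduction) index contiguous.  The
                     multiplication kernels below can therefore stream
                     through both t1 and b with unit stride.  */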
                  uisec = isec - isec % 4;
                  i4 = jj + ujsec - 1;
                  for (j = jj; j <= i4; j += 4)
                    {
                      i5 = ii + uisec - 1;
                      for (i = ii; i <= i5; i += 4)
                        {
                          f11 = c[i + j * c_dim1];
                          f21 = c[i + 1 + j * c_dim1];
                          f12 = c[i + (j + 1) * c_dim1];
                          f22 = c[i + 1 + (j + 1) * c_dim1];
                          f13 = c[i + (j + 2) * c_dim1];
                          f23 = c[i + 1 + (j + 2) * c_dim1];
                          f14 = c[i + (j + 3) * c_dim1];
                          f24 = c[i + 1 + (j + 3) * c_dim1];
                          f31 = c[i + 2 + j * c_dim1];
                          f41 = c[i + 3 + j * c_dim1];
                          f32 = c[i + 2 + (j + 1) * c_dim1];
                          f42 = c[i + 3 + (j + 1) * c_dim1];
                          f33 = c[i + 2 + (j + 2) * c_dim1];
                          f43 = c[i + 3 + (j + 2) * c_dim1];
                          f34 = c[i + 2 + (j + 3) * c_dim1];
                          f44 = c[i + 3 + (j + 3) * c_dim1];
                          i6 = ll + lsec - 1;
                          for (l = ll; l <= i6; ++l)
                            {
                              f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
                                      * b[l + j * b_dim1];
                              f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
                                      * b[l + j * b_dim1];
                              f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
                                      * b[l + (j + 1) * b_dim1];
                              f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
                                      * b[l + (j + 1) * b_dim1];
                              f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
                                      * b[l + (j + 2) * b_dim1];
                              f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
                                      * b[l + (j + 2) * b_dim1];
                              f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
                                      * b[l + (j + 3) * b_dim1];
                              f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
                                      * b[l + (j + 3) * b_dim1];
                              f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
                                      * b[l + j * b_dim1];
                              f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
                                      * b[l + j * b_dim1];
                              f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
                                      * b[l + (j + 1) * b_dim1];
                              f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
                                      * b[l + (j + 1) * b_dim1];
                              f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
                                      * b[l + (j + 2) * b_dim1];
                              f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
                                      * b[l + (j + 2) * b_dim1];
                              f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
                                      * b[l + (j + 3) * b_dim1];
                              f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
                                      * b[l + (j + 3) * b_dim1];
                            }
                          c[i + j * c_dim1] = f11;
                          c[i + 1 + j * c_dim1] = f21;
                          c[i + (j + 1) * c_dim1] = f12;
                          c[i + 1 + (j + 1) * c_dim1] = f22;
                          c[i + (j + 2) * c_dim1] = f13;
                          c[i + 1 + (j + 2) * c_dim1] = f23;
                          c[i + (j + 3) * c_dim1] = f14;
                          c[i + 1 + (j + 3) * c_dim1] = f24;
                          c[i + 2 + j * c_dim1] = f31;
                          c[i + 3 + j * c_dim1] = f41;
                          c[i + 2 + (j + 1) * c_dim1] = f32;
                          c[i + 3 + (j + 1) * c_dim1] = f42;
                          c[i + 2 + (j + 2) * c_dim1] = f33;
                          c[i + 3 + (j + 2) * c_dim1] = f43;
                          c[i + 2 + (j + 3) * c_dim1] = f34;
                          c[i + 3 + (j + 3) * c_dim1] = f44;
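                          /* The sixteen scalar temporaries above keep a full
                             4x4 block of C live across the whole l loop,
                             giving the compiler plenty of independent
                             multiply-adds to schedule (and, in the AVX
                             variants of this file, to vectorize).  */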
                        }
                      if (uisec < isec)
                        {
                          i5 = ii + isec - 1;
                          for (i = ii + uisec; i <= i5; ++i)
                            {
                              f11 = c[i + j * c_dim1];
                              f12 = c[i + (j + 1) * c_dim1];
                              f13 = c[i + (j + 2) * c_dim1];
                              f14 = c[i + (j + 3) * c_dim1];
                              i6 = ll + lsec - 1;
                              for (l = ll; l <= i6; ++l)
                                {
                                  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                          257] * b[l + j * b_dim1];
                                  f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                          257] * b[l + (j + 1) * b_dim1];
                                  f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                          257] * b[l + (j + 2) * b_dim1];
                                  f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                          257] * b[l + (j + 3) * b_dim1];
                                }
                              c[i + j * c_dim1] = f11;
                              c[i + (j + 1) * c_dim1] = f12;
                              c[i + (j + 2) * c_dim1] = f13;
                              c[i + (j + 3) * c_dim1] = f14;
                            }
                        }
                    }
                  if (ujsec < jsec)
                    {
                      i4 = jj + jsec - 1;
                      for (j = jj + ujsec; j <= i4; ++j)
                        {
                          i5 = ii + uisec - 1;
                          for (i = ii; i <= i5; i += 4)
                            {
                              f11 = c[i + j * c_dim1];
                              f21 = c[i + 1 + j * c_dim1];
                              f31 = c[i + 2 + j * c_dim1];
                              f41 = c[i + 3 + j * c_dim1];
                              i6 = ll + lsec - 1;
                              for (l = ll; l <= i6; ++l)
                                {
                                  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                          257] * b[l + j * b_dim1];
                                  f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
                                          257] * b[l + j * b_dim1];
                                  f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
                                          257] * b[l + j * b_dim1];
                                  f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
                                          257] * b[l + j * b_dim1];
                                }
                              c[i + j * c_dim1] = f11;
                              c[i + 1 + j * c_dim1] = f21;
                              c[i + 2 + j * c_dim1] = f31;
                              c[i + 3 + j * c_dim1] = f41;
                            }
                          i5 = ii + isec - 1;
                          for (i = ii + uisec; i <= i5; ++i)
                            {
                              f11 = c[i + j * c_dim1];
                              i6 = ll + lsec - 1;
                              for (l = ll; l <= i6; ++l)
                                {
                                  f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
                                          257] * b[l + j * b_dim1];
                                }
                              c[i + j * c_dim1] = f11;
                            }
                        }
                    }
                }
            }
        }
      return;
    }
  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      if (GFC_DESCRIPTOR_RANK (a) != 1)
        {
          const GFC_REAL_4 *restrict abase_x;
          const GFC_REAL_4 *restrict bbase_y;
          GFC_REAL_4 *restrict dest_y;
          GFC_REAL_4 s;

          for (y = 0; y < ycount; y++)
            {
              bbase_y = &bbase[y*bystride];
              dest_y = &dest[y*rystride];
              for (x = 0; x < xcount; x++)
                {
                  abase_x = &abase[x*axstride];
                  s = (GFC_REAL_4) 0;
                  for (n = 0; n < count; n++)
                    s += abase_x[n] * bbase_y[n];
                  dest_y[x] = s;
                }
            }
        }
      else
        {
          const GFC_REAL_4 *restrict bbase_y;
          GFC_REAL_4 s;

          for (y = 0; y < ycount; y++)
            {
              bbase_y = &bbase[y*bystride];
              s = (GFC_REAL_4) 0;
              for (n = 0; n < count; n++)
                s += abase[n*axstride] * bbase_y[n];
              dest[y*rystride] = s;
            }
        }
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
        for (x = 0; x < xcount; x++)
          dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;

      for (y = 0; y < ycount; y++)
        for (n = 0; n < count; n++)
          for (x = 0; x < xcount; x++)
            /* dest[x,y] += a[x,n] * b[n,y] */
            dest[x*rxstride + y*rystride] +=
                                        abase[x*axstride + n*aystride] *
                                        bbase[n*bxstride + y*bystride];
    }
  else if (GFC_DESCRIPTOR_RANK (a) == 1)
    {
      const GFC_REAL_4 *restrict bbase_y;
      GFC_REAL_4 s;

      for (y = 0; y < ycount; y++)
        {
          bbase_y = &bbase[y*bystride];
          s = (GFC_REAL_4) 0;
          for (n = 0; n < count; n++)
            s += abase[n*axstride] * bbase_y[n*bxstride];
          dest[y*rxstride] = s;
        }
    }
  else
    {
      const GFC_REAL_4 *restrict abase_x;
      const GFC_REAL_4 *restrict bbase_y;
      GFC_REAL_4 *restrict dest_y;
      GFC_REAL_4 s;

      for (y = 0; y < ycount; y++)
        {
          bbase_y = &bbase[y*bystride];
          dest_y = &dest[y*rystride];
          for (x = 0; x < xcount; x++)
            {
              abase_x = &abase[x*axstride];
              s = (GFC_REAL_4) 0;
              for (n = 0; n < count; n++)
                s += abase_x[n*aystride] * bbase_y[n*bxstride];
              dest_y[x*rxstride] = s;
            }
        }
    }
}

#endif  /* HAVE_AVX || HAVE_AVX2 || HAVE_AVX512F */
#endif  /* HAVE_GFC_REAL_4 */