libgfortran/generated/matmul_r4.c

   1 /* Implementation of the MATMUL intrinsic
   2    Copyright (C) 2002-2016 Free Software Foundation, Inc.
   3    Contributed by Paul Brook <paul@nowt.org>
   4
   5 This file is part of the GNU Fortran runtime library (libgfortran).
   6
   7 Libgfortran is free software; you can redistribute it and/or
   8 modify it under the terms of the GNU General Public
   9 License as published by the Free Software Foundation; either
  10 version 3 of the License, or (at your option) any later version.
  11
  12 Libgfortran is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 Under Section 7 of GPL version 3, you are granted additional
  18 permissions described in the GCC Runtime Library Exception, version
  19 3.1, as published by the Free Software Foundation.
  20
  21 You should have received a copy of the GNU General Public License and
  22 a copy of the GCC Runtime Library Exception along with this program;
  23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24 <http://www.gnu.org/licenses/>.  */
  25
  26 #include "libgfortran.h"
  27 #include <stdlib.h>
  28 #include <string.h>
  29 #include <assert.h>
  30
  31
  32 #if defined (HAVE_GFC_REAL_4)
  33
  34 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
  35    passed to us by the front-end, in which case we call it for large
  36    matrices.  */
  37
  38 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
  39                           const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
  40                           const int *, const GFC_REAL_4 *, const int *,
  41                           const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
  42                           int, int);
  43
  44 /* The order of loops is different in the case of plain matrix
  45    multiplication C=MATMUL(A,B), and in the frequent special case where
  46    the argument A is the temporary result of a TRANSPOSE intrinsic:
  47    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
  48    looking at their strides.
  49
  50    The equivalent Fortran pseudo-code is:
  51
  52    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
  53    IF (.NOT.IS_TRANSPOSED(A)) THEN
  54      C = 0
  55      DO J=1,N
  56        DO K=1,COUNT
  57          DO I=1,M
  58            C(I,J) = C(I,J)+A(I,K)*B(K,J)
  59    ELSE
  60      DO J=1,N
  61        DO I=1,M
  62          S = 0
  63          DO K=1,COUNT
  64            S = S+A(I,K)*B(K,J)
  65          C(I,J) = S
  66    ENDIF
  67 */
  68
  69 /* If try_blas is set to a nonzero value, then the matmul function will
  70    see if there is a way to perform the matrix multiplication by a call
  71    to the BLAS gemm function.  */
  72
  73 extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
  74         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
  75         int blas_limit, blas_call gemm);
  76 export_proto(matmul_r4);
  77
  78 #if defined(HAVE_AVX) && defined(HAVE_AVX2)
  79 /* REAL types generate identical code for AVX and AVX2.  Only generate
  80    an AVX2 function if we are dealing with integer.  */
  81 #undef HAVE_AVX2
  82 #endif
  83
  84
   85 /* Put exhaustive list of possible architectures here, ORed together.  */
  86
  87 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
  88
  89 #ifdef HAVE_AVX
  90 static void
  91 matmul_r4_avx (gfc_array_r4 * const restrict retarray,
  92         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
  93         int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
  94 static void
  95 matmul_r4_avx (gfc_array_r4 * const restrict retarray,
  96         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
  97         int blas_limit, blas_call gemm)
  98 {
  99   const GFC_REAL_4 * restrict abase;
 100   const GFC_REAL_4 * restrict bbase;
 101   GFC_REAL_4 * restrict dest;
 102
 103   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
 104   index_type x, y, n, count, xcount, ycount;
 105
 106   assert (GFC_DESCRIPTOR_RANK (a) == 2
 107           || GFC_DESCRIPTOR_RANK (b) == 2);
 108
 109 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
 110
 111    Either A or B (but not both) can be rank 1:
 112
 113    o One-dimensional argument A is implicitly treated as a row matrix
 114      dimensioned [1,count], so xcount=1.
 115
 116    o One-dimensional argument B is implicitly treated as a column matrix
 117      dimensioned [count, 1], so ycount=1.
 118 */
 119
 120   if (retarray->base_addr == NULL)
 121     {
 122       if (GFC_DESCRIPTOR_RANK (a) == 1)
 123         {
 124           GFC_DIMENSION_SET(retarray->dim[0], 0,
 125                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 126         }
 127       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 128         {
 129           GFC_DIMENSION_SET(retarray->dim[0], 0,
 130                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 131         }
 132       else
 133         {
 134           GFC_DIMENSION_SET(retarray->dim[0], 0,
 135                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 136
 137           GFC_DIMENSION_SET(retarray->dim[1], 0,
 138                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 139                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 140         }
 141
 142       retarray->base_addr
 143         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
 144       retarray->offset = 0;
 145     }
 146   else if (unlikely (compile_options.bounds_check))
 147     {
 148       index_type ret_extent, arg_extent;
 149
 150       if (GFC_DESCRIPTOR_RANK (a) == 1)
 151         {
 152           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 153           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 154           if (arg_extent != ret_extent)
 155             runtime_error ("Incorrect extent in return array in"
 156                            " MATMUL intrinsic: is %ld, should be %ld",
 157                            (long int) ret_extent, (long int) arg_extent);
 158         }
 159       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 160         {
 161           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 162           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 163           if (arg_extent != ret_extent)
 164             runtime_error ("Incorrect extent in return array in"
 165                            " MATMUL intrinsic: is %ld, should be %ld",
 166                            (long int) ret_extent, (long int) arg_extent);
 167         }
 168       else
 169         {
 170           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 171           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 172           if (arg_extent != ret_extent)
 173             runtime_error ("Incorrect extent in return array in"
 174                            " MATMUL intrinsic for dimension 1:"
 175                            " is %ld, should be %ld",
 176                            (long int) ret_extent, (long int) arg_extent);
 177
 178           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 179           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 180           if (arg_extent != ret_extent)
 181             runtime_error ("Incorrect extent in return array in"
 182                            " MATMUL intrinsic for dimension 2:"
 183                            " is %ld, should be %ld",
 184                            (long int) ret_extent, (long int) arg_extent);
 185         }
 186     }
 187
 188
 189   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 190     {
 191       /* One-dimensional result may be addressed in the code below
 192          either as a row or a column matrix. We want both cases to
 193          work. */
 194       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 195     }
 196   else
 197     {
 198       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 199       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 200     }
 201
 202
 203   if (GFC_DESCRIPTOR_RANK (a) == 1)
 204     {
 205       /* Treat it as a a row matrix A[1,count]. */
 206       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 207       aystride = 1;
 208
 209       xcount = 1;
 210       count = GFC_DESCRIPTOR_EXTENT(a,0);
 211     }
 212   else
 213     {
 214       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 215       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 216
 217       count = GFC_DESCRIPTOR_EXTENT(a,1);
 218       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 219     }
 220
 221   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 222     {
 223       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 224         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
 225     }
 226
 227   if (GFC_DESCRIPTOR_RANK (b) == 1)
 228     {
 229       /* Treat it as a column matrix B[count,1] */
 230       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 231
 232       /* bystride should never be used for 1-dimensional b.
 233          in case it is we want it to cause a segfault, rather than
 234          an incorrect result. */
 235       bystride = 0xDEADBEEF;
 236       ycount = 1;
 237     }
 238   else
 239     {
 240       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 241       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 242       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 243     }
 244
 245   abase = a->base_addr;
 246   bbase = b->base_addr;
 247   dest = retarray->base_addr;
 248
 249   /* Now that everything is set up, we perform the multiplication
 250      itself.  */
 251
 252 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 253 #define min(a,b) ((a) <= (b) ? (a) : (b))
 254 #define max(a,b) ((a) >= (b) ? (a) : (b))
 255
 256   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 257       && (bxstride == 1 || bystride == 1)
 258       && (((float) xcount) * ((float) ycount) * ((float) count)
 259           > POW3(blas_limit)))
 260     {
 261       const int m = xcount, n = ycount, k = count, ldc = rystride;
 262       const GFC_REAL_4 one = 1, zero = 0;
 263       const int lda = (axstride == 1) ? aystride : axstride,
 264                 ldb = (bxstride == 1) ? bystride : bxstride;
 265
 266       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 267         {
 268           assert (gemm != NULL);
 269           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
 270                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 271                 &ldc, 1, 1);
 272           return;
 273         }
 274     }
 275
 276   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 277     {
 278       /* This block of code implements a tuned matmul, derived from
 279          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 280
 281                Bo Kagstrom and Per Ling
 282                Department of Computing Science
 283                Umea University
 284                S-901 87 Umea, Sweden
 285
 286          from netlib.org, translated to C, and modified for matmul.m4.  */
 287
 288       const GFC_REAL_4 *a, *b;
 289       GFC_REAL_4 *c;
 290       const index_type m = xcount, n = ycount, k = count;
 291
 292       /* System generated locals */
 293       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 294                  i1, i2, i3, i4, i5, i6;
 295
 296       /* Local variables */
 297       GFC_REAL_4 t1[65536], /* was [256][256] */
 298                  f11, f12, f21, f22, f31, f32, f41, f42,
 299                  f13, f14, f23, f24, f33, f34, f43, f44;
 300       index_type i, j, l, ii, jj, ll;
 301       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 302
 303       a = abase;
 304       b = bbase;
 305       c = retarray->base_addr;
 306
 307       /* Parameter adjustments */
 308       c_dim1 = rystride;
 309       c_offset = 1 + c_dim1;
 310       c -= c_offset;
 311       a_dim1 = aystride;
 312       a_offset = 1 + a_dim1;
 313       a -= a_offset;
 314       b_dim1 = bystride;
 315       b_offset = 1 + b_dim1;
 316       b -= b_offset;
 317
 318       /* Early exit if possible */
 319       if (m == 0 || n == 0 || k == 0)
 320         return;
 321
 322       /* Empty c first.  */
 323       for (j=1; j<=n; j++)
 324         for (i=1; i<=m; i++)
 325           c[i + j * c_dim1] = (GFC_REAL_4)0;
 326
 327       /* Start turning the crank. */
 328       i1 = n;
 329       for (jj = 1; jj <= i1; jj += 512)
 330         {
 331           /* Computing MIN */
 332           i2 = 512;
 333           i3 = n - jj + 1;
 334           jsec = min(i2,i3);
 335           ujsec = jsec - jsec % 4;
 336           i2 = k;
 337           for (ll = 1; ll <= i2; ll += 256)
 338             {
 339               /* Computing MIN */
 340               i3 = 256;
 341               i4 = k - ll + 1;
 342               lsec = min(i3,i4);
 343               ulsec = lsec - lsec % 2;
 344
 345               i3 = m;
 346               for (ii = 1; ii <= i3; ii += 256)
 347                 {
 348                   /* Computing MIN */
 349                   i4 = 256;
 350                   i5 = m - ii + 1;
 351                   isec = min(i4,i5);
 352                   uisec = isec - isec % 2;
 353                   i4 = ll + ulsec - 1;
 354                   for (l = ll; l <= i4; l += 2)
 355                     {
 356                       i5 = ii + uisec - 1;
 357                       for (i = ii; i <= i5; i += 2)
 358                         {
 359                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 360                                         a[i + l * a_dim1];
 361                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 362                                         a[i + (l + 1) * a_dim1];
 363                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 364                                         a[i + 1 + l * a_dim1];
 365                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 366                                         a[i + 1 + (l + 1) * a_dim1];
 367                         }
 368                       if (uisec < isec)
 369                         {
 370                           t1[l - ll + 1 + (isec << 8) - 257] =
 371                                     a[ii + isec - 1 + l * a_dim1];
 372                           t1[l - ll + 2 + (isec << 8) - 257] =
 373                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 374                         }
 375                     }
 376                   if (ulsec < lsec)
 377                     {
 378                       i4 = ii + isec - 1;
 379                       for (i = ii; i<= i4; ++i)
 380                         {
 381                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 382                                     a[i + (ll + lsec - 1) * a_dim1];
 383                         }
 384                     }
 385
 386                   uisec = isec - isec % 4;
 387                   i4 = jj + ujsec - 1;
 388                   for (j = jj; j <= i4; j += 4)
 389                     {
 390                       i5 = ii + uisec - 1;
 391                       for (i = ii; i <= i5; i += 4)
 392                         {
 393                           f11 = c[i + j * c_dim1];
 394                           f21 = c[i + 1 + j * c_dim1];
 395                           f12 = c[i + (j + 1) * c_dim1];
 396                           f22 = c[i + 1 + (j + 1) * c_dim1];
 397                           f13 = c[i + (j + 2) * c_dim1];
 398                           f23 = c[i + 1 + (j + 2) * c_dim1];
 399                           f14 = c[i + (j + 3) * c_dim1];
 400                           f24 = c[i + 1 + (j + 3) * c_dim1];
 401                           f31 = c[i + 2 + j * c_dim1];
 402                           f41 = c[i + 3 + j * c_dim1];
 403                           f32 = c[i + 2 + (j + 1) * c_dim1];
 404                           f42 = c[i + 3 + (j + 1) * c_dim1];
 405                           f33 = c[i + 2 + (j + 2) * c_dim1];
 406                           f43 = c[i + 3 + (j + 2) * c_dim1];
 407                           f34 = c[i + 2 + (j + 3) * c_dim1];
 408                           f44 = c[i + 3 + (j + 3) * c_dim1];
 409                           i6 = ll + lsec - 1;
 410                           for (l = ll; l <= i6; ++l)
 411                             {
 412                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 413                                       * b[l + j * b_dim1];
 414                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 415                                       * b[l + j * b_dim1];
 416                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 417                                       * b[l + (j + 1) * b_dim1];
 418                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 419                                       * b[l + (j + 1) * b_dim1];
 420                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 421                                       * b[l + (j + 2) * b_dim1];
 422                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 423                                       * b[l + (j + 2) * b_dim1];
 424                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 425                                       * b[l + (j + 3) * b_dim1];
 426                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 427                                       * b[l + (j + 3) * b_dim1];
 428                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 429                                       * b[l + j * b_dim1];
 430                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 431                                       * b[l + j * b_dim1];
 432                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 433                                       * b[l + (j + 1) * b_dim1];
 434                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 435                                       * b[l + (j + 1) * b_dim1];
 436                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 437                                       * b[l + (j + 2) * b_dim1];
 438                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 439                                       * b[l + (j + 2) * b_dim1];
 440                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 441                                       * b[l + (j + 3) * b_dim1];
 442                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 443                                       * b[l + (j + 3) * b_dim1];
 444                             }
 445                           c[i + j * c_dim1] = f11;
 446                           c[i + 1 + j * c_dim1] = f21;
 447                           c[i + (j + 1) * c_dim1] = f12;
 448                           c[i + 1 + (j + 1) * c_dim1] = f22;
 449                           c[i + (j + 2) * c_dim1] = f13;
 450                           c[i + 1 + (j + 2) * c_dim1] = f23;
 451                           c[i + (j + 3) * c_dim1] = f14;
 452                           c[i + 1 + (j + 3) * c_dim1] = f24;
 453                           c[i + 2 + j * c_dim1] = f31;
 454                           c[i + 3 + j * c_dim1] = f41;
 455                           c[i + 2 + (j + 1) * c_dim1] = f32;
 456                           c[i + 3 + (j + 1) * c_dim1] = f42;
 457                           c[i + 2 + (j + 2) * c_dim1] = f33;
 458                           c[i + 3 + (j + 2) * c_dim1] = f43;
 459                           c[i + 2 + (j + 3) * c_dim1] = f34;
 460                           c[i + 3 + (j + 3) * c_dim1] = f44;
 461                         }
 462                       if (uisec < isec)
 463                         {
 464                           i5 = ii + isec - 1;
 465                           for (i = ii + uisec; i <= i5; ++i)
 466                             {
 467                               f11 = c[i + j * c_dim1];
 468                               f12 = c[i + (j + 1) * c_dim1];
 469                               f13 = c[i + (j + 2) * c_dim1];
 470                               f14 = c[i + (j + 3) * c_dim1];
 471                               i6 = ll + lsec - 1;
 472                               for (l = ll; l <= i6; ++l)
 473                                 {
 474                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 475                                           257] * b[l + j * b_dim1];
 476                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 477                                           257] * b[l + (j + 1) * b_dim1];
 478                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 479                                           257] * b[l + (j + 2) * b_dim1];
 480                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 481                                           257] * b[l + (j + 3) * b_dim1];
 482                                 }
 483                               c[i + j * c_dim1] = f11;
 484                               c[i + (j + 1) * c_dim1] = f12;
 485                               c[i + (j + 2) * c_dim1] = f13;
 486                               c[i + (j + 3) * c_dim1] = f14;
 487                             }
 488                         }
 489                     }
 490                   if (ujsec < jsec)
 491                     {
 492                       i4 = jj + jsec - 1;
 493                       for (j = jj + ujsec; j <= i4; ++j)
 494                         {
 495                           i5 = ii + uisec - 1;
 496                           for (i = ii; i <= i5; i += 4)
 497                             {
 498                               f11 = c[i + j * c_dim1];
 499                               f21 = c[i + 1 + j * c_dim1];
 500                               f31 = c[i + 2 + j * c_dim1];
 501                               f41 = c[i + 3 + j * c_dim1];
 502                               i6 = ll + lsec - 1;
 503                               for (l = ll; l <= i6; ++l)
 504                                 {
 505                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 506                                           257] * b[l + j * b_dim1];
 507                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
 508                                           257] * b[l + j * b_dim1];
 509                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
 510                                           257] * b[l + j * b_dim1];
 511                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
 512                                           257] * b[l + j * b_dim1];
 513                                 }
 514                               c[i + j * c_dim1] = f11;
 515                               c[i + 1 + j * c_dim1] = f21;
 516                               c[i + 2 + j * c_dim1] = f31;
 517                               c[i + 3 + j * c_dim1] = f41;
 518                             }
 519                           i5 = ii + isec - 1;
 520                           for (i = ii + uisec; i <= i5; ++i)
 521                             {
 522                               f11 = c[i + j * c_dim1];
 523                               i6 = ll + lsec - 1;
 524                               for (l = ll; l <= i6; ++l)
 525                                 {
 526                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 527                                           257] * b[l + j * b_dim1];
 528                                 }
 529                               c[i + j * c_dim1] = f11;
 530                             }
 531                         }
 532                     }
 533                 }
 534             }
 535         }
 536       return;
 537     }
 538   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
 539     {
 540       if (GFC_DESCRIPTOR_RANK (a) != 1)
 541         {
 542           const GFC_REAL_4 *restrict abase_x;
 543           const GFC_REAL_4 *restrict bbase_y;
 544           GFC_REAL_4 *restrict dest_y;
 545           GFC_REAL_4 s;
 546
 547           for (y = 0; y < ycount; y++)
 548             {
 549               bbase_y = &bbase[y*bystride];
 550               dest_y = &dest[y*rystride];
 551               for (x = 0; x < xcount; x++)
 552                 {
 553                   abase_x = &abase[x*axstride];
 554                   s = (GFC_REAL_4) 0;
 555                   for (n = 0; n < count; n++)
 556                     s += abase_x[n] * bbase_y[n];
 557                   dest_y[x] = s;
 558                 }
 559             }
 560         }
 561       else
 562         {
 563           const GFC_REAL_4 *restrict bbase_y;
 564           GFC_REAL_4 s;
 565
 566           for (y = 0; y < ycount; y++)
 567             {
 568               bbase_y = &bbase[y*bystride];
 569               s = (GFC_REAL_4) 0;
 570               for (n = 0; n < count; n++)
 571                 s += abase[n*axstride] * bbase_y[n];
 572               dest[y*rystride] = s;
 573             }
 574         }
 575     }
 576   else if (axstride < aystride)
 577     {
 578       for (y = 0; y < ycount; y++)
 579         for (x = 0; x < xcount; x++)
 580           dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
 581
 582       for (y = 0; y < ycount; y++)
 583         for (n = 0; n < count; n++)
 584           for (x = 0; x < xcount; x++)
 585             /* dest[x,y] += a[x,n] * b[n,y] */
 586             dest[x*rxstride + y*rystride] +=
 587                                         abase[x*axstride + n*aystride] *
 588                                         bbase[n*bxstride + y*bystride];
 589     }
 590   else if (GFC_DESCRIPTOR_RANK (a) == 1)
 591     {
 592       const GFC_REAL_4 *restrict bbase_y;
 593       GFC_REAL_4 s;
 594
 595       for (y = 0; y < ycount; y++)
 596         {
 597           bbase_y = &bbase[y*bystride];
 598           s = (GFC_REAL_4) 0;
 599           for (n = 0; n < count; n++)
 600             s += abase[n*axstride] * bbase_y[n*bxstride];
 601           dest[y*rxstride] = s;
 602         }
 603     }
 604   else
 605     {
 606       const GFC_REAL_4 *restrict abase_x;
 607       const GFC_REAL_4 *restrict bbase_y;
 608       GFC_REAL_4 *restrict dest_y;
 609       GFC_REAL_4 s;
 610
 611       for (y = 0; y < ycount; y++)
 612         {
 613           bbase_y = &bbase[y*bystride];
 614           dest_y = &dest[y*rystride];
 615           for (x = 0; x < xcount; x++)
 616             {
 617               abase_x = &abase[x*axstride];
 618               s = (GFC_REAL_4) 0;
 619               for (n = 0; n < count; n++)
 620                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
 621               dest_y[x*rxstride] = s;
 622             }
 623         }
 624     }
 625 }
 626 #undef POW3
 627 #undef min
 628 #undef max
 629
 630 #endif /* HAVE_AVX */
 631
 632 #ifdef HAVE_AVX2
 633 static void
 634 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
 635         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
 636         int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
 637 static void
 638 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
 639         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
 640         int blas_limit, blas_call gemm)
 641 {
 642   const GFC_REAL_4 * restrict abase;
 643   const GFC_REAL_4 * restrict bbase;
 644   GFC_REAL_4 * restrict dest;
 645
 646   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
 647   index_type x, y, n, count, xcount, ycount;
 648
 649   assert (GFC_DESCRIPTOR_RANK (a) == 2
 650           || GFC_DESCRIPTOR_RANK (b) == 2);
 651
 652 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
 653
 654    Either A or B (but not both) can be rank 1:
 655
 656    o One-dimensional argument A is implicitly treated as a row matrix
 657      dimensioned [1,count], so xcount=1.
 658
 659    o One-dimensional argument B is implicitly treated as a column matrix
 660      dimensioned [count, 1], so ycount=1.
 661 */
 662
 663   if (retarray->base_addr == NULL)
 664     {
 665       if (GFC_DESCRIPTOR_RANK (a) == 1)
 666         {
 667           GFC_DIMENSION_SET(retarray->dim[0], 0,
 668                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 669         }
 670       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 671         {
 672           GFC_DIMENSION_SET(retarray->dim[0], 0,
 673                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 674         }
 675       else
 676         {
 677           GFC_DIMENSION_SET(retarray->dim[0], 0,
 678                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 679
 680           GFC_DIMENSION_SET(retarray->dim[1], 0,
 681                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 682                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 683         }
 684
 685       retarray->base_addr
 686         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
 687       retarray->offset = 0;
 688     }
 689   else if (unlikely (compile_options.bounds_check))
 690     {
 691       index_type ret_extent, arg_extent;
 692
 693       if (GFC_DESCRIPTOR_RANK (a) == 1)
 694         {
 695           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 696           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 697           if (arg_extent != ret_extent)
 698             runtime_error ("Incorrect extent in return array in"
 699                            " MATMUL intrinsic: is %ld, should be %ld",
 700                            (long int) ret_extent, (long int) arg_extent);
 701         }
 702       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 703         {
 704           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 705           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 706           if (arg_extent != ret_extent)
 707             runtime_error ("Incorrect extent in return array in"
 708                            " MATMUL intrinsic: is %ld, should be %ld",
 709                            (long int) ret_extent, (long int) arg_extent);
 710         }
 711       else
 712         {
 713           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 714           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 715           if (arg_extent != ret_extent)
 716             runtime_error ("Incorrect extent in return array in"
 717                            " MATMUL intrinsic for dimension 1:"
 718                            " is %ld, should be %ld",
 719                            (long int) ret_extent, (long int) arg_extent);
 720
 721           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 722           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 723           if (arg_extent != ret_extent)
 724             runtime_error ("Incorrect extent in return array in"
 725                            " MATMUL intrinsic for dimension 2:"
 726                            " is %ld, should be %ld",
 727                            (long int) ret_extent, (long int) arg_extent);
 728         }
 729     }
 730
 731
 732   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 733     {
 734       /* One-dimensional result may be addressed in the code below
 735          either as a row or a column matrix. We want both cases to
 736          work. */
 737       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 738     }
 739   else
 740     {
 741       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 742       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 743     }
 744
 745
 746   if (GFC_DESCRIPTOR_RANK (a) == 1)
 747     {
 748       /* Treat it as a row matrix A[1,count]. */
 749       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 750       aystride = 1;
 751
 752       xcount = 1;
 753       count = GFC_DESCRIPTOR_EXTENT(a,0);
 754     }
 755   else
 756     {
 757       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 758       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 759
 760       count = GFC_DESCRIPTOR_EXTENT(a,1);
 761       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 762     }
 763
 764   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 765     {
 766       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 767         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
 768     }
 769
 770   if (GFC_DESCRIPTOR_RANK (b) == 1)
 771     {
 772       /* Treat it as a column matrix B[count,1] */
 773       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 774
 775       /* bystride should never be used for 1-dimensional b.
 776          In case it is, we want it to cause a segfault rather than
 777          an incorrect result. */
 778       bystride = 0xDEADBEEF;
 779       ycount = 1;
 780     }
 781   else
 782     {
 783       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 784       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 785       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 786     }
 787
 788   abase = a->base_addr;
 789   bbase = b->base_addr;
 790   dest = retarray->base_addr;
 791
 792   /* Now that everything is set up, we perform the multiplication
 793      itself.  */
 794
 795 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 796 #define min(a,b) ((a) <= (b) ? (a) : (b))
 797 #define max(a,b) ((a) >= (b) ? (a) : (b))
 798
 799   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 800       && (bxstride == 1 || bystride == 1)
 801       && (((float) xcount) * ((float) ycount) * ((float) count)
 802           > POW3(blas_limit)))
 803     {
 804       const int m = xcount, n = ycount, k = count, ldc = rystride;
 805       const GFC_REAL_4 one = 1, zero = 0;
 806       const int lda = (axstride == 1) ? aystride : axstride,
 807                 ldb = (bxstride == 1) ? bystride : bxstride;
 808
 809       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 810         {
 811           assert (gemm != NULL);
 812           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
 813                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 814                 &ldc, 1, 1);
 815           return;
 816         }
 817     }
 818
 819   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 820     {
 821       /* This block of code implements a tuned matmul, derived from
 822          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 823
 824                Bo Kagstrom and Per Ling
 825                Department of Computing Science
 826                Umea University
 827                S-901 87 Umea, Sweden
 828
 829          from netlib.org, translated to C, and modified for matmul.m4.  */
 830
 831       const GFC_REAL_4 *a, *b;
 832       GFC_REAL_4 *c;
 833       const index_type m = xcount, n = ycount, k = count;
 834
 835       /* System generated locals */
 836       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 837                  i1, i2, i3, i4, i5, i6;
 838
 839       /* Local variables */
 840       GFC_REAL_4 t1[65536], /* was [256][256] */
 841                  f11, f12, f21, f22, f31, f32, f41, f42,
 842                  f13, f14, f23, f24, f33, f34, f43, f44;
 843       index_type i, j, l, ii, jj, ll;
 844       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 845
 846       a = abase;
 847       b = bbase;
 848       c = retarray->base_addr;
 849
 850       /* Parameter adjustments */
 851       c_dim1 = rystride;
 852       c_offset = 1 + c_dim1;
 853       c -= c_offset;
 854       a_dim1 = aystride;
 855       a_offset = 1 + a_dim1;
 856       a -= a_offset;
 857       b_dim1 = bystride;
 858       b_offset = 1 + b_dim1;
 859       b -= b_offset;
 860
 861       /* Early exit if possible */
 862       if (m == 0 || n == 0 || k == 0)
 863         return;
 864
 865       /* Zero out c first.  */
 866       for (j=1; j<=n; j++)
 867         for (i=1; i<=m; i++)
 868           c[i + j * c_dim1] = (GFC_REAL_4)0;
 869
 870       /* Start turning the crank. */
 871       i1 = n;
 872       for (jj = 1; jj <= i1; jj += 512)
 873         {
 874           /* Computing MIN */
 875           i2 = 512;
 876           i3 = n - jj + 1;
 877           jsec = min(i2,i3);
 878           ujsec = jsec - jsec % 4;
 879           i2 = k;
 880           for (ll = 1; ll <= i2; ll += 256)
 881             {
 882               /* Computing MIN */
 883               i3 = 256;
 884               i4 = k - ll + 1;
 885               lsec = min(i3,i4);
 886               ulsec = lsec - lsec % 2;
 887
 888               i3 = m;
 889               for (ii = 1; ii <= i3; ii += 256)
 890                 {
 891                   /* Computing MIN */
 892                   i4 = 256;
 893                   i5 = m - ii + 1;
 894                   isec = min(i4,i5);
 895                   uisec = isec - isec % 2;
 896                   i4 = ll + ulsec - 1;
 897                   for (l = ll; l <= i4; l += 2)
 898                     {
 899                       i5 = ii + uisec - 1;
 900                       for (i = ii; i <= i5; i += 2)
 901                         {
 902                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 903                                         a[i + l * a_dim1];
 904                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 905                                         a[i + (l + 1) * a_dim1];
 906                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 907                                         a[i + 1 + l * a_dim1];
 908                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 909                                         a[i + 1 + (l + 1) * a_dim1];
 910                         }
 911                       if (uisec < isec)
 912                         {
 913                           t1[l - ll + 1 + (isec << 8) - 257] =
 914                                     a[ii + isec - 1 + l * a_dim1];
 915                           t1[l - ll + 2 + (isec << 8) - 257] =
 916                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 917                         }
 918                     }
 919                   if (ulsec < lsec)
 920                     {
 921                       i4 = ii + isec - 1;
 922                       for (i = ii; i<= i4; ++i)
 923                         {
 924                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 925                                     a[i + (ll + lsec - 1) * a_dim1];
 926                         }
 927                     }
 928
 929                   uisec = isec - isec % 4;
 930                   i4 = jj + ujsec - 1;
 931                   for (j = jj; j <= i4; j += 4)
 932                     {
 933                       i5 = ii + uisec - 1;
 934                       for (i = ii; i <= i5; i += 4)
 935                         {
 936                           f11 = c[i + j * c_dim1];
 937                           f21 = c[i + 1 + j * c_dim1];
 938                           f12 = c[i + (j + 1) * c_dim1];
 939                           f22 = c[i + 1 + (j + 1) * c_dim1];
 940                           f13 = c[i + (j + 2) * c_dim1];
 941                           f23 = c[i + 1 + (j + 2) * c_dim1];
 942                           f14 = c[i + (j + 3) * c_dim1];
 943                           f24 = c[i + 1 + (j + 3) * c_dim1];
 944                           f31 = c[i + 2 + j * c_dim1];
 945                           f41 = c[i + 3 + j * c_dim1];
 946                           f32 = c[i + 2 + (j + 1) * c_dim1];
 947                           f42 = c[i + 3 + (j + 1) * c_dim1];
 948                           f33 = c[i + 2 + (j + 2) * c_dim1];
 949                           f43 = c[i + 3 + (j + 2) * c_dim1];
 950                           f34 = c[i + 2 + (j + 3) * c_dim1];
 951                           f44 = c[i + 3 + (j + 3) * c_dim1];
 952                           i6 = ll + lsec - 1;
 953                           for (l = ll; l <= i6; ++l)
 954                             {
 955                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 956                                       * b[l + j * b_dim1];
 957                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 958                                       * b[l + j * b_dim1];
 959                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 960                                       * b[l + (j + 1) * b_dim1];
 961                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 962                                       * b[l + (j + 1) * b_dim1];
 963                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 964                                       * b[l + (j + 2) * b_dim1];
 965                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 966                                       * b[l + (j + 2) * b_dim1];
 967                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 968                                       * b[l + (j + 3) * b_dim1];
 969                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 970                                       * b[l + (j + 3) * b_dim1];
 971                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 972                                       * b[l + j * b_dim1];
 973                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 974                                       * b[l + j * b_dim1];
 975                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 976                                       * b[l + (j + 1) * b_dim1];
 977                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 978                                       * b[l + (j + 1) * b_dim1];
 979                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 980                                       * b[l + (j + 2) * b_dim1];
 981                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 982                                       * b[l + (j + 2) * b_dim1];
 983                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 984                                       * b[l + (j + 3) * b_dim1];
 985                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 986                                       * b[l + (j + 3) * b_dim1];
 987                             }
 988                           c[i + j * c_dim1] = f11;
 989                           c[i + 1 + j * c_dim1] = f21;
 990                           c[i + (j + 1) * c_dim1] = f12;
 991                           c[i + 1 + (j + 1) * c_dim1] = f22;
 992                           c[i + (j + 2) * c_dim1] = f13;
 993                           c[i + 1 + (j + 2) * c_dim1] = f23;
 994                           c[i + (j + 3) * c_dim1] = f14;
 995                           c[i + 1 + (j + 3) * c_dim1] = f24;
 996                           c[i + 2 + j * c_dim1] = f31;
 997                           c[i + 3 + j * c_dim1] = f41;
 998                           c[i + 2 + (j + 1) * c_dim1] = f32;
 999                           c[i + 3 + (j + 1) * c_dim1] = f42;
1000                           c[i + 2 + (j + 2) * c_dim1] = f33;
1001                           c[i + 3 + (j + 2) * c_dim1] = f43;
1002                           c[i + 2 + (j + 3) * c_dim1] = f34;
1003                           c[i + 3 + (j + 3) * c_dim1] = f44;
1004                         }
1005                       if (uisec < isec)
1006                         {
1007                           i5 = ii + isec - 1;
1008                           for (i = ii + uisec; i <= i5; ++i)
1009                             {
1010                               f11 = c[i + j * c_dim1];
1011                               f12 = c[i + (j + 1) * c_dim1];
1012                               f13 = c[i + (j + 2) * c_dim1];
1013                               f14 = c[i + (j + 3) * c_dim1];
1014                               i6 = ll + lsec - 1;
1015                               for (l = ll; l <= i6; ++l)
1016                                 {
1017                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1018                                           257] * b[l + j * b_dim1];
1019                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1020                                           257] * b[l + (j + 1) * b_dim1];
1021                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1022                                           257] * b[l + (j + 2) * b_dim1];
1023                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1024                                           257] * b[l + (j + 3) * b_dim1];
1025                                 }
1026                               c[i + j * c_dim1] = f11;
1027                               c[i + (j + 1) * c_dim1] = f12;
1028                               c[i + (j + 2) * c_dim1] = f13;
1029                               c[i + (j + 3) * c_dim1] = f14;
1030                             }
1031                         }
1032                     }
1033                   if (ujsec < jsec)
1034                     {
1035                       i4 = jj + jsec - 1;
1036                       for (j = jj + ujsec; j <= i4; ++j)
1037                         {
1038                           i5 = ii + uisec - 1;
1039                           for (i = ii; i <= i5; i += 4)
1040                             {
1041                               f11 = c[i + j * c_dim1];
1042                               f21 = c[i + 1 + j * c_dim1];
1043                               f31 = c[i + 2 + j * c_dim1];
1044                               f41 = c[i + 3 + j * c_dim1];
1045                               i6 = ll + lsec - 1;
1046                               for (l = ll; l <= i6; ++l)
1047                                 {
1048                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1049                                           257] * b[l + j * b_dim1];
1050                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1051                                           257] * b[l + j * b_dim1];
1052                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1053                                           257] * b[l + j * b_dim1];
1054                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1055                                           257] * b[l + j * b_dim1];
1056                                 }
1057                               c[i + j * c_dim1] = f11;
1058                               c[i + 1 + j * c_dim1] = f21;
1059                               c[i + 2 + j * c_dim1] = f31;
1060                               c[i + 3 + j * c_dim1] = f41;
1061                             }
1062                           i5 = ii + isec - 1;
1063                           for (i = ii + uisec; i <= i5; ++i)
1064                             {
1065                               f11 = c[i + j * c_dim1];
1066                               i6 = ll + lsec - 1;
1067                               for (l = ll; l <= i6; ++l)
1068                                 {
1069                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1070                                           257] * b[l + j * b_dim1];
1071                                 }
1072                               c[i + j * c_dim1] = f11;
1073                             }
1074                         }
1075                     }
1076                 }
1077             }
1078         }
1079       return;
1080     }
1081   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1082     {
1083       if (GFC_DESCRIPTOR_RANK (a) != 1)
1084         {
1085           const GFC_REAL_4 *restrict abase_x;
1086           const GFC_REAL_4 *restrict bbase_y;
1087           GFC_REAL_4 *restrict dest_y;
1088           GFC_REAL_4 s;
1089
1090           for (y = 0; y < ycount; y++)
1091             {
1092               bbase_y = &bbase[y*bystride];
1093               dest_y = &dest[y*rystride];
1094               for (x = 0; x < xcount; x++)
1095                 {
1096                   abase_x = &abase[x*axstride];
1097                   s = (GFC_REAL_4) 0;
1098                   for (n = 0; n < count; n++)
1099                     s += abase_x[n] * bbase_y[n];
1100                   dest_y[x] = s;
1101                 }
1102             }
1103         }
1104       else
1105         {
1106           const GFC_REAL_4 *restrict bbase_y;
1107           GFC_REAL_4 s;
1108
1109           for (y = 0; y < ycount; y++)
1110             {
1111               bbase_y = &bbase[y*bystride];
1112               s = (GFC_REAL_4) 0;
1113               for (n = 0; n < count; n++)
1114                 s += abase[n*axstride] * bbase_y[n];
1115               dest[y*rystride] = s;
1116             }
1117         }
1118     }
1119   else if (axstride < aystride)
1120     {
1121       for (y = 0; y < ycount; y++)
1122         for (x = 0; x < xcount; x++)
1123           dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
1124
1125       for (y = 0; y < ycount; y++)
1126         for (n = 0; n < count; n++)
1127           for (x = 0; x < xcount; x++)
1128             /* dest[x,y] += a[x,n] * b[n,y] */
1129             dest[x*rxstride + y*rystride] +=
1130                                         abase[x*axstride + n*aystride] *
1131                                         bbase[n*bxstride + y*bystride];
1132     }
1133   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1134     {
1135       const GFC_REAL_4 *restrict bbase_y;
1136       GFC_REAL_4 s;
1137
1138       for (y = 0; y < ycount; y++)
1139         {
1140           bbase_y = &bbase[y*bystride];
1141           s = (GFC_REAL_4) 0;
1142           for (n = 0; n < count; n++)
1143             s += abase[n*axstride] * bbase_y[n*bxstride];
1144           dest[y*rxstride] = s;
1145         }
1146     }
1147   else
1148     {
1149       const GFC_REAL_4 *restrict abase_x;
1150       const GFC_REAL_4 *restrict bbase_y;
1151       GFC_REAL_4 *restrict dest_y;
1152       GFC_REAL_4 s;
1153
1154       for (y = 0; y < ycount; y++)
1155         {
1156           bbase_y = &bbase[y*bystride];
1157           dest_y = &dest[y*rystride];
1158           for (x = 0; x < xcount; x++)
1159             {
1160               abase_x = &abase[x*axstride];
1161               s = (GFC_REAL_4) 0;
1162               for (n = 0; n < count; n++)
1163                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1164               dest_y[x*rxstride] = s;
1165             }
1166         }
1167     }
1168 }
1169 #undef POW3
1170 #undef min
1171 #undef max
1172
1173 #endif /* HAVE_AVX2 */
1174
1175 #ifdef HAVE_AVX512F
1176 static void
1177 matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
1178         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1179         int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1180 static void
1181 matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
1182         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1183         int blas_limit, blas_call gemm)
1184 {
1185   const GFC_REAL_4 * restrict abase;
1186   const GFC_REAL_4 * restrict bbase;
1187   GFC_REAL_4 * restrict dest;
1188
1189   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1190   index_type x, y, n, count, xcount, ycount;
1191
1192   assert (GFC_DESCRIPTOR_RANK (a) == 2
1193           || GFC_DESCRIPTOR_RANK (b) == 2);
1194
1195 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1196
1197    Either A or B (but not both) can be rank 1:
1198
1199    o One-dimensional argument A is implicitly treated as a row matrix
1200      dimensioned [1,count], so xcount=1.
1201
1202    o One-dimensional argument B is implicitly treated as a column matrix
1203      dimensioned [count, 1], so ycount=1.
1204 */
1205
1206   if (retarray->base_addr == NULL)
1207     {
1208       if (GFC_DESCRIPTOR_RANK (a) == 1)
1209         {
1210           GFC_DIMENSION_SET(retarray->dim[0], 0,
1211                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1212         }
1213       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1214         {
1215           GFC_DIMENSION_SET(retarray->dim[0], 0,
1216                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1217         }
1218       else
1219         {
1220           GFC_DIMENSION_SET(retarray->dim[0], 0,
1221                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1222
1223           GFC_DIMENSION_SET(retarray->dim[1], 0,
1224                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1225                             GFC_DESCRIPTOR_EXTENT(retarray,0));
1226         }
1227
1228       retarray->base_addr
1229         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
1230       retarray->offset = 0;
1231     }
1232   else if (unlikely (compile_options.bounds_check))
1233     {
1234       index_type ret_extent, arg_extent;
1235
1236       if (GFC_DESCRIPTOR_RANK (a) == 1)
1237         {
1238           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1239           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1240           if (arg_extent != ret_extent)
1241             runtime_error ("Incorrect extent in return array in"
1242                            " MATMUL intrinsic: is %ld, should be %ld",
1243                            (long int) ret_extent, (long int) arg_extent);
1244         }
1245       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1246         {
1247           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1248           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1249           if (arg_extent != ret_extent)
1250             runtime_error ("Incorrect extent in return array in"
1251                            " MATMUL intrinsic: is %ld, should be %ld",
1252                            (long int) ret_extent, (long int) arg_extent);
1253         }
1254       else
1255         {
1256           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1257           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1258           if (arg_extent != ret_extent)
1259             runtime_error ("Incorrect extent in return array in"
1260                            " MATMUL intrinsic for dimension 1:"
1261                            " is %ld, should be %ld",
1262                            (long int) ret_extent, (long int) arg_extent);
1263
1264           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1265           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1266           if (arg_extent != ret_extent)
1267             runtime_error ("Incorrect extent in return array in"
1268                            " MATMUL intrinsic for dimension 2:"
1269                            " is %ld, should be %ld",
1270                            (long int) ret_extent, (long int) arg_extent);
1271         }
1272     }
1273
1274
1275   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1276     {
1277       /* One-dimensional result may be addressed in the code below
1278          either as a row or a column matrix. We want both cases to
1279          work. */
1280       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1281     }
1282   else
1283     {
1284       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1285       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1286     }
1287
1288
1289   if (GFC_DESCRIPTOR_RANK (a) == 1)
1290     {
1291       /* Treat it as a row matrix A[1,count]. */
1292       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1293       aystride = 1;
1294
1295       xcount = 1;
1296       count = GFC_DESCRIPTOR_EXTENT(a,0);
1297     }
1298   else
1299     {
1300       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1301       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1302
1303       count = GFC_DESCRIPTOR_EXTENT(a,1);
1304       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1305     }
1306
1307   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1308     {
1309       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1310         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1311     }
1312
1313   if (GFC_DESCRIPTOR_RANK (b) == 1)
1314     {
1315       /* Treat it as a column matrix B[count,1] */
1316       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1317
1318       /* bystride should never be used for 1-dimensional b.
1319          In case it is, we want it to cause a segfault rather than
1320          an incorrect result. */
1321       bystride = 0xDEADBEEF;
1322       ycount = 1;
1323     }
1324   else
1325     {
1326       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1327       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1328       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1329     }
1330
1331   abase = a->base_addr;
1332   bbase = b->base_addr;
1333   dest = retarray->base_addr;
1334
1335   /* Now that everything is set up, we perform the multiplication
1336      itself.  */
1337
1338 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1339 #define min(a,b) ((a) <= (b) ? (a) : (b))
1340 #define max(a,b) ((a) >= (b) ? (a) : (b))
1341
1342   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1343       && (bxstride == 1 || bystride == 1)
1344       && (((float) xcount) * ((float) ycount) * ((float) count)
1345           > POW3(blas_limit)))
1346     {
1347       const int m = xcount, n = ycount, k = count, ldc = rystride;
1348       const GFC_REAL_4 one = 1, zero = 0;
1349       const int lda = (axstride == 1) ? aystride : axstride,
1350                 ldb = (bxstride == 1) ? bystride : bxstride;
1351
1352       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1353         {
1354           assert (gemm != NULL);
1355           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1356                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1357                 &ldc, 1, 1);
1358           return;
1359         }
1360     }
1361
1362   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1363     {
1364       /* This block of code implements a tuned matmul, derived from
1365          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1366
1367                Bo Kagstrom and Per Ling
1368                Department of Computing Science
1369                Umea University
1370                S-901 87 Umea, Sweden
1371
1372          from netlib.org, translated to C, and modified for matmul.m4.  */
1373
1374       const GFC_REAL_4 *a, *b;
1375       GFC_REAL_4 *c;
1376       const index_type m = xcount, n = ycount, k = count;
1377
1378       /* System generated locals */
1379       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1380                  i1, i2, i3, i4, i5, i6;
1381
1382       /* Local variables */
1383       GFC_REAL_4 t1[65536], /* was [256][256] */
1384                  f11, f12, f21, f22, f31, f32, f41, f42,
1385                  f13, f14, f23, f24, f33, f34, f43, f44;
1386       index_type i, j, l, ii, jj, ll;
1387       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1388
1389       a = abase;
1390       b = bbase;
1391       c = retarray->base_addr;
1392
1393       /* Parameter adjustments */
1394       c_dim1 = rystride;
1395       c_offset = 1 + c_dim1;
1396       c -= c_offset;
1397       a_dim1 = aystride;
1398       a_offset = 1 + a_dim1;
1399       a -= a_offset;
1400       b_dim1 = bystride;
1401       b_offset = 1 + b_dim1;
1402       b -= b_offset;
1403
1404       /* Early exit if possible */
1405       if (m == 0 || n == 0 || k == 0)
1406         return;
1407
1408       /* Zero out c first.  */
1409       for (j=1; j<=n; j++)
1410         for (i=1; i<=m; i++)
1411           c[i + j * c_dim1] = (GFC_REAL_4)0;
1412
1413       /* Start turning the crank. */
1414       i1 = n;
1415       for (jj = 1; jj <= i1; jj += 512)
1416         {
1417           /* Computing MIN */
1418           i2 = 512;
1419           i3 = n - jj + 1;
1420           jsec = min(i2,i3);
1421           ujsec = jsec - jsec % 4;
1422           i2 = k;
1423           for (ll = 1; ll <= i2; ll += 256)
1424             {
1425               /* Computing MIN */
1426               i3 = 256;
1427               i4 = k - ll + 1;
1428               lsec = min(i3,i4);
1429               ulsec = lsec - lsec % 2;
1430
1431               i3 = m;
1432               for (ii = 1; ii <= i3; ii += 256)
1433                 {
1434                   /* Computing MIN */
1435                   i4 = 256;
1436                   i5 = m - ii + 1;
1437                   isec = min(i4,i5);
1438                   uisec = isec - isec % 2;
1439                   i4 = ll + ulsec - 1;
1440                   for (l = ll; l <= i4; l += 2)
1441                     {
1442                       i5 = ii + uisec - 1;
1443                       for (i = ii; i <= i5; i += 2)
1444                         {
1445                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1446                                         a[i + l * a_dim1];
1447                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1448                                         a[i + (l + 1) * a_dim1];
1449                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1450                                         a[i + 1 + l * a_dim1];
1451                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1452                                         a[i + 1 + (l + 1) * a_dim1];
1453                         }
1454                       if (uisec < isec)
1455                         {
1456                           t1[l - ll + 1 + (isec << 8) - 257] =
1457                                     a[ii + isec - 1 + l * a_dim1];
1458                           t1[l - ll + 2 + (isec << 8) - 257] =
1459                                     a[ii + isec - 1 + (l + 1) * a_dim1];
1460                         }
1461                     }
1462                   if (ulsec < lsec)
1463                     {
1464                       i4 = ii + isec - 1;
1465                       for (i = ii; i<= i4; ++i)
1466                         {
1467                           t1[lsec + ((i - ii + 1) << 8) - 257] =
1468                                     a[i + (ll + lsec - 1) * a_dim1];
1469                         }
1470                     }
1471
1472                   uisec = isec - isec % 4;
1473                   i4 = jj + ujsec - 1;
1474                   for (j = jj; j <= i4; j += 4)
1475                     {
1476                       i5 = ii + uisec - 1;
1477                       for (i = ii; i <= i5; i += 4)
1478                         {
1479                           f11 = c[i + j * c_dim1];
1480                           f21 = c[i + 1 + j * c_dim1];
1481                           f12 = c[i + (j + 1) * c_dim1];
1482                           f22 = c[i + 1 + (j + 1) * c_dim1];
1483                           f13 = c[i + (j + 2) * c_dim1];
1484                           f23 = c[i + 1 + (j + 2) * c_dim1];
1485                           f14 = c[i + (j + 3) * c_dim1];
1486                           f24 = c[i + 1 + (j + 3) * c_dim1];
1487                           f31 = c[i + 2 + j * c_dim1];
1488                           f41 = c[i + 3 + j * c_dim1];
1489                           f32 = c[i + 2 + (j + 1) * c_dim1];
1490                           f42 = c[i + 3 + (j + 1) * c_dim1];
1491                           f33 = c[i + 2 + (j + 2) * c_dim1];
1492                           f43 = c[i + 3 + (j + 2) * c_dim1];
1493                           f34 = c[i + 2 + (j + 3) * c_dim1];
1494                           f44 = c[i + 3 + (j + 3) * c_dim1];
1495                           i6 = ll + lsec - 1;
1496                           for (l = ll; l <= i6; ++l)
1497                             {
1498                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1499                                       * b[l + j * b_dim1];
1500                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1501                                       * b[l + j * b_dim1];
1502                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1503                                       * b[l + (j + 1) * b_dim1];
1504                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1505                                       * b[l + (j + 1) * b_dim1];
1506                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1507                                       * b[l + (j + 2) * b_dim1];
1508                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1509                                       * b[l + (j + 2) * b_dim1];
1510                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1511                                       * b[l + (j + 3) * b_dim1];
1512                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1513                                       * b[l + (j + 3) * b_dim1];
1514                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1515                                       * b[l + j * b_dim1];
1516                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1517                                       * b[l + j * b_dim1];
1518                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1519                                       * b[l + (j + 1) * b_dim1];
1520                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1521                                       * b[l + (j + 1) * b_dim1];
1522                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1523                                       * b[l + (j + 2) * b_dim1];
1524                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1525                                       * b[l + (j + 2) * b_dim1];
1526                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1527                                       * b[l + (j + 3) * b_dim1];
1528                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1529                                       * b[l + (j + 3) * b_dim1];
1530                             }
1531                           c[i + j * c_dim1] = f11;
1532                           c[i + 1 + j * c_dim1] = f21;
1533                           c[i + (j + 1) * c_dim1] = f12;
1534                           c[i + 1 + (j + 1) * c_dim1] = f22;
1535                           c[i + (j + 2) * c_dim1] = f13;
1536                           c[i + 1 + (j + 2) * c_dim1] = f23;
1537                           c[i + (j + 3) * c_dim1] = f14;
1538                           c[i + 1 + (j + 3) * c_dim1] = f24;
1539                           c[i + 2 + j * c_dim1] = f31;
1540                           c[i + 3 + j * c_dim1] = f41;
1541                           c[i + 2 + (j + 1) * c_dim1] = f32;
1542                           c[i + 3 + (j + 1) * c_dim1] = f42;
1543                           c[i + 2 + (j + 2) * c_dim1] = f33;
1544                           c[i + 3 + (j + 2) * c_dim1] = f43;
1545                           c[i + 2 + (j + 3) * c_dim1] = f34;
1546                           c[i + 3 + (j + 3) * c_dim1] = f44;
1547                         }
1548                       if (uisec < isec)
1549                         {
1550                           i5 = ii + isec - 1;
1551                           for (i = ii + uisec; i <= i5; ++i)
1552                             {
1553                               f11 = c[i + j * c_dim1];
1554                               f12 = c[i + (j + 1) * c_dim1];
1555                               f13 = c[i + (j + 2) * c_dim1];
1556                               f14 = c[i + (j + 3) * c_dim1];
1557                               i6 = ll + lsec - 1;
1558                               for (l = ll; l <= i6; ++l)
1559                                 {
1560                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1561                                           257] * b[l + j * b_dim1];
1562                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1563                                           257] * b[l + (j + 1) * b_dim1];
1564                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1565                                           257] * b[l + (j + 2) * b_dim1];
1566                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1567                                           257] * b[l + (j + 3) * b_dim1];
1568                                 }
1569                               c[i + j * c_dim1] = f11;
1570                               c[i + (j + 1) * c_dim1] = f12;
1571                               c[i + (j + 2) * c_dim1] = f13;
1572                               c[i + (j + 3) * c_dim1] = f14;
1573                             }
1574                         }
1575                     }
1576                   if (ujsec < jsec)
1577                     {
1578                       i4 = jj + jsec - 1;
1579                       for (j = jj + ujsec; j <= i4; ++j)
1580                         {
1581                           i5 = ii + uisec - 1;
1582                           for (i = ii; i <= i5; i += 4)
1583                             {
1584                               f11 = c[i + j * c_dim1];
1585                               f21 = c[i + 1 + j * c_dim1];
1586                               f31 = c[i + 2 + j * c_dim1];
1587                               f41 = c[i + 3 + j * c_dim1];
1588                               i6 = ll + lsec - 1;
1589                               for (l = ll; l <= i6; ++l)
1590                                 {
1591                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1592                                           257] * b[l + j * b_dim1];
1593                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1594                                           257] * b[l + j * b_dim1];
1595                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1596                                           257] * b[l + j * b_dim1];
1597                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1598                                           257] * b[l + j * b_dim1];
1599                                 }
1600                               c[i + j * c_dim1] = f11;
1601                               c[i + 1 + j * c_dim1] = f21;
1602                               c[i + 2 + j * c_dim1] = f31;
1603                               c[i + 3 + j * c_dim1] = f41;
1604                             }
1605                           i5 = ii + isec - 1;
1606                           for (i = ii + uisec; i <= i5; ++i)
1607                             {
1608                               f11 = c[i + j * c_dim1];
1609                               i6 = ll + lsec - 1;
1610                               for (l = ll; l <= i6; ++l)
1611                                 {
1612                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1613                                           257] * b[l + j * b_dim1];
1614                                 }
1615                               c[i + j * c_dim1] = f11;
1616                             }
1617                         }
1618                     }
1619                 }
1620             }
1621         }
1622       return;
1623     }
1624   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1625     {
1626       if (GFC_DESCRIPTOR_RANK (a) != 1)
1627         {
1628           const GFC_REAL_4 *restrict abase_x;
1629           const GFC_REAL_4 *restrict bbase_y;
1630           GFC_REAL_4 *restrict dest_y;
1631           GFC_REAL_4 s;
1632
1633           for (y = 0; y < ycount; y++)
1634             {
1635               bbase_y = &bbase[y*bystride];
1636               dest_y = &dest[y*rystride];
1637               for (x = 0; x < xcount; x++)
1638                 {
1639                   abase_x = &abase[x*axstride];
1640                   s = (GFC_REAL_4) 0;
1641                   for (n = 0; n < count; n++)
1642                     s += abase_x[n] * bbase_y[n];
1643                   dest_y[x] = s;
1644                 }
1645             }
1646         }
1647       else
1648         {
1649           const GFC_REAL_4 *restrict bbase_y;
1650           GFC_REAL_4 s;
1651
1652           for (y = 0; y < ycount; y++)
1653             {
1654               bbase_y = &bbase[y*bystride];
1655               s = (GFC_REAL_4) 0;
1656               for (n = 0; n < count; n++)
1657                 s += abase[n*axstride] * bbase_y[n];
1658               dest[y*rystride] = s;
1659             }
1660         }
1661     }
1662   else if (axstride < aystride)
1663     {
1664       for (y = 0; y < ycount; y++)
1665         for (x = 0; x < xcount; x++)
1666           dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
1667
1668       for (y = 0; y < ycount; y++)
1669         for (n = 0; n < count; n++)
1670           for (x = 0; x < xcount; x++)
1671             /* dest[x,y] += a[x,n] * b[n,y] */
1672             dest[x*rxstride + y*rystride] +=
1673                                         abase[x*axstride + n*aystride] *
1674                                         bbase[n*bxstride + y*bystride];
1675     }
1676   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1677     {
1678       const GFC_REAL_4 *restrict bbase_y;
1679       GFC_REAL_4 s;
1680
1681       for (y = 0; y < ycount; y++)
1682         {
1683           bbase_y = &bbase[y*bystride];
1684           s = (GFC_REAL_4) 0;
1685           for (n = 0; n < count; n++)
1686             s += abase[n*axstride] * bbase_y[n*bxstride];
1687           dest[y*rxstride] = s;
1688         }
1689     }
1690   else
1691     {
1692       const GFC_REAL_4 *restrict abase_x;
1693       const GFC_REAL_4 *restrict bbase_y;
1694       GFC_REAL_4 *restrict dest_y;
1695       GFC_REAL_4 s;
1696
1697       for (y = 0; y < ycount; y++)
1698         {
1699           bbase_y = &bbase[y*bystride];
1700           dest_y = &dest[y*rystride];
1701           for (x = 0; x < xcount; x++)
1702             {
1703               abase_x = &abase[x*axstride];
1704               s = (GFC_REAL_4) 0;
1705               for (n = 0; n < count; n++)
1706                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1707               dest_y[x*rxstride] = s;
1708             }
1709         }
1710     }
1711 }
1712 #undef POW3
1713 #undef min
1714 #undef max
1715
1716 #endif  /* HAVE_AVX512F */
1717
1718 /* Function to fall back to if there is no special processor-specific version.  */
1719 static void
1720 matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
1721         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1722         int blas_limit, blas_call gemm)
1723 {
1724   const GFC_REAL_4 * restrict abase;
1725   const GFC_REAL_4 * restrict bbase;
1726   GFC_REAL_4 * restrict dest;
1727
1728   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1729   index_type x, y, n, count, xcount, ycount;
1730
1731   assert (GFC_DESCRIPTOR_RANK (a) == 2
1732           || GFC_DESCRIPTOR_RANK (b) == 2);
1733
1734 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1735
1736    Either A or B (but not both) can be rank 1:
1737
1738    o One-dimensional argument A is implicitly treated as a row matrix
1739      dimensioned [1,count], so xcount=1.
1740
1741    o One-dimensional argument B is implicitly treated as a column matrix
1742      dimensioned [count, 1], so ycount=1.
1743 */
1744
1745   if (retarray->base_addr == NULL)
1746     {
1747       if (GFC_DESCRIPTOR_RANK (a) == 1)
1748         {
1749           GFC_DIMENSION_SET(retarray->dim[0], 0,
1750                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1751         }
1752       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1753         {
1754           GFC_DIMENSION_SET(retarray->dim[0], 0,
1755                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1756         }
1757       else
1758         {
1759           GFC_DIMENSION_SET(retarray->dim[0], 0,
1760                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1761
1762           GFC_DIMENSION_SET(retarray->dim[1], 0,
1763                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1764                             GFC_DESCRIPTOR_EXTENT(retarray,0));
1765         }
1766
1767       retarray->base_addr
1768         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
1769       retarray->offset = 0;
1770     }
1771   else if (unlikely (compile_options.bounds_check))
1772     {
1773       index_type ret_extent, arg_extent;
1774
1775       if (GFC_DESCRIPTOR_RANK (a) == 1)
1776         {
1777           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1778           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1779           if (arg_extent != ret_extent)
1780             runtime_error ("Incorrect extent in return array in"
1781                            " MATMUL intrinsic: is %ld, should be %ld",
1782                            (long int) ret_extent, (long int) arg_extent);
1783         }
1784       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1785         {
1786           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1787           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1788           if (arg_extent != ret_extent)
1789             runtime_error ("Incorrect extent in return array in"
1790                            " MATMUL intrinsic: is %ld, should be %ld",
1791                            (long int) ret_extent, (long int) arg_extent);
1792         }
1793       else
1794         {
1795           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1796           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1797           if (arg_extent != ret_extent)
1798             runtime_error ("Incorrect extent in return array in"
1799                            " MATMUL intrinsic for dimension 1:"
1800                            " is %ld, should be %ld",
1801                            (long int) ret_extent, (long int) arg_extent);
1802
1803           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1804           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1805           if (arg_extent != ret_extent)
1806             runtime_error ("Incorrect extent in return array in"
1807                            " MATMUL intrinsic for dimension 2:"
1808                            " is %ld, should be %ld",
1809                            (long int) ret_extent, (long int) arg_extent);
1810         }
1811     }
1812
1813
1814   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1815     {
1816       /* One-dimensional result may be addressed in the code below
1817          either as a row or a column matrix. We want both cases to
1818          work. */
1819       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1820     }
1821   else
1822     {
1823       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1824       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1825     }
1826
1827
1828   if (GFC_DESCRIPTOR_RANK (a) == 1)
1829     {
1830       /* Treat it as a row matrix A[1,count]. */
1831       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1832       aystride = 1;
1833
1834       xcount = 1;
1835       count = GFC_DESCRIPTOR_EXTENT(a,0);
1836     }
1837   else
1838     {
1839       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1840       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1841
1842       count = GFC_DESCRIPTOR_EXTENT(a,1);
1843       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1844     }
1845
1846   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1847     {
1848       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1849         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1850     }
1851
1852   if (GFC_DESCRIPTOR_RANK (b) == 1)
1853     {
1854       /* Treat it as a column matrix B[count,1] */
1855       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1856
1857       /* bystride should never be used for 1-dimensional b.
1858          In case it is, we want it to cause a segfault, rather than
1859          an incorrect result. */
1860       bystride = 0xDEADBEEF;
1861       ycount = 1;
1862     }
1863   else
1864     {
1865       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1866       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1867       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1868     }
1869
1870   abase = a->base_addr;
1871   bbase = b->base_addr;
1872   dest = retarray->base_addr;
1873
1874   /* Now that everything is set up, we perform the multiplication
1875      itself.  */
1876
1877 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1878 #define min(a,b) ((a) <= (b) ? (a) : (b))
1879 #define max(a,b) ((a) >= (b) ? (a) : (b))
1880
1881   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1882       && (bxstride == 1 || bystride == 1)
1883       && (((float) xcount) * ((float) ycount) * ((float) count)
1884           > POW3(blas_limit)))
1885     {
1886       const int m = xcount, n = ycount, k = count, ldc = rystride;
1887       const GFC_REAL_4 one = 1, zero = 0;
1888       const int lda = (axstride == 1) ? aystride : axstride,
1889                 ldb = (bxstride == 1) ? bystride : bxstride;
1890
1891       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1892         {
1893           assert (gemm != NULL);
1894           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1895                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1896                 &ldc, 1, 1);
1897           return;
1898         }
1899     }
1900
1901   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1902     {
1903       /* This block of code implements a tuned matmul, derived from
1904          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1905
1906                Bo Kagstrom and Per Ling
1907                Department of Computing Science
1908                Umea University
1909                S-901 87 Umea, Sweden
1910
1911          from netlib.org, translated to C, and modified for matmul.m4.  */
1912
1913       const GFC_REAL_4 *a, *b;
1914       GFC_REAL_4 *c;
1915       const index_type m = xcount, n = ycount, k = count;
1916
1917       /* System generated locals */
1918       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1919                  i1, i2, i3, i4, i5, i6;
1920
1921       /* Local variables */
1922       GFC_REAL_4 t1[65536], /* was [256][256] */
1923                  f11, f12, f21, f22, f31, f32, f41, f42,
1924                  f13, f14, f23, f24, f33, f34, f43, f44;
1925       index_type i, j, l, ii, jj, ll;
1926       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1927
1928       a = abase;
1929       b = bbase;
1930       c = retarray->base_addr;
1931
1932       /* Parameter adjustments */
1933       c_dim1 = rystride;
1934       c_offset = 1 + c_dim1;
1935       c -= c_offset;
1936       a_dim1 = aystride;
1937       a_offset = 1 + a_dim1;
1938       a -= a_offset;
1939       b_dim1 = bystride;
1940       b_offset = 1 + b_dim1;
1941       b -= b_offset;
1942
1943       /* Early exit if possible */
1944       if (m == 0 || n == 0 || k == 0)
1945         return;
1946
1947       /* Empty c first.  */
1948       for (j=1; j<=n; j++)
1949         for (i=1; i<=m; i++)
1950           c[i + j * c_dim1] = (GFC_REAL_4)0;
1951
1952       /* Start turning the crank. */
1953       i1 = n;
1954       for (jj = 1; jj <= i1; jj += 512)
1955         {
1956           /* Computing MIN */
1957           i2 = 512;
1958           i3 = n - jj + 1;
1959           jsec = min(i2,i3);
1960           ujsec = jsec - jsec % 4;
1961           i2 = k;
1962           for (ll = 1; ll <= i2; ll += 256)
1963             {
1964               /* Computing MIN */
1965               i3 = 256;
1966               i4 = k - ll + 1;
1967               lsec = min(i3,i4);
1968               ulsec = lsec - lsec % 2;
1969
1970               i3 = m;
1971               for (ii = 1; ii <= i3; ii += 256)
1972                 {
1973                   /* Computing MIN */
1974                   i4 = 256;
1975                   i5 = m - ii + 1;
1976                   isec = min(i4,i5);
1977                   uisec = isec - isec % 2;
1978                   i4 = ll + ulsec - 1;
1979                   for (l = ll; l <= i4; l += 2)
1980                     {
1981                       i5 = ii + uisec - 1;
1982                       for (i = ii; i <= i5; i += 2)
1983                         {
1984                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1985                                         a[i + l * a_dim1];
1986                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1987                                         a[i + (l + 1) * a_dim1];
1988                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1989                                         a[i + 1 + l * a_dim1];
1990                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1991                                         a[i + 1 + (l + 1) * a_dim1];
1992                         }
1993                       if (uisec < isec)
1994                         {
1995                           t1[l - ll + 1 + (isec << 8) - 257] =
1996                                     a[ii + isec - 1 + l * a_dim1];
1997                           t1[l - ll + 2 + (isec << 8) - 257] =
1998                                     a[ii + isec - 1 + (l + 1) * a_dim1];
1999                         }
2000                     }
2001                   if (ulsec < lsec)
2002                     {
2003                       i4 = ii + isec - 1;
2004                       for (i = ii; i<= i4; ++i)
2005                         {
2006                           t1[lsec + ((i - ii + 1) << 8) - 257] =
2007                                     a[i + (ll + lsec - 1) * a_dim1];
2008                         }
2009                     }
2010
2011                   uisec = isec - isec % 4;
2012                   i4 = jj + ujsec - 1;
2013                   for (j = jj; j <= i4; j += 4)
2014                     {
2015                       i5 = ii + uisec - 1;
2016                       for (i = ii; i <= i5; i += 4)
2017                         {
2018                           f11 = c[i + j * c_dim1];
2019                           f21 = c[i + 1 + j * c_dim1];
2020                           f12 = c[i + (j + 1) * c_dim1];
2021                           f22 = c[i + 1 + (j + 1) * c_dim1];
2022                           f13 = c[i + (j + 2) * c_dim1];
2023                           f23 = c[i + 1 + (j + 2) * c_dim1];
2024                           f14 = c[i + (j + 3) * c_dim1];
2025                           f24 = c[i + 1 + (j + 3) * c_dim1];
2026                           f31 = c[i + 2 + j * c_dim1];
2027                           f41 = c[i + 3 + j * c_dim1];
2028                           f32 = c[i + 2 + (j + 1) * c_dim1];
2029                           f42 = c[i + 3 + (j + 1) * c_dim1];
2030                           f33 = c[i + 2 + (j + 2) * c_dim1];
2031                           f43 = c[i + 3 + (j + 2) * c_dim1];
2032                           f34 = c[i + 2 + (j + 3) * c_dim1];
2033                           f44 = c[i + 3 + (j + 3) * c_dim1];
2034                           i6 = ll + lsec - 1;
2035                           for (l = ll; l <= i6; ++l)
2036                             {
2037                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2038                                       * b[l + j * b_dim1];
2039                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2040                                       * b[l + j * b_dim1];
2041                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2042                                       * b[l + (j + 1) * b_dim1];
2043                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2044                                       * b[l + (j + 1) * b_dim1];
2045                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2046                                       * b[l + (j + 2) * b_dim1];
2047                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2048                                       * b[l + (j + 2) * b_dim1];
2049                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2050                                       * b[l + (j + 3) * b_dim1];
2051                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2052                                       * b[l + (j + 3) * b_dim1];
2053                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2054                                       * b[l + j * b_dim1];
2055                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2056                                       * b[l + j * b_dim1];
2057                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2058                                       * b[l + (j + 1) * b_dim1];
2059                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2060                                       * b[l + (j + 1) * b_dim1];
2061                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2062                                       * b[l + (j + 2) * b_dim1];
2063                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2064                                       * b[l + (j + 2) * b_dim1];
2065                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2066                                       * b[l + (j + 3) * b_dim1];
2067                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2068                                       * b[l + (j + 3) * b_dim1];
2069                             }
2070                           c[i + j * c_dim1] = f11;
2071                           c[i + 1 + j * c_dim1] = f21;
2072                           c[i + (j + 1) * c_dim1] = f12;
2073                           c[i + 1 + (j + 1) * c_dim1] = f22;
2074                           c[i + (j + 2) * c_dim1] = f13;
2075                           c[i + 1 + (j + 2) * c_dim1] = f23;
2076                           c[i + (j + 3) * c_dim1] = f14;
2077                           c[i + 1 + (j + 3) * c_dim1] = f24;
2078                           c[i + 2 + j * c_dim1] = f31;
2079                           c[i + 3 + j * c_dim1] = f41;
2080                           c[i + 2 + (j + 1) * c_dim1] = f32;
2081                           c[i + 3 + (j + 1) * c_dim1] = f42;
2082                           c[i + 2 + (j + 2) * c_dim1] = f33;
2083                           c[i + 3 + (j + 2) * c_dim1] = f43;
2084                           c[i + 2 + (j + 3) * c_dim1] = f34;
2085                           c[i + 3 + (j + 3) * c_dim1] = f44;
2086                         }
2087                       if (uisec < isec)
2088                         {
2089                           i5 = ii + isec - 1;
2090                           for (i = ii + uisec; i <= i5; ++i)
2091                             {
2092                               f11 = c[i + j * c_dim1];
2093                               f12 = c[i + (j + 1) * c_dim1];
2094                               f13 = c[i + (j + 2) * c_dim1];
2095                               f14 = c[i + (j + 3) * c_dim1];
2096                               i6 = ll + lsec - 1;
2097                               for (l = ll; l <= i6; ++l)
2098                                 {
2099                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2100                                           257] * b[l + j * b_dim1];
2101                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2102                                           257] * b[l + (j + 1) * b_dim1];
2103                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2104                                           257] * b[l + (j + 2) * b_dim1];
2105                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2106                                           257] * b[l + (j + 3) * b_dim1];
2107                                 }
2108                               c[i + j * c_dim1] = f11;
2109                               c[i + (j + 1) * c_dim1] = f12;
2110                               c[i + (j + 2) * c_dim1] = f13;
2111                               c[i + (j + 3) * c_dim1] = f14;
2112                             }
2113                         }
2114                     }
2115                   if (ujsec < jsec)
2116                     {
2117                       i4 = jj + jsec - 1;
2118                       for (j = jj + ujsec; j <= i4; ++j)
2119                         {
2120                           i5 = ii + uisec - 1;
2121                           for (i = ii; i <= i5; i += 4)
2122                             {
2123                               f11 = c[i + j * c_dim1];
2124                               f21 = c[i + 1 + j * c_dim1];
2125                               f31 = c[i + 2 + j * c_dim1];
2126                               f41 = c[i + 3 + j * c_dim1];
2127                               i6 = ll + lsec - 1;
2128                               for (l = ll; l <= i6; ++l)
2129                                 {
2130                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2131                                           257] * b[l + j * b_dim1];
2132                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2133                                           257] * b[l + j * b_dim1];
2134                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2135                                           257] * b[l + j * b_dim1];
2136                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2137                                           257] * b[l + j * b_dim1];
2138                                 }
2139                               c[i + j * c_dim1] = f11;
2140                               c[i + 1 + j * c_dim1] = f21;
2141                               c[i + 2 + j * c_dim1] = f31;
2142                               c[i + 3 + j * c_dim1] = f41;
2143                             }
2144                           i5 = ii + isec - 1;
2145                           for (i = ii + uisec; i <= i5; ++i)
2146                             {
2147                               f11 = c[i + j * c_dim1];
2148                               i6 = ll + lsec - 1;
2149                               for (l = ll; l <= i6; ++l)
2150                                 {
2151                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2152                                           257] * b[l + j * b_dim1];
2153                                 }
2154                               c[i + j * c_dim1] = f11;
2155                             }
2156                         }
2157                     }
2158                 }
2159             }
2160         }
2161       return;
2162     }
2163   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2164     {
2165       if (GFC_DESCRIPTOR_RANK (a) != 1)
2166         {
2167           const GFC_REAL_4 *restrict abase_x;
2168           const GFC_REAL_4 *restrict bbase_y;
2169           GFC_REAL_4 *restrict dest_y;
2170           GFC_REAL_4 s;
2171
2172           for (y = 0; y < ycount; y++)
2173             {
2174               bbase_y = &bbase[y*bystride];
2175               dest_y = &dest[y*rystride];
2176               for (x = 0; x < xcount; x++)
2177                 {
2178                   abase_x = &abase[x*axstride];
2179                   s = (GFC_REAL_4) 0;
2180                   for (n = 0; n < count; n++)
2181                     s += abase_x[n] * bbase_y[n];
2182                   dest_y[x] = s;
2183                 }
2184             }
2185         }
2186       else
2187         {
2188           const GFC_REAL_4 *restrict bbase_y;
2189           GFC_REAL_4 s;
2190
2191           for (y = 0; y < ycount; y++)
2192             {
2193               bbase_y = &bbase[y*bystride];
2194               s = (GFC_REAL_4) 0;
2195               for (n = 0; n < count; n++)
2196                 s += abase[n*axstride] * bbase_y[n];
2197               dest[y*rystride] = s;
2198             }
2199         }
2200     }
2201   else if (axstride < aystride)
2202     {
2203       for (y = 0; y < ycount; y++)
2204         for (x = 0; x < xcount; x++)
2205           dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
2206
2207       for (y = 0; y < ycount; y++)
2208         for (n = 0; n < count; n++)
2209           for (x = 0; x < xcount; x++)
2210             /* dest[x,y] += a[x,n] * b[n,y] */
2211             dest[x*rxstride + y*rystride] +=
2212                                         abase[x*axstride + n*aystride] *
2213                                         bbase[n*bxstride + y*bystride];
2214     }
2215   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2216     {
2217       const GFC_REAL_4 *restrict bbase_y;
2218       GFC_REAL_4 s;
2219
2220       for (y = 0; y < ycount; y++)
2221         {
2222           bbase_y = &bbase[y*bystride];
2223           s = (GFC_REAL_4) 0;
2224           for (n = 0; n < count; n++)
2225             s += abase[n*axstride] * bbase_y[n*bxstride];
2226           dest[y*rxstride] = s;
2227         }
2228     }
2229   else
2230     {
2231       const GFC_REAL_4 *restrict abase_x;
2232       const GFC_REAL_4 *restrict bbase_y;
2233       GFC_REAL_4 *restrict dest_y;
2234       GFC_REAL_4 s;
2235
2236       for (y = 0; y < ycount; y++)
2237         {
2238           bbase_y = &bbase[y*bystride];
2239           dest_y = &dest[y*rystride];
2240           for (x = 0; x < xcount; x++)
2241             {
2242               abase_x = &abase[x*axstride];
2243               s = (GFC_REAL_4) 0;
2244               for (n = 0; n < count; n++)
2245                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2246               dest_y[x*rxstride] = s;
2247             }
2248         }
2249     }
2250 }
2251 #undef POW3
2252 #undef min
2253 #undef max
2254
2255
2256 /* Compiling main function, with selection code for the processor.  */
2257
2258 /* Currently, this is i386 only.  Adjust for other architectures.  */
2259
2260 #include <config/i386/cpuinfo.h>
2261 void matmul_r4 (gfc_array_r4 * const restrict retarray,
2262         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2263         int blas_limit, blas_call gemm)
2264 {
2265   /* Entry point for MATMUL on GFC_REAL_4.  Forwards every argument unchanged
2266      to matmul_r4_vanilla or, on Intel processors, to the AVX, AVX2 or AVX512F
2267      variant: the most preferred one that was compiled in and whose feature
2268      bit is set in __cpu_model.  The choice is made once, cached in matmul_p.  */
2269   static void (*matmul_p) (gfc_array_r4 * const restrict retarray,
2270         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2271         int blas_limit, blas_call gemm) = NULL;
2272   void (*matmul_fn) (gfc_array_r4 * const restrict retarray,
2273         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2274         int blas_limit, blas_call gemm);
2275
2276   /* The cache is read and written atomically and the selection is done on a
2277      local copy, so threads making their first call concurrently neither race
2278      on matmul_p nor observe a provisional value while another one selects.  */
2279   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2280   if (matmul_fn == NULL)
2281     {
2282       matmul_fn = matmul_r4_vanilla;
2283       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2284         {
2285           /* Test in increasing order of preference, so that the most
2286              preferred supported variant is the one left in matmul_fn.  */
2287 #ifdef HAVE_AVX
2288           if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2289             matmul_fn = matmul_r4_avx;
2290 #endif  /* HAVE_AVX */
2291 #ifdef HAVE_AVX2
2292           if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2293             matmul_fn = matmul_r4_avx2;
2294 #endif  /* HAVE_AVX2 */
2295 #ifdef HAVE_AVX512F
2296           if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2297             matmul_fn = matmul_r4_avx512f;
2298 #endif  /* HAVE_AVX512F */
2299         }
2300       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2301     }
2302
2303   /* Dispatch to the selected implementation.  */
2304   (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2305 }
2306
2307 #else  /* Just the vanilla function.  */
2308 /* MATMUL for GFC_REAL_4: allocates or checks retarray, derives extents and strides from a and b, then multiplies.  */
2309 void
2310 matmul_r4 (gfc_array_r4 * const restrict retarray,
2311         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2312         int blas_limit, blas_call gemm)
2313 {
2314   const GFC_REAL_4 * restrict abase;
2315   const GFC_REAL_4 * restrict bbase;
2316   GFC_REAL_4 * restrict dest;
2317   /* Strides of the result (r), A (a) and B (b); for rank-2 arrays x is dimension 0 and y is dimension 1.  */
2318   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2319   index_type x, y, n, count, xcount, ycount;
2320
2321   assert (GFC_DESCRIPTOR_RANK (a) == 2
2322           || GFC_DESCRIPTOR_RANK (b) == 2);
2323
2324 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2325    where C is retarray, A is a and B is b.
2326    Either A or B (but not both) can be rank 1:
2327
2328    o One-dimensional argument A is implicitly treated as a row matrix
2329      dimensioned [1,count], so xcount=1.
2330
2331    o One-dimensional argument B is implicitly treated as a column matrix
2332      dimensioned [count, 1], so ycount=1.
2333 */
2334   /* No result storage yet: derive the result bounds from A and B and allocate it.  */
2335   if (retarray->base_addr == NULL)
2336     {
2337       if (GFC_DESCRIPTOR_RANK (a) == 1)
2338         {
2339           GFC_DIMENSION_SET(retarray->dim[0], 0,
2340                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2341         }
2342       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2343         {
2344           GFC_DIMENSION_SET(retarray->dim[0], 0,
2345                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2346         }
2347       else
2348         {
2349           GFC_DIMENSION_SET(retarray->dim[0], 0,
2350                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2351
2352           GFC_DIMENSION_SET(retarray->dim[1], 0,
2353                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2354                             GFC_DESCRIPTOR_EXTENT(retarray,0));
2355         }
2356
2357       retarray->base_addr
2358         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
2359       retarray->offset = 0;
2360     }
2361   else if (unlikely (compile_options.bounds_check))
2362     {
2363       index_type ret_extent, arg_extent;
2364       /* The result was supplied by the caller: check its extents against A and B.  */
2365       if (GFC_DESCRIPTOR_RANK (a) == 1)
2366         {
2367           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2368           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2369           if (arg_extent != ret_extent)
2370             runtime_error ("Incorrect extent in return array in"
2371                            " MATMUL intrinsic: is %ld, should be %ld",
2372                            (long int) ret_extent, (long int) arg_extent);
2373         }
2374       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2375         {
2376           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2377           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2378           if (arg_extent != ret_extent)
2379             runtime_error ("Incorrect extent in return array in"
2380                            " MATMUL intrinsic: is %ld, should be %ld",
2381                            (long int) ret_extent, (long int) arg_extent);
2382         }
2383       else
2384         {
2385           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2386           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2387           if (arg_extent != ret_extent)
2388             runtime_error ("Incorrect extent in return array in"
2389                            " MATMUL intrinsic for dimension 1:"
2390                            " is %ld, should be %ld",
2391                            (long int) ret_extent, (long int) arg_extent);
2392
2393           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2394           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2395           if (arg_extent != ret_extent)
2396             runtime_error ("Incorrect extent in return array in"
2397                            " MATMUL intrinsic for dimension 2:"
2398                            " is %ld, should be %ld",
2399                            (long int) ret_extent, (long int) arg_extent);
2400         }
2401     }
2402
2403
2404   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2405     {
2406       /* One-dimensional result may be addressed in the code below
2407          either as a row or a column matrix. We want both cases to
2408          work. */
2409       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2410     }
2411   else
2412     {
2413       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2414       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2415     }
2416
2417
2418   if (GFC_DESCRIPTOR_RANK (a) == 1)
2419     {
2420       /* Treat it as a row matrix A[1,count]. */
2421       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2422       aystride = 1;
2423
2424       xcount = 1;
2425       count = GFC_DESCRIPTOR_EXTENT(a,0);
2426     }
2427   else
2428     {
2429       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2430       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2431
2432       count = GFC_DESCRIPTOR_EXTENT(a,1);
2433       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2434     }
2435   /* The inner extents of A and B must agree, unless neither of them is positive.  */
2436   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2437     {
2438       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2439         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2440     }
2441
2442   if (GFC_DESCRIPTOR_RANK (b) == 1)
2443     {
2444       /* Treat it as a column matrix B[count,1] */
2445       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2446
2447       /* bystride should never be used for 1-dimensional b.
2448          In case it is, we want it to cause a segfault, rather than
2449          an incorrect result. */
2450       bystride = 0xDEADBEEF;
2451       ycount = 1;
2452     }
2453   else
2454     {
2455       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2456       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2457       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2458     }
2459
2460   abase = a->base_addr;
2461   bbase = b->base_addr;
2462   dest = retarray->base_addr;
2463
2464   /* Now that everything is set up, we perform the multiplication
2465      itself.  */
2466
2467 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2468 #define min(a,b) ((a) <= (b) ? (a) : (b))
2469 #define max(a,b) ((a) >= (b) ? (a) : (b))
2470   /* Use the supplied gemm when requested, the strides allow it and xcount*ycount*count exceeds blas_limit cubed.  */
2471   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2472       && (bxstride == 1 || bystride == 1)
2473       && (((float) xcount) * ((float) ycount) * ((float) count)
2474           > POW3(blas_limit)))
2475     {
2476       const int m = xcount, n = ycount, k = count, ldc = rystride;
2477       const GFC_REAL_4 one = 1, zero = 0;
2478       const int lda = (axstride == 1) ? aystride : axstride,
2479                 ldb = (bxstride == 1) ? bystride : bxstride;
2480       /* If any leading dimension is not positive, or m, n or k is at most 1, fall through to the loops below.  */
2481       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2482         {
2483           assert (gemm != NULL);
2484           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2485                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2486                 &ldc, 1, 1);
2487           return;
2488         }
2489     }
2490
2491   if (rxstride == 1 && axstride == 1 && bxstride == 1)
2492     {
2493       /* This block of code implements a tuned matmul, derived from
2494          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2495
2496                Bo Kagstrom and Per Ling
2497                Department of Computing Science
2498                Umea University
2499                S-901 87 Umea, Sweden
2500
2501          from netlib.org, translated to C, and modified for matmul.m4.  */
2502
2503       const GFC_REAL_4 *a, *b;
2504       GFC_REAL_4 *c;
2505       const index_type m = xcount, n = ycount, k = count;
2506
2507       /* System generated locals */
2508       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2509                  i1, i2, i3, i4, i5, i6;
2510
2511       /* Local variables */
2512       GFC_REAL_4 t1[65536], /* was [256][256] */
2513                  f11, f12, f21, f22, f31, f32, f41, f42,
2514                  f13, f14, f23, f24, f33, f34, f43, f44;
2515       index_type i, j, l, ii, jj, ll;
2516       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2517
2518       a = abase;
2519       b = bbase;
2520       c = retarray->base_addr;
2521
2522       /* Parameter adjustments: offset the pointers so that the 1-based indices used below start at the first element.  */
2523       c_dim1 = rystride;
2524       c_offset = 1 + c_dim1;
2525       c -= c_offset;
2526       a_dim1 = aystride;
2527       a_offset = 1 + a_dim1;
2528       a -= a_offset;
2529       b_dim1 = bystride;
2530       b_offset = 1 + b_dim1;
2531       b -= b_offset;
2532
2533       /* Early exit if possible */
2534       if (m == 0 || n == 0 || k == 0)
2535         return;
2536
2537       /* Empty c first; the loops below accumulate into it.  */
2538       for (j=1; j<=n; j++)
2539         for (i=1; i<=m; i++)
2540           c[i + j * c_dim1] = (GFC_REAL_4)0;
2541
2542       /* Start turning the crank: jj walks the columns of c in blocks of up to 512.  */
2543       i1 = n;
2544       for (jj = 1; jj <= i1; jj += 512)
2545         {
2546           /* Computing MIN */
2547           i2 = 512;
2548           i3 = n - jj + 1;
2549           jsec = min(i2,i3);
2550           ujsec = jsec - jsec % 4;  /* jsec rounded down to a multiple of 4 */
2551           i2 = k;
2552           for (ll = 1; ll <= i2; ll += 256)
2553             {
2554               /* Computing MIN */
2555               i3 = 256;
2556               i4 = k - ll + 1;
2557               lsec = min(i3,i4);
2558               ulsec = lsec - lsec % 2;  /* lsec rounded down to a multiple of 2 */
2559               /* For each block of up to 256 rows, copy a's block into t1, then update c from t1 and b.  */
2560               i3 = m;
2561               for (ii = 1; ii <= i3; ii += 256)
2562                 {
2563                   /* Computing MIN */
2564                   i4 = 256;
2565                   i5 = m - ii + 1;
2566                   isec = min(i4,i5);
2567                   uisec = isec - isec % 2;  /* isec rounded down to a multiple of 2 */
2568                   i4 = ll + ulsec - 1;
2569                   for (l = ll; l <= i4; l += 2)
2570                     {
2571                       i5 = ii + uisec - 1;
2572                       for (i = ii; i <= i5; i += 2)  /* t1[(l - ll) + 256 * (i - ii)] = a(i,l) */
2573                         {
2574                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2575                                         a[i + l * a_dim1];
2576                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2577                                         a[i + (l + 1) * a_dim1];
2578                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2579                                         a[i + 1 + l * a_dim1];
2580                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2581                                         a[i + 1 + (l + 1) * a_dim1];
2582                         }
2583                       if (uisec < isec)  /* isec is odd: copy the remaining row */
2584                         {
2585                           t1[l - ll + 1 + (isec << 8) - 257] =
2586                                     a[ii + isec - 1 + l * a_dim1];
2587                           t1[l - ll + 2 + (isec << 8) - 257] =
2588                                     a[ii + isec - 1 + (l + 1) * a_dim1];
2589                         }
2590                     }
2591                   if (ulsec < lsec)  /* lsec is odd: copy the remaining column */
2592                     {
2593                       i4 = ii + isec - 1;
2594                       for (i = ii; i<= i4; ++i)
2595                         {
2596                           t1[lsec + ((i - ii + 1) << 8) - 257] =
2597                                     a[i + (ll + lsec - 1) * a_dim1];
2598                         }
2599                     }
2600                   /* Update c in 4x4 tiles; leftover rows and columns are handled separately.  */
2601                   uisec = isec - isec % 4;  /* now isec rounded down to a multiple of 4 */
2602                   i4 = jj + ujsec - 1;
2603                   for (j = jj; j <= i4; j += 4)
2604                     {
2605                       i5 = ii + uisec - 1;
2606                       for (i = ii; i <= i5; i += 4)
2607                         {
2608                           f11 = c[i + j * c_dim1];
2609                           f21 = c[i + 1 + j * c_dim1];
2610                           f12 = c[i + (j + 1) * c_dim1];
2611                           f22 = c[i + 1 + (j + 1) * c_dim1];
2612                           f13 = c[i + (j + 2) * c_dim1];
2613                           f23 = c[i + 1 + (j + 2) * c_dim1];
2614                           f14 = c[i + (j + 3) * c_dim1];
2615                           f24 = c[i + 1 + (j + 3) * c_dim1];
2616                           f31 = c[i + 2 + j * c_dim1];
2617                           f41 = c[i + 3 + j * c_dim1];
2618                           f32 = c[i + 2 + (j + 1) * c_dim1];
2619                           f42 = c[i + 3 + (j + 1) * c_dim1];
2620                           f33 = c[i + 2 + (j + 2) * c_dim1];
2621                           f43 = c[i + 3 + (j + 2) * c_dim1];
2622                           f34 = c[i + 2 + (j + 3) * c_dim1];
2623                           f44 = c[i + 3 + (j + 3) * c_dim1];
2624                           i6 = ll + lsec - 1;
2625                           for (l = ll; l <= i6; ++l)
2626                             {
2627                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2628                                       * b[l + j * b_dim1];
2629                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2630                                       * b[l + j * b_dim1];
2631                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2632                                       * b[l + (j + 1) * b_dim1];
2633                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2634                                       * b[l + (j + 1) * b_dim1];
2635                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2636                                       * b[l + (j + 2) * b_dim1];
2637                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2638                                       * b[l + (j + 2) * b_dim1];
2639                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2640                                       * b[l + (j + 3) * b_dim1];
2641                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2642                                       * b[l + (j + 3) * b_dim1];
2643                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2644                                       * b[l + j * b_dim1];
2645                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2646                                       * b[l + j * b_dim1];
2647                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2648                                       * b[l + (j + 1) * b_dim1];
2649                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2650                                       * b[l + (j + 1) * b_dim1];
2651                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2652                                       * b[l + (j + 2) * b_dim1];
2653                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2654                                       * b[l + (j + 2) * b_dim1];
2655                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2656                                       * b[l + (j + 3) * b_dim1];
2657                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2658                                       * b[l + (j + 3) * b_dim1];
2659                             }
2660                           c[i + j * c_dim1] = f11;
2661                           c[i + 1 + j * c_dim1] = f21;
2662                           c[i + (j + 1) * c_dim1] = f12;
2663                           c[i + 1 + (j + 1) * c_dim1] = f22;
2664                           c[i + (j + 2) * c_dim1] = f13;
2665                           c[i + 1 + (j + 2) * c_dim1] = f23;
2666                           c[i + (j + 3) * c_dim1] = f14;
2667                           c[i + 1 + (j + 3) * c_dim1] = f24;
2668                           c[i + 2 + j * c_dim1] = f31;
2669                           c[i + 3 + j * c_dim1] = f41;
2670                           c[i + 2 + (j + 1) * c_dim1] = f32;
2671                           c[i + 3 + (j + 1) * c_dim1] = f42;
2672                           c[i + 2 + (j + 2) * c_dim1] = f33;
2673                           c[i + 3 + (j + 2) * c_dim1] = f43;
2674                           c[i + 2 + (j + 3) * c_dim1] = f34;
2675                           c[i + 3 + (j + 3) * c_dim1] = f44;
2676                         }
2677                       if (uisec < isec)  /* rows beyond the last full group of 4 */
2678                         {
2679                           i5 = ii + isec - 1;
2680                           for (i = ii + uisec; i <= i5; ++i)
2681                             {
2682                               f11 = c[i + j * c_dim1];
2683                               f12 = c[i + (j + 1) * c_dim1];
2684                               f13 = c[i + (j + 2) * c_dim1];
2685                               f14 = c[i + (j + 3) * c_dim1];
2686                               i6 = ll + lsec - 1;
2687                               for (l = ll; l <= i6; ++l)
2688                                 {
2689                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2690                                           257] * b[l + j * b_dim1];
2691                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2692                                           257] * b[l + (j + 1) * b_dim1];
2693                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2694                                           257] * b[l + (j + 2) * b_dim1];
2695                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2696                                           257] * b[l + (j + 3) * b_dim1];
2697                                 }
2698                               c[i + j * c_dim1] = f11;
2699                               c[i + (j + 1) * c_dim1] = f12;
2700                               c[i + (j + 2) * c_dim1] = f13;
2701                               c[i + (j + 3) * c_dim1] = f14;
2702                             }
2703                         }
2704                     }
2705                   if (ujsec < jsec)  /* columns beyond the last full group of 4 */
2706                     {
2707                       i4 = jj + jsec - 1;
2708                       for (j = jj + ujsec; j <= i4; ++j)
2709                         {
2710                           i5 = ii + uisec - 1;
2711                           for (i = ii; i <= i5; i += 4)
2712                             {
2713                               f11 = c[i + j * c_dim1];
2714                               f21 = c[i + 1 + j * c_dim1];
2715                               f31 = c[i + 2 + j * c_dim1];
2716                               f41 = c[i + 3 + j * c_dim1];
2717                               i6 = ll + lsec - 1;
2718                               for (l = ll; l <= i6; ++l)
2719                                 {
2720                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2721                                           257] * b[l + j * b_dim1];
2722                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2723                                           257] * b[l + j * b_dim1];
2724                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2725                                           257] * b[l + j * b_dim1];
2726                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2727                                           257] * b[l + j * b_dim1];
2728                                 }
2729                               c[i + j * c_dim1] = f11;
2730                               c[i + 1 + j * c_dim1] = f21;
2731                               c[i + 2 + j * c_dim1] = f31;
2732                               c[i + 3 + j * c_dim1] = f41;
2733                             }
2734                           i5 = ii + isec - 1;
2735                           for (i = ii + uisec; i <= i5; ++i)
2736                             {
2737                               f11 = c[i + j * c_dim1];
2738                               i6 = ll + lsec - 1;
2739                               for (l = ll; l <= i6; ++l)
2740                                 {
2741                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2742                                           257] * b[l + j * b_dim1];
2743                                 }
2744                               c[i + j * c_dim1] = f11;
2745                             }
2746                         }
2747                     }
2748                 }
2749             }
2750         }
2751       return;
2752     }
2753   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2754     {
2755       if (GFC_DESCRIPTOR_RANK (a) != 1)
2756         {
2757           const GFC_REAL_4 *restrict abase_x;
2758           const GFC_REAL_4 *restrict bbase_y;
2759           GFC_REAL_4 *restrict dest_y;
2760           GFC_REAL_4 s;
2761           /* aystride and bxstride are 1, so both operands are read with unit stride in n.  */
2762           for (y = 0; y < ycount; y++)
2763             {
2764               bbase_y = &bbase[y*bystride];
2765               dest_y = &dest[y*rystride];
2766               for (x = 0; x < xcount; x++)
2767                 {
2768                   abase_x = &abase[x*axstride];
2769                   s = (GFC_REAL_4) 0;
2770                   for (n = 0; n < count; n++)
2771                     s += abase_x[n] * bbase_y[n];
2772                   dest_y[x] = s;
2773                 }
2774             }
2775         }
2776       else
2777         {
2778           const GFC_REAL_4 *restrict bbase_y;
2779           GFC_REAL_4 s;
2780           /* Rank-1 A: consecutive elements are axstride apart.  */
2781           for (y = 0; y < ycount; y++)
2782             {
2783               bbase_y = &bbase[y*bystride];
2784               s = (GFC_REAL_4) 0;
2785               for (n = 0; n < count; n++)
2786                 s += abase[n*axstride] * bbase_y[n];
2787               dest[y*rystride] = s;
2788             }
2789         }
2790     }
2791   else if (axstride < aystride)
2792     {
2793       for (y = 0; y < ycount; y++)
2794         for (x = 0; x < xcount; x++)
2795           dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
2796
2797       for (y = 0; y < ycount; y++)
2798         for (n = 0; n < count; n++)
2799           for (x = 0; x < xcount; x++)
2800             /* dest[x,y] += a[x,n] * b[n,y] */
2801             dest[x*rxstride + y*rystride] +=
2802                                         abase[x*axstride + n*aystride] *
2803                                         bbase[n*bxstride + y*bystride];
2804     }
2805   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2806     {
2807       const GFC_REAL_4 *restrict bbase_y;
2808       GFC_REAL_4 s;
2809
2810       for (y = 0; y < ycount; y++)
2811         {
2812           bbase_y = &bbase[y*bystride];
2813           s = (GFC_REAL_4) 0;
2814           for (n = 0; n < count; n++)
2815             s += abase[n*axstride] * bbase_y[n*bxstride];
2816           dest[y*rxstride] = s;
2817         }
2818     }
2819   else
2820     {
2821       const GFC_REAL_4 *restrict abase_x;
2822       const GFC_REAL_4 *restrict bbase_y;
2823       GFC_REAL_4 *restrict dest_y;
2824       GFC_REAL_4 s;
2825       /* Remaining case: every access uses its stride explicitly.  */
2826       for (y = 0; y < ycount; y++)
2827         {
2828           bbase_y = &bbase[y*bystride];
2829           dest_y = &dest[y*rystride];
2830           for (x = 0; x < xcount; x++)
2831             {
2832               abase_x = &abase[x*axstride];
2833               s = (GFC_REAL_4) 0;
2834               for (n = 0; n < count; n++)
2835                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2836               dest_y[x*rxstride] = s;
2837             }
2838         }
2839     }
2840 }
2841 #undef POW3
2842 #undef min
2843 #undef max
2844
2845 #endif
2846 #endif
2847