libgfortran/generated/matmul_r16.c

   1 /* Implementation of the MATMUL intrinsic
   2    Copyright (C) 2002-2017 Free Software Foundation, Inc.
   3    Contributed by Paul Brook <paul@nowt.org>
   4
   5 This file is part of the GNU Fortran runtime library (libgfortran).
   6
   7 Libgfortran is free software; you can redistribute it and/or
   8 modify it under the terms of the GNU General Public
   9 License as published by the Free Software Foundation; either
  10 version 3 of the License, or (at your option) any later version.
  11
  12 Libgfortran is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 Under Section 7 of GPL version 3, you are granted additional
  18 permissions described in the GCC Runtime Library Exception, version
  19 3.1, as published by the Free Software Foundation.
  20
  21 You should have received a copy of the GNU General Public License and
  22 a copy of the GCC Runtime Library Exception along with this program;
  23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24 <http://www.gnu.org/licenses/>.  */
  25
  26 #include "libgfortran.h"
  27 #include <string.h>
  28 #include <assert.h>
  29
  30
  31 #if defined (HAVE_GFC_REAL_16)
  32
  33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
  34    passed to us by the front-end, in which case we call it for large
  35    matrices.  */
  36
  37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
  38                           const int *, const GFC_REAL_16 *, const GFC_REAL_16 *,
  39                           const int *, const GFC_REAL_16 *, const int *,
  40                           const GFC_REAL_16 *, GFC_REAL_16 *, const int *,
  41                           int, int);
  42
  43 /* The order of loops is different in the case of plain matrix
  44    multiplication C=MATMUL(A,B), and in the frequent special case where
  45    the argument A is the temporary result of a TRANSPOSE intrinsic:
  46    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
  47    looking at their strides.
  48
  49    The equivalent Fortran pseudo-code is:
  50
  51    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
  52    IF (.NOT.IS_TRANSPOSED(A)) THEN
  53      C = 0
  54      DO J=1,N
  55        DO K=1,COUNT
  56          DO I=1,M
  57            C(I,J) = C(I,J)+A(I,K)*B(K,J)
  58    ELSE
  59      DO J=1,N
  60        DO I=1,M
  61          S = 0
  62          DO K=1,COUNT
  63            S = S+A(I,K)*B(K,J)
  64          C(I,J) = S
  65    ENDIF
  66 */
  67
  68 /* If try_blas is set to a nonzero value, then the matmul function will
  69    see if there is a way to perform the matrix multiplication by a call
  70    to the BLAS gemm function.  */
  71
  72 extern void matmul_r16 (gfc_array_r16 * const restrict retarray,
  73         gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
  74         int blas_limit, blas_call gemm);
  75 export_proto(matmul_r16);
  76
  77 /* Put exhaustive list of possible architectures here here, ORed together.  */
  78
  79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
  80
  81 #ifdef HAVE_AVX
  82 static void
  83 matmul_r16_avx (gfc_array_r16 * const restrict retarray,
  84         gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
  85         int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
  86 static void
  87 matmul_r16_avx (gfc_array_r16 * const restrict retarray,
  88         gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
  89         int blas_limit, blas_call gemm)
  90 {
  91   const GFC_REAL_16 * restrict abase;
  92   const GFC_REAL_16 * restrict bbase;
  93   GFC_REAL_16 * restrict dest;
  94
  95   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  96   index_type x, y, n, count, xcount, ycount;
  97
  98   assert (GFC_DESCRIPTOR_RANK (a) == 2
  99           || GFC_DESCRIPTOR_RANK (b) == 2);
 100
 101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
 102
 103    Either A or B (but not both) can be rank 1:
 104
 105    o One-dimensional argument A is implicitly treated as a row matrix
 106      dimensioned [1,count], so xcount=1.
 107
 108    o One-dimensional argument B is implicitly treated as a column matrix
 109      dimensioned [count, 1], so ycount=1.
 110 */
 111
 112   if (retarray->base_addr == NULL)
 113     {
 114       if (GFC_DESCRIPTOR_RANK (a) == 1)
 115         {
 116           GFC_DIMENSION_SET(retarray->dim[0], 0,
 117                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 118         }
 119       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 120         {
 121           GFC_DIMENSION_SET(retarray->dim[0], 0,
 122                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 123         }
 124       else
 125         {
 126           GFC_DIMENSION_SET(retarray->dim[0], 0,
 127                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 128
 129           GFC_DIMENSION_SET(retarray->dim[1], 0,
 130                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 131                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 132         }
 133
 134       retarray->base_addr
 135         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
 136       retarray->offset = 0;
 137     }
 138   else if (unlikely (compile_options.bounds_check))
 139     {
 140       index_type ret_extent, arg_extent;
 141
 142       if (GFC_DESCRIPTOR_RANK (a) == 1)
 143         {
 144           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 145           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 146           if (arg_extent != ret_extent)
 147             runtime_error ("Incorrect extent in return array in"
 148                            " MATMUL intrinsic: is %ld, should be %ld",
 149                            (long int) ret_extent, (long int) arg_extent);
 150         }
 151       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 152         {
 153           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 154           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 155           if (arg_extent != ret_extent)
 156             runtime_error ("Incorrect extent in return array in"
 157                            " MATMUL intrinsic: is %ld, should be %ld",
 158                            (long int) ret_extent, (long int) arg_extent);
 159         }
 160       else
 161         {
 162           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 163           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 164           if (arg_extent != ret_extent)
 165             runtime_error ("Incorrect extent in return array in"
 166                            " MATMUL intrinsic for dimension 1:"
 167                            " is %ld, should be %ld",
 168                            (long int) ret_extent, (long int) arg_extent);
 169
 170           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 171           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 172           if (arg_extent != ret_extent)
 173             runtime_error ("Incorrect extent in return array in"
 174                            " MATMUL intrinsic for dimension 2:"
 175                            " is %ld, should be %ld",
 176                            (long int) ret_extent, (long int) arg_extent);
 177         }
 178     }
 179
 180
 181   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 182     {
 183       /* One-dimensional result may be addressed in the code below
 184          either as a row or a column matrix. We want both cases to
 185          work. */
 186       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 187     }
 188   else
 189     {
 190       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 191       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 192     }
 193
 194
 195   if (GFC_DESCRIPTOR_RANK (a) == 1)
 196     {
 197       /* Treat it as a a row matrix A[1,count]. */
 198       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 199       aystride = 1;
 200
 201       xcount = 1;
 202       count = GFC_DESCRIPTOR_EXTENT(a,0);
 203     }
 204   else
 205     {
 206       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 207       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 208
 209       count = GFC_DESCRIPTOR_EXTENT(a,1);
 210       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 211     }
 212
 213   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 214     {
 215       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 216         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
 217     }
 218
 219   if (GFC_DESCRIPTOR_RANK (b) == 1)
 220     {
 221       /* Treat it as a column matrix B[count,1] */
 222       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 223
 224       /* bystride should never be used for 1-dimensional b.
 225          in case it is we want it to cause a segfault, rather than
 226          an incorrect result. */
 227       bystride = 0xDEADBEEF;
 228       ycount = 1;
 229     }
 230   else
 231     {
 232       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 233       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 234       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 235     }
 236
 237   abase = a->base_addr;
 238   bbase = b->base_addr;
 239   dest = retarray->base_addr;
 240
 241   /* Now that everything is set up, we perform the multiplication
 242      itself.  */
 243
 244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 245 #define min(a,b) ((a) <= (b) ? (a) : (b))
 246 #define max(a,b) ((a) >= (b) ? (a) : (b))
 247
 248   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 249       && (bxstride == 1 || bystride == 1)
 250       && (((float) xcount) * ((float) ycount) * ((float) count)
 251           > POW3(blas_limit)))
 252     {
 253       const int m = xcount, n = ycount, k = count, ldc = rystride;
 254       const GFC_REAL_16 one = 1, zero = 0;
 255       const int lda = (axstride == 1) ? aystride : axstride,
 256                 ldb = (bxstride == 1) ? bystride : bxstride;
 257
 258       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 259         {
 260           assert (gemm != NULL);
 261           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
 262                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 263                 &ldc, 1, 1);
 264           return;
 265         }
 266     }
 267
 268   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 269     {
 270       /* This block of code implements a tuned matmul, derived from
 271          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 272
 273                Bo Kagstrom and Per Ling
 274                Department of Computing Science
 275                Umea University
 276                S-901 87 Umea, Sweden
 277
 278          from netlib.org, translated to C, and modified for matmul.m4.  */
 279
 280       const GFC_REAL_16 *a, *b;
 281       GFC_REAL_16 *c;
 282       const index_type m = xcount, n = ycount, k = count;
 283
 284       /* System generated locals */
 285       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 286                  i1, i2, i3, i4, i5, i6;
 287
 288       /* Local variables */
 289       GFC_REAL_16 t1[65536], /* was [256][256] */
 290                  f11, f12, f21, f22, f31, f32, f41, f42,
 291                  f13, f14, f23, f24, f33, f34, f43, f44;
 292       index_type i, j, l, ii, jj, ll;
 293       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 294
 295       a = abase;
 296       b = bbase;
 297       c = retarray->base_addr;
 298
 299       /* Parameter adjustments */
 300       c_dim1 = rystride;
 301       c_offset = 1 + c_dim1;
 302       c -= c_offset;
 303       a_dim1 = aystride;
 304       a_offset = 1 + a_dim1;
 305       a -= a_offset;
 306       b_dim1 = bystride;
 307       b_offset = 1 + b_dim1;
 308       b -= b_offset;
 309
 310       /* Early exit if possible */
 311       if (m == 0 || n == 0 || k == 0)
 312         return;
 313
 314       /* Empty c first.  */
 315       for (j=1; j<=n; j++)
 316         for (i=1; i<=m; i++)
 317           c[i + j * c_dim1] = (GFC_REAL_16)0;
 318
 319       /* Start turning the crank. */
 320       i1 = n;
 321       for (jj = 1; jj <= i1; jj += 512)
 322         {
 323           /* Computing MIN */
 324           i2 = 512;
 325           i3 = n - jj + 1;
 326           jsec = min(i2,i3);
 327           ujsec = jsec - jsec % 4;
 328           i2 = k;
 329           for (ll = 1; ll <= i2; ll += 256)
 330             {
 331               /* Computing MIN */
 332               i3 = 256;
 333               i4 = k - ll + 1;
 334               lsec = min(i3,i4);
 335               ulsec = lsec - lsec % 2;
 336
 337               i3 = m;
 338               for (ii = 1; ii <= i3; ii += 256)
 339                 {
 340                   /* Computing MIN */
 341                   i4 = 256;
 342                   i5 = m - ii + 1;
 343                   isec = min(i4,i5);
 344                   uisec = isec - isec % 2;
 345                   i4 = ll + ulsec - 1;
 346                   for (l = ll; l <= i4; l += 2)
 347                     {
 348                       i5 = ii + uisec - 1;
 349                       for (i = ii; i <= i5; i += 2)
 350                         {
 351                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 352                                         a[i + l * a_dim1];
 353                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 354                                         a[i + (l + 1) * a_dim1];
 355                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 356                                         a[i + 1 + l * a_dim1];
 357                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 358                                         a[i + 1 + (l + 1) * a_dim1];
 359                         }
 360                       if (uisec < isec)
 361                         {
 362                           t1[l - ll + 1 + (isec << 8) - 257] =
 363                                     a[ii + isec - 1 + l * a_dim1];
 364                           t1[l - ll + 2 + (isec << 8) - 257] =
 365                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 366                         }
 367                     }
 368                   if (ulsec < lsec)
 369                     {
 370                       i4 = ii + isec - 1;
 371                       for (i = ii; i<= i4; ++i)
 372                         {
 373                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 374                                     a[i + (ll + lsec - 1) * a_dim1];
 375                         }
 376                     }
 377
 378                   uisec = isec - isec % 4;
 379                   i4 = jj + ujsec - 1;
 380                   for (j = jj; j <= i4; j += 4)
 381                     {
 382                       i5 = ii + uisec - 1;
 383                       for (i = ii; i <= i5; i += 4)
 384                         {
 385                           f11 = c[i + j * c_dim1];
 386                           f21 = c[i + 1 + j * c_dim1];
 387                           f12 = c[i + (j + 1) * c_dim1];
 388                           f22 = c[i + 1 + (j + 1) * c_dim1];
 389                           f13 = c[i + (j + 2) * c_dim1];
 390                           f23 = c[i + 1 + (j + 2) * c_dim1];
 391                           f14 = c[i + (j + 3) * c_dim1];
 392                           f24 = c[i + 1 + (j + 3) * c_dim1];
 393                           f31 = c[i + 2 + j * c_dim1];
 394                           f41 = c[i + 3 + j * c_dim1];
 395                           f32 = c[i + 2 + (j + 1) * c_dim1];
 396                           f42 = c[i + 3 + (j + 1) * c_dim1];
 397                           f33 = c[i + 2 + (j + 2) * c_dim1];
 398                           f43 = c[i + 3 + (j + 2) * c_dim1];
 399                           f34 = c[i + 2 + (j + 3) * c_dim1];
 400                           f44 = c[i + 3 + (j + 3) * c_dim1];
 401                           i6 = ll + lsec - 1;
 402                           for (l = ll; l <= i6; ++l)
 403                             {
 404                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 405                                       * b[l + j * b_dim1];
 406                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 407                                       * b[l + j * b_dim1];
 408                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 409                                       * b[l + (j + 1) * b_dim1];
 410                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 411                                       * b[l + (j + 1) * b_dim1];
 412                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 413                                       * b[l + (j + 2) * b_dim1];
 414                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 415                                       * b[l + (j + 2) * b_dim1];
 416                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 417                                       * b[l + (j + 3) * b_dim1];
 418                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 419                                       * b[l + (j + 3) * b_dim1];
 420                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 421                                       * b[l + j * b_dim1];
 422                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 423                                       * b[l + j * b_dim1];
 424                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 425                                       * b[l + (j + 1) * b_dim1];
 426                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 427                                       * b[l + (j + 1) * b_dim1];
 428                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 429                                       * b[l + (j + 2) * b_dim1];
 430                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 431                                       * b[l + (j + 2) * b_dim1];
 432                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 433                                       * b[l + (j + 3) * b_dim1];
 434                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 435                                       * b[l + (j + 3) * b_dim1];
 436                             }
 437                           c[i + j * c_dim1] = f11;
 438                           c[i + 1 + j * c_dim1] = f21;
 439                           c[i + (j + 1) * c_dim1] = f12;
 440                           c[i + 1 + (j + 1) * c_dim1] = f22;
 441                           c[i + (j + 2) * c_dim1] = f13;
 442                           c[i + 1 + (j + 2) * c_dim1] = f23;
 443                           c[i + (j + 3) * c_dim1] = f14;
 444                           c[i + 1 + (j + 3) * c_dim1] = f24;
 445                           c[i + 2 + j * c_dim1] = f31;
 446                           c[i + 3 + j * c_dim1] = f41;
 447                           c[i + 2 + (j + 1) * c_dim1] = f32;
 448                           c[i + 3 + (j + 1) * c_dim1] = f42;
 449                           c[i + 2 + (j + 2) * c_dim1] = f33;
 450                           c[i + 3 + (j + 2) * c_dim1] = f43;
 451                           c[i + 2 + (j + 3) * c_dim1] = f34;
 452                           c[i + 3 + (j + 3) * c_dim1] = f44;
 453                         }
 454                       if (uisec < isec)
 455                         {
 456                           i5 = ii + isec - 1;
 457                           for (i = ii + uisec; i <= i5; ++i)
 458                             {
 459                               f11 = c[i + j * c_dim1];
 460                               f12 = c[i + (j + 1) * c_dim1];
 461                               f13 = c[i + (j + 2) * c_dim1];
 462                               f14 = c[i + (j + 3) * c_dim1];
 463                               i6 = ll + lsec - 1;
 464                               for (l = ll; l <= i6; ++l)
 465                                 {
 466                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 467                                           257] * b[l + j * b_dim1];
 468                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 469                                           257] * b[l + (j + 1) * b_dim1];
 470                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 471                                           257] * b[l + (j + 2) * b_dim1];
 472                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 473                                           257] * b[l + (j + 3) * b_dim1];
 474                                 }
 475                               c[i + j * c_dim1] = f11;
 476                               c[i + (j + 1) * c_dim1] = f12;
 477                               c[i + (j + 2) * c_dim1] = f13;
 478                               c[i + (j + 3) * c_dim1] = f14;
 479                             }
 480                         }
 481                     }
 482                   if (ujsec < jsec)
 483                     {
 484                       i4 = jj + jsec - 1;
 485                       for (j = jj + ujsec; j <= i4; ++j)
 486                         {
 487                           i5 = ii + uisec - 1;
 488                           for (i = ii; i <= i5; i += 4)
 489                             {
 490                               f11 = c[i + j * c_dim1];
 491                               f21 = c[i + 1 + j * c_dim1];
 492                               f31 = c[i + 2 + j * c_dim1];
 493                               f41 = c[i + 3 + j * c_dim1];
 494                               i6 = ll + lsec - 1;
 495                               for (l = ll; l <= i6; ++l)
 496                                 {
 497                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 498                                           257] * b[l + j * b_dim1];
 499                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
 500                                           257] * b[l + j * b_dim1];
 501                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
 502                                           257] * b[l + j * b_dim1];
 503                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
 504                                           257] * b[l + j * b_dim1];
 505                                 }
 506                               c[i + j * c_dim1] = f11;
 507                               c[i + 1 + j * c_dim1] = f21;
 508                               c[i + 2 + j * c_dim1] = f31;
 509                               c[i + 3 + j * c_dim1] = f41;
 510                             }
 511                           i5 = ii + isec - 1;
 512                           for (i = ii + uisec; i <= i5; ++i)
 513                             {
 514                               f11 = c[i + j * c_dim1];
 515                               i6 = ll + lsec - 1;
 516                               for (l = ll; l <= i6; ++l)
 517                                 {
 518                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 519                                           257] * b[l + j * b_dim1];
 520                                 }
 521                               c[i + j * c_dim1] = f11;
 522                             }
 523                         }
 524                     }
 525                 }
 526             }
 527         }
 528       return;
 529     }
 530   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
 531     {
 532       if (GFC_DESCRIPTOR_RANK (a) != 1)
 533         {
 534           const GFC_REAL_16 *restrict abase_x;
 535           const GFC_REAL_16 *restrict bbase_y;
 536           GFC_REAL_16 *restrict dest_y;
 537           GFC_REAL_16 s;
 538
 539           for (y = 0; y < ycount; y++)
 540             {
 541               bbase_y = &bbase[y*bystride];
 542               dest_y = &dest[y*rystride];
 543               for (x = 0; x < xcount; x++)
 544                 {
 545                   abase_x = &abase[x*axstride];
 546                   s = (GFC_REAL_16) 0;
 547                   for (n = 0; n < count; n++)
 548                     s += abase_x[n] * bbase_y[n];
 549                   dest_y[x] = s;
 550                 }
 551             }
 552         }
 553       else
 554         {
 555           const GFC_REAL_16 *restrict bbase_y;
 556           GFC_REAL_16 s;
 557
 558           for (y = 0; y < ycount; y++)
 559             {
 560               bbase_y = &bbase[y*bystride];
 561               s = (GFC_REAL_16) 0;
 562               for (n = 0; n < count; n++)
 563                 s += abase[n*axstride] * bbase_y[n];
 564               dest[y*rystride] = s;
 565             }
 566         }
 567     }
 568   else if (axstride < aystride)
 569     {
 570       for (y = 0; y < ycount; y++)
 571         for (x = 0; x < xcount; x++)
 572           dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
 573
 574       for (y = 0; y < ycount; y++)
 575         for (n = 0; n < count; n++)
 576           for (x = 0; x < xcount; x++)
 577             /* dest[x,y] += a[x,n] * b[n,y] */
 578             dest[x*rxstride + y*rystride] +=
 579                                         abase[x*axstride + n*aystride] *
 580                                         bbase[n*bxstride + y*bystride];
 581     }
 582   else if (GFC_DESCRIPTOR_RANK (a) == 1)
 583     {
 584       const GFC_REAL_16 *restrict bbase_y;
 585       GFC_REAL_16 s;
 586
 587       for (y = 0; y < ycount; y++)
 588         {
 589           bbase_y = &bbase[y*bystride];
 590           s = (GFC_REAL_16) 0;
 591           for (n = 0; n < count; n++)
 592             s += abase[n*axstride] * bbase_y[n*bxstride];
 593           dest[y*rxstride] = s;
 594         }
 595     }
 596   else
 597     {
 598       const GFC_REAL_16 *restrict abase_x;
 599       const GFC_REAL_16 *restrict bbase_y;
 600       GFC_REAL_16 *restrict dest_y;
 601       GFC_REAL_16 s;
 602
 603       for (y = 0; y < ycount; y++)
 604         {
 605           bbase_y = &bbase[y*bystride];
 606           dest_y = &dest[y*rystride];
 607           for (x = 0; x < xcount; x++)
 608             {
 609               abase_x = &abase[x*axstride];
 610               s = (GFC_REAL_16) 0;
 611               for (n = 0; n < count; n++)
 612                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
 613               dest_y[x*rxstride] = s;
 614             }
 615         }
 616     }
 617 }
 618 #undef POW3
 619 #undef min
 620 #undef max
 621
 622 #endif /* HAVE_AVX */
 623
 624 #ifdef HAVE_AVX2
 625 static void
 626 matmul_r16_avx2 (gfc_array_r16 * const restrict retarray,
 627         gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
 628         int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 629 static void
 630 matmul_r16_avx2 (gfc_array_r16 * const restrict retarray,
 631         gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
 632         int blas_limit, blas_call gemm)
 633 {
 634   const GFC_REAL_16 * restrict abase;
 635   const GFC_REAL_16 * restrict bbase;
 636   GFC_REAL_16 * restrict dest;
 637
 638   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
 639   index_type x, y, n, count, xcount, ycount;
 640
 641   assert (GFC_DESCRIPTOR_RANK (a) == 2
 642           || GFC_DESCRIPTOR_RANK (b) == 2);
 643
 644 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
 645
 646    Either A or B (but not both) can be rank 1:
 647
 648    o One-dimensional argument A is implicitly treated as a row matrix
 649      dimensioned [1,count], so xcount=1.
 650
 651    o One-dimensional argument B is implicitly treated as a column matrix
 652      dimensioned [count, 1], so ycount=1.
 653 */
 654
 655   if (retarray->base_addr == NULL)
 656     {
 657       if (GFC_DESCRIPTOR_RANK (a) == 1)
 658         {
 659           GFC_DIMENSION_SET(retarray->dim[0], 0,
 660                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 661         }
 662       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 663         {
 664           GFC_DIMENSION_SET(retarray->dim[0], 0,
 665                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 666         }
 667       else
 668         {
 669           GFC_DIMENSION_SET(retarray->dim[0], 0,
 670                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 671
 672           GFC_DIMENSION_SET(retarray->dim[1], 0,
 673                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 674                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 675         }
 676
 677       retarray->base_addr
 678         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
 679       retarray->offset = 0;
 680     }
 681   else if (unlikely (compile_options.bounds_check))
 682     {
 683       index_type ret_extent, arg_extent;
 684
 685       if (GFC_DESCRIPTOR_RANK (a) == 1)
 686         {
 687           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 688           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 689           if (arg_extent != ret_extent)
 690             runtime_error ("Incorrect extent in return array in"
 691                            " MATMUL intrinsic: is %ld, should be %ld",
 692                            (long int) ret_extent, (long int) arg_extent);
 693         }
 694       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 695         {
 696           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 697           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 698           if (arg_extent != ret_extent)
 699             runtime_error ("Incorrect extent in return array in"
 700                            " MATMUL intrinsic: is %ld, should be %ld",
 701                            (long int) ret_extent, (long int) arg_extent);
 702         }
 703       else
 704         {
 705           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 706           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 707           if (arg_extent != ret_extent)
 708             runtime_error ("Incorrect extent in return array in"
 709                            " MATMUL intrinsic for dimension 1:"
 710                            " is %ld, should be %ld",
 711                            (long int) ret_extent, (long int) arg_extent);
 712
 713           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 714           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 715           if (arg_extent != ret_extent)
 716             runtime_error ("Incorrect extent in return array in"
 717                            " MATMUL intrinsic for dimension 2:"
 718                            " is %ld, should be %ld",
 719                            (long int) ret_extent, (long int) arg_extent);
 720         }
 721     }
 722
 723
 724   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 725     {
 726       /* One-dimensional result may be addressed in the code below
 727          either as a row or a column matrix. We want both cases to
 728          work. */
 729       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 730     }
 731   else
 732     {
 733       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 734       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 735     }
 736
 737
 738   if (GFC_DESCRIPTOR_RANK (a) == 1)
 739     {
 740       /* Treat it as a a row matrix A[1,count]. */
 741       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 742       aystride = 1;
 743
 744       xcount = 1;
 745       count = GFC_DESCRIPTOR_EXTENT(a,0);
 746     }
 747   else
 748     {
 749       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 750       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 751
 752       count = GFC_DESCRIPTOR_EXTENT(a,1);
 753       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 754     }
 755
 756   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 757     {
 758       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 759         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
 760     }
 761
 762   if (GFC_DESCRIPTOR_RANK (b) == 1)
 763     {
 764       /* Treat it as a column matrix B[count,1] */
 765       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 766
 767       /* bystride should never be used for 1-dimensional b.
 768          in case it is we want it to cause a segfault, rather than
 769          an incorrect result. */
 770       bystride = 0xDEADBEEF;
 771       ycount = 1;
 772     }
 773   else
 774     {
 775       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 776       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 777       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 778     }
 779
 780   abase = a->base_addr;
 781   bbase = b->base_addr;
 782   dest = retarray->base_addr;
 783
 784   /* Now that everything is set up, we perform the multiplication
 785      itself.  */
 786
 787 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 788 #define min(a,b) ((a) <= (b) ? (a) : (b))
 789 #define max(a,b) ((a) >= (b) ? (a) : (b))
 790
 791   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 792       && (bxstride == 1 || bystride == 1)
 793       && (((float) xcount) * ((float) ycount) * ((float) count)
 794           > POW3(blas_limit)))
 795     {
 796       const int m = xcount, n = ycount, k = count, ldc = rystride;
 797       const GFC_REAL_16 one = 1, zero = 0;
 798       const int lda = (axstride == 1) ? aystride : axstride,
 799                 ldb = (bxstride == 1) ? bystride : bxstride;
 800
 801       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 802         {
 803           assert (gemm != NULL);
 804           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
 805                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 806                 &ldc, 1, 1);
 807           return;
 808         }
 809     }
 810
 811   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 812     {
 813       /* This block of code implements a tuned matmul, derived from
 814          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 815
 816                Bo Kagstrom and Per Ling
 817                Department of Computing Science
 818                Umea University
 819                S-901 87 Umea, Sweden
 820
 821          from netlib.org, translated to C, and modified for matmul.m4.  */
 822
 823       const GFC_REAL_16 *a, *b;
 824       GFC_REAL_16 *c;
 825       const index_type m = xcount, n = ycount, k = count;
 826
 827       /* System generated locals */
 828       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 829                  i1, i2, i3, i4, i5, i6;
 830
 831       /* Local variables */
 832       GFC_REAL_16 t1[65536], /* was [256][256] */
 833                  f11, f12, f21, f22, f31, f32, f41, f42,
 834                  f13, f14, f23, f24, f33, f34, f43, f44;
 835       index_type i, j, l, ii, jj, ll;
 836       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 837
 838       a = abase;
 839       b = bbase;
 840       c = retarray->base_addr;
 841
 842       /* Parameter adjustments */
 843       c_dim1 = rystride;
 844       c_offset = 1 + c_dim1;
 845       c -= c_offset;
 846       a_dim1 = aystride;
 847       a_offset = 1 + a_dim1;
 848       a -= a_offset;
 849       b_dim1 = bystride;
 850       b_offset = 1 + b_dim1;
 851       b -= b_offset;
 852
 853       /* Early exit if possible */
 854       if (m == 0 || n == 0 || k == 0)
 855         return;
 856
 857       /* Empty c first.  */
 858       for (j=1; j<=n; j++)
 859         for (i=1; i<=m; i++)
 860           c[i + j * c_dim1] = (GFC_REAL_16)0;
 861
 862       /* Start turning the crank. */
 863       i1 = n;
 864       for (jj = 1; jj <= i1; jj += 512)
 865         {
 866           /* Computing MIN */
 867           i2 = 512;
 868           i3 = n - jj + 1;
 869           jsec = min(i2,i3);
 870           ujsec = jsec - jsec % 4;
 871           i2 = k;
 872           for (ll = 1; ll <= i2; ll += 256)
 873             {
 874               /* Computing MIN */
 875               i3 = 256;
 876               i4 = k - ll + 1;
 877               lsec = min(i3,i4);
 878               ulsec = lsec - lsec % 2;
 879
 880               i3 = m;
 881               for (ii = 1; ii <= i3; ii += 256)
 882                 {
 883                   /* Computing MIN */
 884                   i4 = 256;
 885                   i5 = m - ii + 1;
 886                   isec = min(i4,i5);
 887                   uisec = isec - isec % 2;
 888                   i4 = ll + ulsec - 1;
 889                   for (l = ll; l <= i4; l += 2)
 890                     {
 891                       i5 = ii + uisec - 1;
 892                       for (i = ii; i <= i5; i += 2)
 893                         {
 894                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 895                                         a[i + l * a_dim1];
 896                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 897                                         a[i + (l + 1) * a_dim1];
 898                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 899                                         a[i + 1 + l * a_dim1];
 900                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 901                                         a[i + 1 + (l + 1) * a_dim1];
 902                         }
 903                       if (uisec < isec)
 904                         {
 905                           t1[l - ll + 1 + (isec << 8) - 257] =
 906                                     a[ii + isec - 1 + l * a_dim1];
 907                           t1[l - ll + 2 + (isec << 8) - 257] =
 908                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 909                         }
 910                     }
 911                   if (ulsec < lsec)
 912                     {
 913                       i4 = ii + isec - 1;
 914                       for (i = ii; i<= i4; ++i)
 915                         {
 916                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 917                                     a[i + (ll + lsec - 1) * a_dim1];
 918                         }
 919                     }
 920
 921                   uisec = isec - isec % 4;
 922                   i4 = jj + ujsec - 1;
 923                   for (j = jj; j <= i4; j += 4)
 924                     {
 925                       i5 = ii + uisec - 1;
 926                       for (i = ii; i <= i5; i += 4)
 927                         {
 928                           f11 = c[i + j * c_dim1];
 929                           f21 = c[i + 1 + j * c_dim1];
 930                           f12 = c[i + (j + 1) * c_dim1];
 931                           f22 = c[i + 1 + (j + 1) * c_dim1];
 932                           f13 = c[i + (j + 2) * c_dim1];
 933                           f23 = c[i + 1 + (j + 2) * c_dim1];
 934                           f14 = c[i + (j + 3) * c_dim1];
 935                           f24 = c[i + 1 + (j + 3) * c_dim1];
 936                           f31 = c[i + 2 + j * c_dim1];
 937                           f41 = c[i + 3 + j * c_dim1];
 938                           f32 = c[i + 2 + (j + 1) * c_dim1];
 939                           f42 = c[i + 3 + (j + 1) * c_dim1];
 940                           f33 = c[i + 2 + (j + 2) * c_dim1];
 941                           f43 = c[i + 3 + (j + 2) * c_dim1];
 942                           f34 = c[i + 2 + (j + 3) * c_dim1];
 943                           f44 = c[i + 3 + (j + 3) * c_dim1];
 944                           i6 = ll + lsec - 1;
 945                           for (l = ll; l <= i6; ++l)
 946                             {
 947                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 948                                       * b[l + j * b_dim1];
 949                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 950                                       * b[l + j * b_dim1];
 951                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 952                                       * b[l + (j + 1) * b_dim1];
 953                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 954                                       * b[l + (j + 1) * b_dim1];
 955                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 956                                       * b[l + (j + 2) * b_dim1];
 957                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 958                                       * b[l + (j + 2) * b_dim1];
 959                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 960                                       * b[l + (j + 3) * b_dim1];
 961                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 962                                       * b[l + (j + 3) * b_dim1];
 963                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 964                                       * b[l + j * b_dim1];
 965                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 966                                       * b[l + j * b_dim1];
 967                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 968                                       * b[l + (j + 1) * b_dim1];
 969                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 970                                       * b[l + (j + 1) * b_dim1];
 971                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 972                                       * b[l + (j + 2) * b_dim1];
 973                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 974                                       * b[l + (j + 2) * b_dim1];
 975                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 976                                       * b[l + (j + 3) * b_dim1];
 977                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 978                                       * b[l + (j + 3) * b_dim1];
 979                             }
 980                           c[i + j * c_dim1] = f11;
 981                           c[i + 1 + j * c_dim1] = f21;
 982                           c[i + (j + 1) * c_dim1] = f12;
 983                           c[i + 1 + (j + 1) * c_dim1] = f22;
 984                           c[i + (j + 2) * c_dim1] = f13;
 985                           c[i + 1 + (j + 2) * c_dim1] = f23;
 986                           c[i + (j + 3) * c_dim1] = f14;
 987                           c[i + 1 + (j + 3) * c_dim1] = f24;
 988                           c[i + 2 + j * c_dim1] = f31;
 989                           c[i + 3 + j * c_dim1] = f41;
 990                           c[i + 2 + (j + 1) * c_dim1] = f32;
 991                           c[i + 3 + (j + 1) * c_dim1] = f42;
 992                           c[i + 2 + (j + 2) * c_dim1] = f33;
 993                           c[i + 3 + (j + 2) * c_dim1] = f43;
 994                           c[i + 2 + (j + 3) * c_dim1] = f34;
 995                           c[i + 3 + (j + 3) * c_dim1] = f44;
 996                         }
 997                       if (uisec < isec)
 998                         {
 999                           i5 = ii + isec - 1;
1000                           for (i = ii + uisec; i <= i5; ++i)
1001                             {
1002                               f11 = c[i + j * c_dim1];
1003                               f12 = c[i + (j + 1) * c_dim1];
1004                               f13 = c[i + (j + 2) * c_dim1];
1005                               f14 = c[i + (j + 3) * c_dim1];
1006                               i6 = ll + lsec - 1;
1007                               for (l = ll; l <= i6; ++l)
1008                                 {
1009                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1010                                           257] * b[l + j * b_dim1];
1011                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1012                                           257] * b[l + (j + 1) * b_dim1];
1013                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1014                                           257] * b[l + (j + 2) * b_dim1];
1015                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1016                                           257] * b[l + (j + 3) * b_dim1];
1017                                 }
1018                               c[i + j * c_dim1] = f11;
1019                               c[i + (j + 1) * c_dim1] = f12;
1020                               c[i + (j + 2) * c_dim1] = f13;
1021                               c[i + (j + 3) * c_dim1] = f14;
1022                             }
1023                         }
1024                     }
1025                   if (ujsec < jsec)
1026                     {
1027                       i4 = jj + jsec - 1;
1028                       for (j = jj + ujsec; j <= i4; ++j)
1029                         {
1030                           i5 = ii + uisec - 1;
1031                           for (i = ii; i <= i5; i += 4)
1032                             {
1033                               f11 = c[i + j * c_dim1];
1034                               f21 = c[i + 1 + j * c_dim1];
1035                               f31 = c[i + 2 + j * c_dim1];
1036                               f41 = c[i + 3 + j * c_dim1];
1037                               i6 = ll + lsec - 1;
1038                               for (l = ll; l <= i6; ++l)
1039                                 {
1040                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1041                                           257] * b[l + j * b_dim1];
1042                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1043                                           257] * b[l + j * b_dim1];
1044                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1045                                           257] * b[l + j * b_dim1];
1046                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1047                                           257] * b[l + j * b_dim1];
1048                                 }
1049                               c[i + j * c_dim1] = f11;
1050                               c[i + 1 + j * c_dim1] = f21;
1051                               c[i + 2 + j * c_dim1] = f31;
1052                               c[i + 3 + j * c_dim1] = f41;
1053                             }
1054                           i5 = ii + isec - 1;
1055                           for (i = ii + uisec; i <= i5; ++i)
1056                             {
1057                               f11 = c[i + j * c_dim1];
1058                               i6 = ll + lsec - 1;
1059                               for (l = ll; l <= i6; ++l)
1060                                 {
1061                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1062                                           257] * b[l + j * b_dim1];
1063                                 }
1064                               c[i + j * c_dim1] = f11;
1065                             }
1066                         }
1067                     }
1068                 }
1069             }
1070         }
1071       return;
1072     }
1073   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1074     {
1075       if (GFC_DESCRIPTOR_RANK (a) != 1)
1076         {
1077           const GFC_REAL_16 *restrict abase_x;
1078           const GFC_REAL_16 *restrict bbase_y;
1079           GFC_REAL_16 *restrict dest_y;
1080           GFC_REAL_16 s;
1081
1082           for (y = 0; y < ycount; y++)
1083             {
1084               bbase_y = &bbase[y*bystride];
1085               dest_y = &dest[y*rystride];
1086               for (x = 0; x < xcount; x++)
1087                 {
1088                   abase_x = &abase[x*axstride];
1089                   s = (GFC_REAL_16) 0;
1090                   for (n = 0; n < count; n++)
1091                     s += abase_x[n] * bbase_y[n];
1092                   dest_y[x] = s;
1093                 }
1094             }
1095         }
1096       else
1097         {
1098           const GFC_REAL_16 *restrict bbase_y;
1099           GFC_REAL_16 s;
1100
1101           for (y = 0; y < ycount; y++)
1102             {
1103               bbase_y = &bbase[y*bystride];
1104               s = (GFC_REAL_16) 0;
1105               for (n = 0; n < count; n++)
1106                 s += abase[n*axstride] * bbase_y[n];
1107               dest[y*rystride] = s;
1108             }
1109         }
1110     }
1111   else if (axstride < aystride)
1112     {
1113       for (y = 0; y < ycount; y++)
1114         for (x = 0; x < xcount; x++)
1115           dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
1116
1117       for (y = 0; y < ycount; y++)
1118         for (n = 0; n < count; n++)
1119           for (x = 0; x < xcount; x++)
1120             /* dest[x,y] += a[x,n] * b[n,y] */
1121             dest[x*rxstride + y*rystride] +=
1122                                         abase[x*axstride + n*aystride] *
1123                                         bbase[n*bxstride + y*bystride];
1124     }
1125   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1126     {
1127       const GFC_REAL_16 *restrict bbase_y;
1128       GFC_REAL_16 s;
1129
1130       for (y = 0; y < ycount; y++)
1131         {
1132           bbase_y = &bbase[y*bystride];
1133           s = (GFC_REAL_16) 0;
1134           for (n = 0; n < count; n++)
1135             s += abase[n*axstride] * bbase_y[n*bxstride];
1136           dest[y*rxstride] = s;
1137         }
1138     }
1139   else
1140     {
1141       const GFC_REAL_16 *restrict abase_x;
1142       const GFC_REAL_16 *restrict bbase_y;
1143       GFC_REAL_16 *restrict dest_y;
1144       GFC_REAL_16 s;
1145
1146       for (y = 0; y < ycount; y++)
1147         {
1148           bbase_y = &bbase[y*bystride];
1149           dest_y = &dest[y*rystride];
1150           for (x = 0; x < xcount; x++)
1151             {
1152               abase_x = &abase[x*axstride];
1153               s = (GFC_REAL_16) 0;
1154               for (n = 0; n < count; n++)
1155                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1156               dest_y[x*rxstride] = s;
1157             }
1158         }
1159     }
1160 }
1161 #undef POW3
1162 #undef min
1163 #undef max
1164
1165 #endif /* HAVE_AVX2 */
1166
1167 #ifdef HAVE_AVX512F
1168 static void
1169 matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
1170         gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
1171         int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1172 static void
1173 matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
1174         gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
1175         int blas_limit, blas_call gemm)
1176 {
1177   const GFC_REAL_16 * restrict abase;
1178   const GFC_REAL_16 * restrict bbase;
1179   GFC_REAL_16 * restrict dest;
1180
1181   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1182   index_type x, y, n, count, xcount, ycount;
1183
1184   assert (GFC_DESCRIPTOR_RANK (a) == 2
1185           || GFC_DESCRIPTOR_RANK (b) == 2);
1186
1187 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1188
1189    Either A or B (but not both) can be rank 1:
1190
1191    o One-dimensional argument A is implicitly treated as a row matrix
1192      dimensioned [1,count], so xcount=1.
1193
1194    o One-dimensional argument B is implicitly treated as a column matrix
1195      dimensioned [count, 1], so ycount=1.
1196 */
1197
1198   if (retarray->base_addr == NULL)
1199     {
1200       if (GFC_DESCRIPTOR_RANK (a) == 1)
1201         {
1202           GFC_DIMENSION_SET(retarray->dim[0], 0,
1203                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1204         }
1205       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1206         {
1207           GFC_DIMENSION_SET(retarray->dim[0], 0,
1208                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1209         }
1210       else
1211         {
1212           GFC_DIMENSION_SET(retarray->dim[0], 0,
1213                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1214
1215           GFC_DIMENSION_SET(retarray->dim[1], 0,
1216                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1217                             GFC_DESCRIPTOR_EXTENT(retarray,0));
1218         }
1219
1220       retarray->base_addr
1221         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
1222       retarray->offset = 0;
1223     }
1224   else if (unlikely (compile_options.bounds_check))
1225     {
1226       index_type ret_extent, arg_extent;
1227
1228       if (GFC_DESCRIPTOR_RANK (a) == 1)
1229         {
1230           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1231           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1232           if (arg_extent != ret_extent)
1233             runtime_error ("Incorrect extent in return array in"
1234                            " MATMUL intrinsic: is %ld, should be %ld",
1235                            (long int) ret_extent, (long int) arg_extent);
1236         }
1237       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1238         {
1239           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1240           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1241           if (arg_extent != ret_extent)
1242             runtime_error ("Incorrect extent in return array in"
1243                            " MATMUL intrinsic: is %ld, should be %ld",
1244                            (long int) ret_extent, (long int) arg_extent);
1245         }
1246       else
1247         {
1248           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1249           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1250           if (arg_extent != ret_extent)
1251             runtime_error ("Incorrect extent in return array in"
1252                            " MATMUL intrinsic for dimension 1:"
1253                            " is %ld, should be %ld",
1254                            (long int) ret_extent, (long int) arg_extent);
1255
1256           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1257           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1258           if (arg_extent != ret_extent)
1259             runtime_error ("Incorrect extent in return array in"
1260                            " MATMUL intrinsic for dimension 2:"
1261                            " is %ld, should be %ld",
1262                            (long int) ret_extent, (long int) arg_extent);
1263         }
1264     }
1265
1266
1267   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1268     {
1269       /* One-dimensional result may be addressed in the code below
1270          either as a row or a column matrix. We want both cases to
1271          work. */
1272       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1273     }
1274   else
1275     {
1276       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1277       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1278     }
1279
1280
1281   if (GFC_DESCRIPTOR_RANK (a) == 1)
1282     {
1283       /* Treat it as a a row matrix A[1,count]. */
1284       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1285       aystride = 1;
1286
1287       xcount = 1;
1288       count = GFC_DESCRIPTOR_EXTENT(a,0);
1289     }
1290   else
1291     {
1292       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1293       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1294
1295       count = GFC_DESCRIPTOR_EXTENT(a,1);
1296       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1297     }
1298
1299   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1300     {
1301       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1302         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1303     }
1304
1305   if (GFC_DESCRIPTOR_RANK (b) == 1)
1306     {
1307       /* Treat it as a column matrix B[count,1] */
1308       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1309
1310       /* bystride should never be used for 1-dimensional b.
1311          in case it is we want it to cause a segfault, rather than
1312          an incorrect result. */
1313       bystride = 0xDEADBEEF;
1314       ycount = 1;
1315     }
1316   else
1317     {
1318       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1319       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1320       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1321     }
1322
1323   abase = a->base_addr;
1324   bbase = b->base_addr;
1325   dest = retarray->base_addr;
1326
1327   /* Now that everything is set up, we perform the multiplication
1328      itself.  */
1329
1330 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1331 #define min(a,b) ((a) <= (b) ? (a) : (b))
1332 #define max(a,b) ((a) >= (b) ? (a) : (b))
1333
1334   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1335       && (bxstride == 1 || bystride == 1)
1336       && (((float) xcount) * ((float) ycount) * ((float) count)
1337           > POW3(blas_limit)))
1338     {
1339       const int m = xcount, n = ycount, k = count, ldc = rystride;
1340       const GFC_REAL_16 one = 1, zero = 0;
1341       const int lda = (axstride == 1) ? aystride : axstride,
1342                 ldb = (bxstride == 1) ? bystride : bxstride;
1343
1344       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1345         {
1346           assert (gemm != NULL);
1347           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1348                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1349                 &ldc, 1, 1);
1350           return;
1351         }
1352     }
1353
1354   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1355     {
1356       /* This block of code implements a tuned matmul, derived from
1357          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1358
1359                Bo Kagstrom and Per Ling
1360                Department of Computing Science
1361                Umea University
1362                S-901 87 Umea, Sweden
1363
1364          from netlib.org, translated to C, and modified for matmul.m4.  */
1365
1366       const GFC_REAL_16 *a, *b;
1367       GFC_REAL_16 *c;
1368       const index_type m = xcount, n = ycount, k = count;
1369
1370       /* System generated locals */
1371       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1372                  i1, i2, i3, i4, i5, i6;
1373
1374       /* Local variables */
1375       GFC_REAL_16 t1[65536], /* was [256][256] */
1376                  f11, f12, f21, f22, f31, f32, f41, f42,
1377                  f13, f14, f23, f24, f33, f34, f43, f44;
1378       index_type i, j, l, ii, jj, ll;
1379       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1380
1381       a = abase;
1382       b = bbase;
1383       c = retarray->base_addr;
1384
1385       /* Parameter adjustments */
1386       c_dim1 = rystride;
1387       c_offset = 1 + c_dim1;
1388       c -= c_offset;
1389       a_dim1 = aystride;
1390       a_offset = 1 + a_dim1;
1391       a -= a_offset;
1392       b_dim1 = bystride;
1393       b_offset = 1 + b_dim1;
1394       b -= b_offset;
1395
1396       /* Early exit if possible */
1397       if (m == 0 || n == 0 || k == 0)
1398         return;
1399
1400       /* Empty c first.  */
1401       for (j=1; j<=n; j++)
1402         for (i=1; i<=m; i++)
1403           c[i + j * c_dim1] = (GFC_REAL_16)0;
1404
1405       /* Start turning the crank. */
1406       i1 = n;
1407       for (jj = 1; jj <= i1; jj += 512)
1408         {
1409           /* Computing MIN */
1410           i2 = 512;
1411           i3 = n - jj + 1;
1412           jsec = min(i2,i3);
1413           ujsec = jsec - jsec % 4;
1414           i2 = k;
1415           for (ll = 1; ll <= i2; ll += 256)
1416             {
1417               /* Computing MIN */
1418               i3 = 256;
1419               i4 = k - ll + 1;
1420               lsec = min(i3,i4);
1421               ulsec = lsec - lsec % 2;
1422
1423               i3 = m;
1424               for (ii = 1; ii <= i3; ii += 256)
1425                 {
1426                   /* Computing MIN */
1427                   i4 = 256;
1428                   i5 = m - ii + 1;
1429                   isec = min(i4,i5);
1430                   uisec = isec - isec % 2;
1431                   i4 = ll + ulsec - 1;
1432                   for (l = ll; l <= i4; l += 2)
1433                     {
1434                       i5 = ii + uisec - 1;
1435                       for (i = ii; i <= i5; i += 2)
1436                         {
1437                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1438                                         a[i + l * a_dim1];
1439                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1440                                         a[i + (l + 1) * a_dim1];
1441                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1442                                         a[i + 1 + l * a_dim1];
1443                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1444                                         a[i + 1 + (l + 1) * a_dim1];
1445                         }
1446                       if (uisec < isec)
1447                         {
1448                           t1[l - ll + 1 + (isec << 8) - 257] =
1449                                     a[ii + isec - 1 + l * a_dim1];
1450                           t1[l - ll + 2 + (isec << 8) - 257] =
1451                                     a[ii + isec - 1 + (l + 1) * a_dim1];
1452                         }
1453                     }
1454                   if (ulsec < lsec)
1455                     {
1456                       i4 = ii + isec - 1;
1457                       for (i = ii; i<= i4; ++i)
1458                         {
1459                           t1[lsec + ((i - ii + 1) << 8) - 257] =
1460                                     a[i + (ll + lsec - 1) * a_dim1];
1461                         }
1462                     }
1463
1464                   uisec = isec - isec % 4;
1465                   i4 = jj + ujsec - 1;
1466                   for (j = jj; j <= i4; j += 4)
1467                     {
1468                       i5 = ii + uisec - 1;
1469                       for (i = ii; i <= i5; i += 4)
1470                         {
1471                           f11 = c[i + j * c_dim1];
1472                           f21 = c[i + 1 + j * c_dim1];
1473                           f12 = c[i + (j + 1) * c_dim1];
1474                           f22 = c[i + 1 + (j + 1) * c_dim1];
1475                           f13 = c[i + (j + 2) * c_dim1];
1476                           f23 = c[i + 1 + (j + 2) * c_dim1];
1477                           f14 = c[i + (j + 3) * c_dim1];
1478                           f24 = c[i + 1 + (j + 3) * c_dim1];
1479                           f31 = c[i + 2 + j * c_dim1];
1480                           f41 = c[i + 3 + j * c_dim1];
1481                           f32 = c[i + 2 + (j + 1) * c_dim1];
1482                           f42 = c[i + 3 + (j + 1) * c_dim1];
1483                           f33 = c[i + 2 + (j + 2) * c_dim1];
1484                           f43 = c[i + 3 + (j + 2) * c_dim1];
1485                           f34 = c[i + 2 + (j + 3) * c_dim1];
1486                           f44 = c[i + 3 + (j + 3) * c_dim1];
1487                           i6 = ll + lsec - 1;
1488                           for (l = ll; l <= i6; ++l)
1489                             {
1490                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1491                                       * b[l + j * b_dim1];
1492                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1493                                       * b[l + j * b_dim1];
1494                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1495                                       * b[l + (j + 1) * b_dim1];
1496                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1497                                       * b[l + (j + 1) * b_dim1];
1498                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1499                                       * b[l + (j + 2) * b_dim1];
1500                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1501                                       * b[l + (j + 2) * b_dim1];
1502                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1503                                       * b[l + (j + 3) * b_dim1];
1504                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1505                                       * b[l + (j + 3) * b_dim1];
1506                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1507                                       * b[l + j * b_dim1];
1508                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1509                                       * b[l + j * b_dim1];
1510                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1511                                       * b[l + (j + 1) * b_dim1];
1512                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1513                                       * b[l + (j + 1) * b_dim1];
1514                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1515                                       * b[l + (j + 2) * b_dim1];
1516                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1517                                       * b[l + (j + 2) * b_dim1];
1518                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1519                                       * b[l + (j + 3) * b_dim1];
1520                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1521                                       * b[l + (j + 3) * b_dim1];
1522                             }
1523                           c[i + j * c_dim1] = f11;
1524                           c[i + 1 + j * c_dim1] = f21;
1525                           c[i + (j + 1) * c_dim1] = f12;
1526                           c[i + 1 + (j + 1) * c_dim1] = f22;
1527                           c[i + (j + 2) * c_dim1] = f13;
1528                           c[i + 1 + (j + 2) * c_dim1] = f23;
1529                           c[i + (j + 3) * c_dim1] = f14;
1530                           c[i + 1 + (j + 3) * c_dim1] = f24;
1531                           c[i + 2 + j * c_dim1] = f31;
1532                           c[i + 3 + j * c_dim1] = f41;
1533                           c[i + 2 + (j + 1) * c_dim1] = f32;
1534                           c[i + 3 + (j + 1) * c_dim1] = f42;
1535                           c[i + 2 + (j + 2) * c_dim1] = f33;
1536                           c[i + 3 + (j + 2) * c_dim1] = f43;
1537                           c[i + 2 + (j + 3) * c_dim1] = f34;
1538                           c[i + 3 + (j + 3) * c_dim1] = f44;
1539                         }
1540                       if (uisec < isec)
1541                         {
1542                           i5 = ii + isec - 1;
1543                           for (i = ii + uisec; i <= i5; ++i)
1544                             {
1545                               f11 = c[i + j * c_dim1];
1546                               f12 = c[i + (j + 1) * c_dim1];
1547                               f13 = c[i + (j + 2) * c_dim1];
1548                               f14 = c[i + (j + 3) * c_dim1];
1549                               i6 = ll + lsec - 1;
1550                               for (l = ll; l <= i6; ++l)
1551                                 {
1552                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1553                                           257] * b[l + j * b_dim1];
1554                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1555                                           257] * b[l + (j + 1) * b_dim1];
1556                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1557                                           257] * b[l + (j + 2) * b_dim1];
1558                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1559                                           257] * b[l + (j + 3) * b_dim1];
1560                                 }
1561                               c[i + j * c_dim1] = f11;
1562                               c[i + (j + 1) * c_dim1] = f12;
1563                               c[i + (j + 2) * c_dim1] = f13;
1564                               c[i + (j + 3) * c_dim1] = f14;
1565                             }
1566                         }
1567                     }
1568                   if (ujsec < jsec)
1569                     {
1570                       i4 = jj + jsec - 1;
1571                       for (j = jj + ujsec; j <= i4; ++j)
1572                         {
1573                           i5 = ii + uisec - 1;
1574                           for (i = ii; i <= i5; i += 4)
1575                             {
1576                               f11 = c[i + j * c_dim1];
1577                               f21 = c[i + 1 + j * c_dim1];
1578                               f31 = c[i + 2 + j * c_dim1];
1579                               f41 = c[i + 3 + j * c_dim1];
1580                               i6 = ll + lsec - 1;
1581                               for (l = ll; l <= i6; ++l)
1582                                 {
1583                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1584                                           257] * b[l + j * b_dim1];
1585                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1586                                           257] * b[l + j * b_dim1];
1587                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1588                                           257] * b[l + j * b_dim1];
1589                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1590                                           257] * b[l + j * b_dim1];
1591                                 }
1592                               c[i + j * c_dim1] = f11;
1593                               c[i + 1 + j * c_dim1] = f21;
1594                               c[i + 2 + j * c_dim1] = f31;
1595                               c[i + 3 + j * c_dim1] = f41;
1596                             }
1597                           i5 = ii + isec - 1;
1598                           for (i = ii + uisec; i <= i5; ++i)
1599                             {
1600                               f11 = c[i + j * c_dim1];
1601                               i6 = ll + lsec - 1;
1602                               for (l = ll; l <= i6; ++l)
1603                                 {
1604                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1605                                           257] * b[l + j * b_dim1];
1606                                 }
1607                               c[i + j * c_dim1] = f11;
1608                             }
1609                         }
1610                     }
1611                 }
1612             }
1613         }
1614       return;
1615     }
1616   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1617     {
1618       if (GFC_DESCRIPTOR_RANK (a) != 1)
1619         {
1620           const GFC_REAL_16 *restrict abase_x;
1621           const GFC_REAL_16 *restrict bbase_y;
1622           GFC_REAL_16 *restrict dest_y;
1623           GFC_REAL_16 s;
1624
1625           for (y = 0; y < ycount; y++)
1626             {
1627               bbase_y = &bbase[y*bystride];
1628               dest_y = &dest[y*rystride];
1629               for (x = 0; x < xcount; x++)
1630                 {
1631                   abase_x = &abase[x*axstride];
1632                   s = (GFC_REAL_16) 0;
1633                   for (n = 0; n < count; n++)
1634                     s += abase_x[n] * bbase_y[n];
1635                   dest_y[x] = s;
1636                 }
1637             }
1638         }
1639       else
1640         {
1641           const GFC_REAL_16 *restrict bbase_y;
1642           GFC_REAL_16 s;
1643
1644           for (y = 0; y < ycount; y++)
1645             {
1646               bbase_y = &bbase[y*bystride];
1647               s = (GFC_REAL_16) 0;
1648               for (n = 0; n < count; n++)
1649                 s += abase[n*axstride] * bbase_y[n];
1650               dest[y*rystride] = s;
1651             }
1652         }
1653     }
1654   else if (axstride < aystride)
1655     {
1656       for (y = 0; y < ycount; y++)
1657         for (x = 0; x < xcount; x++)
1658           dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
1659
1660       for (y = 0; y < ycount; y++)
1661         for (n = 0; n < count; n++)
1662           for (x = 0; x < xcount; x++)
1663             /* dest[x,y] += a[x,n] * b[n,y] */
1664             dest[x*rxstride + y*rystride] +=
1665                                         abase[x*axstride + n*aystride] *
1666                                         bbase[n*bxstride + y*bystride];
1667     }
1668   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1669     {
1670       const GFC_REAL_16 *restrict bbase_y;
1671       GFC_REAL_16 s;
1672
1673       for (y = 0; y < ycount; y++)
1674         {
1675           bbase_y = &bbase[y*bystride];
1676           s = (GFC_REAL_16) 0;
1677           for (n = 0; n < count; n++)
1678             s += abase[n*axstride] * bbase_y[n*bxstride];
1679           dest[y*rxstride] = s;
1680         }
1681     }
1682   else
1683     {
1684       const GFC_REAL_16 *restrict abase_x;
1685       const GFC_REAL_16 *restrict bbase_y;
1686       GFC_REAL_16 *restrict dest_y;
1687       GFC_REAL_16 s;
1688
1689       for (y = 0; y < ycount; y++)
1690         {
1691           bbase_y = &bbase[y*bystride];
1692           dest_y = &dest[y*rystride];
1693           for (x = 0; x < xcount; x++)
1694             {
1695               abase_x = &abase[x*axstride];
1696               s = (GFC_REAL_16) 0;
1697               for (n = 0; n < count; n++)
1698                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1699               dest_y[x*rxstride] = s;
1700             }
1701         }
1702     }
1703 }
1704 #undef POW3
1705 #undef min
1706 #undef max
1707
1708 #endif  /* HAVE_AVX512F */
1709
1710 /* Function to fall back to if there is no special processor-specific version.  */
1711 static void
1712 matmul_r16_vanilla (gfc_array_r16 * const restrict retarray,
1713         gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
1714         int blas_limit, blas_call gemm)
1715 {
1716   const GFC_REAL_16 * restrict abase;
1717   const GFC_REAL_16 * restrict bbase;
1718   GFC_REAL_16 * restrict dest;
1719
1720   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1721   index_type x, y, n, count, xcount, ycount;
1722
1723   assert (GFC_DESCRIPTOR_RANK (a) == 2
1724           || GFC_DESCRIPTOR_RANK (b) == 2);
1725
1726 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1727
1728    Either A or B (but not both) can be rank 1:
1729
1730    o One-dimensional argument A is implicitly treated as a row matrix
1731      dimensioned [1,count], so xcount=1.
1732
1733    o One-dimensional argument B is implicitly treated as a column matrix
1734      dimensioned [count, 1], so ycount=1.
1735 */
1736
1737   if (retarray->base_addr == NULL)
1738     {
1739       if (GFC_DESCRIPTOR_RANK (a) == 1)
1740         {
1741           GFC_DIMENSION_SET(retarray->dim[0], 0,
1742                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1743         }
1744       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1745         {
1746           GFC_DIMENSION_SET(retarray->dim[0], 0,
1747                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1748         }
1749       else
1750         {
1751           GFC_DIMENSION_SET(retarray->dim[0], 0,
1752                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1753
1754           GFC_DIMENSION_SET(retarray->dim[1], 0,
1755                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1756                             GFC_DESCRIPTOR_EXTENT(retarray,0));
1757         }
1758
1759       retarray->base_addr
1760         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
1761       retarray->offset = 0;
1762     }
1763   else if (unlikely (compile_options.bounds_check))
1764     {
1765       index_type ret_extent, arg_extent;
1766
1767       if (GFC_DESCRIPTOR_RANK (a) == 1)
1768         {
1769           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1770           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1771           if (arg_extent != ret_extent)
1772             runtime_error ("Incorrect extent in return array in"
1773                            " MATMUL intrinsic: is %ld, should be %ld",
1774                            (long int) ret_extent, (long int) arg_extent);
1775         }
1776       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1777         {
1778           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1779           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1780           if (arg_extent != ret_extent)
1781             runtime_error ("Incorrect extent in return array in"
1782                            " MATMUL intrinsic: is %ld, should be %ld",
1783                            (long int) ret_extent, (long int) arg_extent);
1784         }
1785       else
1786         {
1787           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1788           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1789           if (arg_extent != ret_extent)
1790             runtime_error ("Incorrect extent in return array in"
1791                            " MATMUL intrinsic for dimension 1:"
1792                            " is %ld, should be %ld",
1793                            (long int) ret_extent, (long int) arg_extent);
1794
1795           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1796           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1797           if (arg_extent != ret_extent)
1798             runtime_error ("Incorrect extent in return array in"
1799                            " MATMUL intrinsic for dimension 2:"
1800                            " is %ld, should be %ld",
1801                            (long int) ret_extent, (long int) arg_extent);
1802         }
1803     }
1804
1805
1806   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1807     {
1808       /* One-dimensional result may be addressed in the code below
1809          either as a row or a column matrix. We want both cases to
1810          work. */
1811       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1812     }
1813   else
1814     {
1815       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1816       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1817     }
1818
1819
1820   if (GFC_DESCRIPTOR_RANK (a) == 1)
1821     {
1822       /* Treat it as a a row matrix A[1,count]. */
1823       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1824       aystride = 1;
1825
1826       xcount = 1;
1827       count = GFC_DESCRIPTOR_EXTENT(a,0);
1828     }
1829   else
1830     {
1831       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1832       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1833
1834       count = GFC_DESCRIPTOR_EXTENT(a,1);
1835       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1836     }
1837
1838   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1839     {
1840       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1841         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1842     }
1843
1844   if (GFC_DESCRIPTOR_RANK (b) == 1)
1845     {
1846       /* Treat it as a column matrix B[count,1] */
1847       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1848
1849       /* bystride should never be used for 1-dimensional b.
1850          in case it is we want it to cause a segfault, rather than
1851          an incorrect result. */
1852       bystride = 0xDEADBEEF;
1853       ycount = 1;
1854     }
1855   else
1856     {
1857       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1858       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1859       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1860     }
1861
1862   abase = a->base_addr;
1863   bbase = b->base_addr;
1864   dest = retarray->base_addr;
1865
1866   /* Now that everything is set up, we perform the multiplication
1867      itself.  */
1868
1869 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1870 #define min(a,b) ((a) <= (b) ? (a) : (b))
1871 #define max(a,b) ((a) >= (b) ? (a) : (b))
1872
1873   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1874       && (bxstride == 1 || bystride == 1)
1875       && (((float) xcount) * ((float) ycount) * ((float) count)
1876           > POW3(blas_limit)))
1877     {
1878       const int m = xcount, n = ycount, k = count, ldc = rystride;
1879       const GFC_REAL_16 one = 1, zero = 0;
1880       const int lda = (axstride == 1) ? aystride : axstride,
1881                 ldb = (bxstride == 1) ? bystride : bxstride;
1882
1883       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1884         {
1885           assert (gemm != NULL);
1886           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1887                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1888                 &ldc, 1, 1);
1889           return;
1890         }
1891     }
1892
1893   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1894     {
1895       /* This block of code implements a tuned matmul, derived from
1896          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1897
1898                Bo Kagstrom and Per Ling
1899                Department of Computing Science
1900                Umea University
1901                S-901 87 Umea, Sweden
1902
1903          from netlib.org, translated to C, and modified for matmul.m4.  */
1904
1905       const GFC_REAL_16 *a, *b;
1906       GFC_REAL_16 *c;
1907       const index_type m = xcount, n = ycount, k = count;
1908
1909       /* System generated locals */
1910       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1911                  i1, i2, i3, i4, i5, i6;
1912
1913       /* Local variables */
1914       GFC_REAL_16 t1[65536], /* was [256][256] */
1915                  f11, f12, f21, f22, f31, f32, f41, f42,
1916                  f13, f14, f23, f24, f33, f34, f43, f44;
1917       index_type i, j, l, ii, jj, ll;
1918       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1919
1920       a = abase;
1921       b = bbase;
1922       c = retarray->base_addr;
1923
1924       /* Parameter adjustments */
1925       c_dim1 = rystride;
1926       c_offset = 1 + c_dim1;
1927       c -= c_offset;
1928       a_dim1 = aystride;
1929       a_offset = 1 + a_dim1;
1930       a -= a_offset;
1931       b_dim1 = bystride;
1932       b_offset = 1 + b_dim1;
1933       b -= b_offset;
1934
1935       /* Early exit if possible */
1936       if (m == 0 || n == 0 || k == 0)
1937         return;
1938
1939       /* Empty c first.  */
1940       for (j=1; j<=n; j++)
1941         for (i=1; i<=m; i++)
1942           c[i + j * c_dim1] = (GFC_REAL_16)0;
1943
1944       /* Start turning the crank. */
1945       i1 = n;
1946       for (jj = 1; jj <= i1; jj += 512)
1947         {
1948           /* Computing MIN */
1949           i2 = 512;
1950           i3 = n - jj + 1;
1951           jsec = min(i2,i3);
1952           ujsec = jsec - jsec % 4;
1953           i2 = k;
1954           for (ll = 1; ll <= i2; ll += 256)
1955             {
1956               /* Computing MIN */
1957               i3 = 256;
1958               i4 = k - ll + 1;
1959               lsec = min(i3,i4);
1960               ulsec = lsec - lsec % 2;
1961
1962               i3 = m;
1963               for (ii = 1; ii <= i3; ii += 256)
1964                 {
1965                   /* Computing MIN */
1966                   i4 = 256;
1967                   i5 = m - ii + 1;
1968                   isec = min(i4,i5);
1969                   uisec = isec - isec % 2;
1970                   i4 = ll + ulsec - 1;
1971                   for (l = ll; l <= i4; l += 2)
1972                     {
1973                       i5 = ii + uisec - 1;
1974                       for (i = ii; i <= i5; i += 2)
1975                         {
1976                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1977                                         a[i + l * a_dim1];
1978                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1979                                         a[i + (l + 1) * a_dim1];
1980                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1981                                         a[i + 1 + l * a_dim1];
1982                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1983                                         a[i + 1 + (l + 1) * a_dim1];
1984                         }
1985                       if (uisec < isec)
1986                         {
1987                           t1[l - ll + 1 + (isec << 8) - 257] =
1988                                     a[ii + isec - 1 + l * a_dim1];
1989                           t1[l - ll + 2 + (isec << 8) - 257] =
1990                                     a[ii + isec - 1 + (l + 1) * a_dim1];
1991                         }
1992                     }
1993                   if (ulsec < lsec)
1994                     {
1995                       i4 = ii + isec - 1;
1996                       for (i = ii; i<= i4; ++i)
1997                         {
1998                           t1[lsec + ((i - ii + 1) << 8) - 257] =
1999                                     a[i + (ll + lsec - 1) * a_dim1];
2000                         }
2001                     }
2002
2003                   uisec = isec - isec % 4;
2004                   i4 = jj + ujsec - 1;
2005                   for (j = jj; j <= i4; j += 4)
2006                     {
2007                       i5 = ii + uisec - 1;
2008                       for (i = ii; i <= i5; i += 4)
2009                         {
2010                           f11 = c[i + j * c_dim1];
2011                           f21 = c[i + 1 + j * c_dim1];
2012                           f12 = c[i + (j + 1) * c_dim1];
2013                           f22 = c[i + 1 + (j + 1) * c_dim1];
2014                           f13 = c[i + (j + 2) * c_dim1];
2015                           f23 = c[i + 1 + (j + 2) * c_dim1];
2016                           f14 = c[i + (j + 3) * c_dim1];
2017                           f24 = c[i + 1 + (j + 3) * c_dim1];
2018                           f31 = c[i + 2 + j * c_dim1];
2019                           f41 = c[i + 3 + j * c_dim1];
2020                           f32 = c[i + 2 + (j + 1) * c_dim1];
2021                           f42 = c[i + 3 + (j + 1) * c_dim1];
2022                           f33 = c[i + 2 + (j + 2) * c_dim1];
2023                           f43 = c[i + 3 + (j + 2) * c_dim1];
2024                           f34 = c[i + 2 + (j + 3) * c_dim1];
2025                           f44 = c[i + 3 + (j + 3) * c_dim1];
2026                           i6 = ll + lsec - 1;
2027                           for (l = ll; l <= i6; ++l)
2028                             {
2029                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2030                                       * b[l + j * b_dim1];
2031                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2032                                       * b[l + j * b_dim1];
2033                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2034                                       * b[l + (j + 1) * b_dim1];
2035                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2036                                       * b[l + (j + 1) * b_dim1];
2037                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2038                                       * b[l + (j + 2) * b_dim1];
2039                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2040                                       * b[l + (j + 2) * b_dim1];
2041                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2042                                       * b[l + (j + 3) * b_dim1];
2043                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2044                                       * b[l + (j + 3) * b_dim1];
2045                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2046                                       * b[l + j * b_dim1];
2047                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2048                                       * b[l + j * b_dim1];
2049                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2050                                       * b[l + (j + 1) * b_dim1];
2051                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2052                                       * b[l + (j + 1) * b_dim1];
2053                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2054                                       * b[l + (j + 2) * b_dim1];
2055                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2056                                       * b[l + (j + 2) * b_dim1];
2057                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2058                                       * b[l + (j + 3) * b_dim1];
2059                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2060                                       * b[l + (j + 3) * b_dim1];
2061                             }
2062                           c[i + j * c_dim1] = f11;
2063                           c[i + 1 + j * c_dim1] = f21;
2064                           c[i + (j + 1) * c_dim1] = f12;
2065                           c[i + 1 + (j + 1) * c_dim1] = f22;
2066                           c[i + (j + 2) * c_dim1] = f13;
2067                           c[i + 1 + (j + 2) * c_dim1] = f23;
2068                           c[i + (j + 3) * c_dim1] = f14;
2069                           c[i + 1 + (j + 3) * c_dim1] = f24;
2070                           c[i + 2 + j * c_dim1] = f31;
2071                           c[i + 3 + j * c_dim1] = f41;
2072                           c[i + 2 + (j + 1) * c_dim1] = f32;
2073                           c[i + 3 + (j + 1) * c_dim1] = f42;
2074                           c[i + 2 + (j + 2) * c_dim1] = f33;
2075                           c[i + 3 + (j + 2) * c_dim1] = f43;
2076                           c[i + 2 + (j + 3) * c_dim1] = f34;
2077                           c[i + 3 + (j + 3) * c_dim1] = f44;
2078                         }
2079                       if (uisec < isec)
2080                         {
2081                           i5 = ii + isec - 1;
2082                           for (i = ii + uisec; i <= i5; ++i)
2083                             {
2084                               f11 = c[i + j * c_dim1];
2085                               f12 = c[i + (j + 1) * c_dim1];
2086                               f13 = c[i + (j + 2) * c_dim1];
2087                               f14 = c[i + (j + 3) * c_dim1];
2088                               i6 = ll + lsec - 1;
2089                               for (l = ll; l <= i6; ++l)
2090                                 {
2091                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2092                                           257] * b[l + j * b_dim1];
2093                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2094                                           257] * b[l + (j + 1) * b_dim1];
2095                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2096                                           257] * b[l + (j + 2) * b_dim1];
2097                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2098                                           257] * b[l + (j + 3) * b_dim1];
2099                                 }
2100                               c[i + j * c_dim1] = f11;
2101                               c[i + (j + 1) * c_dim1] = f12;
2102                               c[i + (j + 2) * c_dim1] = f13;
2103                               c[i + (j + 3) * c_dim1] = f14;
2104                             }
2105                         }
2106                     }
2107                   if (ujsec < jsec)
2108                     {
2109                       i4 = jj + jsec - 1;
2110                       for (j = jj + ujsec; j <= i4; ++j)
2111                         {
2112                           i5 = ii + uisec - 1;
2113                           for (i = ii; i <= i5; i += 4)
2114                             {
2115                               f11 = c[i + j * c_dim1];
2116                               f21 = c[i + 1 + j * c_dim1];
2117                               f31 = c[i + 2 + j * c_dim1];
2118                               f41 = c[i + 3 + j * c_dim1];
2119                               i6 = ll + lsec - 1;
2120                               for (l = ll; l <= i6; ++l)
2121                                 {
2122                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2123                                           257] * b[l + j * b_dim1];
2124                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2125                                           257] * b[l + j * b_dim1];
2126                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2127                                           257] * b[l + j * b_dim1];
2128                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2129                                           257] * b[l + j * b_dim1];
2130                                 }
2131                               c[i + j * c_dim1] = f11;
2132                               c[i + 1 + j * c_dim1] = f21;
2133                               c[i + 2 + j * c_dim1] = f31;
2134                               c[i + 3 + j * c_dim1] = f41;
2135                             }
2136                           i5 = ii + isec - 1;
2137                           for (i = ii + uisec; i <= i5; ++i)
2138                             {
2139                               f11 = c[i + j * c_dim1];
2140                               i6 = ll + lsec - 1;
2141                               for (l = ll; l <= i6; ++l)
2142                                 {
2143                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2144                                           257] * b[l + j * b_dim1];
2145                                 }
2146                               c[i + j * c_dim1] = f11;
2147                             }
2148                         }
2149                     }
2150                 }
2151             }
2152         }
2153       return;
2154     }
2155   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2156     {
2157       if (GFC_DESCRIPTOR_RANK (a) != 1)
2158         {
2159           const GFC_REAL_16 *restrict abase_x;
2160           const GFC_REAL_16 *restrict bbase_y;
2161           GFC_REAL_16 *restrict dest_y;
2162           GFC_REAL_16 s;
2163
2164           for (y = 0; y < ycount; y++)
2165             {
2166               bbase_y = &bbase[y*bystride];
2167               dest_y = &dest[y*rystride];
2168               for (x = 0; x < xcount; x++)
2169                 {
2170                   abase_x = &abase[x*axstride];
2171                   s = (GFC_REAL_16) 0;
2172                   for (n = 0; n < count; n++)
2173                     s += abase_x[n] * bbase_y[n];
2174                   dest_y[x] = s;
2175                 }
2176             }
2177         }
2178       else
2179         {
2180           const GFC_REAL_16 *restrict bbase_y;
2181           GFC_REAL_16 s;
2182
2183           for (y = 0; y < ycount; y++)
2184             {
2185               bbase_y = &bbase[y*bystride];
2186               s = (GFC_REAL_16) 0;
2187               for (n = 0; n < count; n++)
2188                 s += abase[n*axstride] * bbase_y[n];
2189               dest[y*rystride] = s;
2190             }
2191         }
2192     }
2193   else if (axstride < aystride)
2194     {
2195       for (y = 0; y < ycount; y++)
2196         for (x = 0; x < xcount; x++)
2197           dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
2198
2199       for (y = 0; y < ycount; y++)
2200         for (n = 0; n < count; n++)
2201           for (x = 0; x < xcount; x++)
2202             /* dest[x,y] += a[x,n] * b[n,y] */
2203             dest[x*rxstride + y*rystride] +=
2204                                         abase[x*axstride + n*aystride] *
2205                                         bbase[n*bxstride + y*bystride];
2206     }
2207   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2208     {
2209       const GFC_REAL_16 *restrict bbase_y;
2210       GFC_REAL_16 s;
2211
2212       for (y = 0; y < ycount; y++)
2213         {
2214           bbase_y = &bbase[y*bystride];
2215           s = (GFC_REAL_16) 0;
2216           for (n = 0; n < count; n++)
2217             s += abase[n*axstride] * bbase_y[n*bxstride];
2218           dest[y*rxstride] = s;
2219         }
2220     }
2221   else
2222     {
2223       const GFC_REAL_16 *restrict abase_x;
2224       const GFC_REAL_16 *restrict bbase_y;
2225       GFC_REAL_16 *restrict dest_y;
2226       GFC_REAL_16 s;
2227
2228       for (y = 0; y < ycount; y++)
2229         {
2230           bbase_y = &bbase[y*bystride];
2231           dest_y = &dest[y*rystride];
2232           for (x = 0; x < xcount; x++)
2233             {
2234               abase_x = &abase[x*axstride];
2235               s = (GFC_REAL_16) 0;
2236               for (n = 0; n < count; n++)
2237                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2238               dest_y[x*rxstride] = s;
2239             }
2240         }
2241     }
2242 }
2243 #undef POW3
2244 #undef min
2245 #undef max
2246
2247
2248 /* Compiling main function, with selection code for the processor.  */
2249
2250 /* Currently, this is i386 only.  Adjust for other architectures.  */
2251
2252 #include <config/i386/cpuinfo.h>
2253 void matmul_r16 (gfc_array_r16 * const restrict retarray,
2254         gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
2255         int blas_limit, blas_call gemm)
2256 {
2257   static void (*matmul_p) (gfc_array_r16 * const restrict retarray,
2258         gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
2259         int blas_limit, blas_call gemm);
2260
2261   void (*matmul_fn) (gfc_array_r16 * const restrict retarray,
2262         gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
2263         int blas_limit, blas_call gemm);
2264
2265   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2266   if (matmul_fn == NULL)
2267     {
2268       matmul_fn = matmul_r16_vanilla;
2269       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2270         {
2271           /* Run down the available processors in order of preference.  */
2272 #ifdef HAVE_AVX512F
2273           if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2274             {
2275               matmul_fn = matmul_r16_avx512f;
2276               goto store;
2277             }
2278
2279 #endif  /* HAVE_AVX512F */
2280
2281 #ifdef HAVE_AVX2
2282           if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2283              && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2284             {
2285               matmul_fn = matmul_r16_avx2;
2286               goto store;
2287             }
2288
2289 #endif
2290
2291 #ifdef HAVE_AVX
2292           if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2293             {
2294               matmul_fn = matmul_r16_avx;
2295               goto store;
2296             }
2297 #endif  /* HAVE_AVX */
2298         }
2299    store:
2300       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2301    }
2302
2303    (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2304 }
2305
2306 #else  /* Just the vanilla function.  */
2307
2308 void
2309 matmul_r16 (gfc_array_r16 * const restrict retarray,
2310         gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
2311         int blas_limit, blas_call gemm)
2312 {
2313   const GFC_REAL_16 * restrict abase;
2314   const GFC_REAL_16 * restrict bbase;
2315   GFC_REAL_16 * restrict dest;
2316
2317   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2318   index_type x, y, n, count, xcount, ycount;
2319
2320   assert (GFC_DESCRIPTOR_RANK (a) == 2
2321           || GFC_DESCRIPTOR_RANK (b) == 2);
2322
2323 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2324
2325    Either A or B (but not both) can be rank 1:
2326
2327    o One-dimensional argument A is implicitly treated as a row matrix
2328      dimensioned [1,count], so xcount=1.
2329
2330    o One-dimensional argument B is implicitly treated as a column matrix
2331      dimensioned [count, 1], so ycount=1.
2332 */
2333
2334   if (retarray->base_addr == NULL)
2335     {
2336       if (GFC_DESCRIPTOR_RANK (a) == 1)
2337         {
2338           GFC_DIMENSION_SET(retarray->dim[0], 0,
2339                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2340         }
2341       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2342         {
2343           GFC_DIMENSION_SET(retarray->dim[0], 0,
2344                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2345         }
2346       else
2347         {
2348           GFC_DIMENSION_SET(retarray->dim[0], 0,
2349                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2350
2351           GFC_DIMENSION_SET(retarray->dim[1], 0,
2352                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2353                             GFC_DESCRIPTOR_EXTENT(retarray,0));
2354         }
2355
2356       retarray->base_addr
2357         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16));
2358       retarray->offset = 0;
2359     }
2360   else if (unlikely (compile_options.bounds_check))
2361     {
2362       index_type ret_extent, arg_extent;
2363
2364       if (GFC_DESCRIPTOR_RANK (a) == 1)
2365         {
2366           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2367           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2368           if (arg_extent != ret_extent)
2369             runtime_error ("Incorrect extent in return array in"
2370                            " MATMUL intrinsic: is %ld, should be %ld",
2371                            (long int) ret_extent, (long int) arg_extent);
2372         }
2373       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2374         {
2375           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2376           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2377           if (arg_extent != ret_extent)
2378             runtime_error ("Incorrect extent in return array in"
2379                            " MATMUL intrinsic: is %ld, should be %ld",
2380                            (long int) ret_extent, (long int) arg_extent);
2381         }
2382       else
2383         {
2384           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2385           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2386           if (arg_extent != ret_extent)
2387             runtime_error ("Incorrect extent in return array in"
2388                            " MATMUL intrinsic for dimension 1:"
2389                            " is %ld, should be %ld",
2390                            (long int) ret_extent, (long int) arg_extent);
2391
2392           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2393           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2394           if (arg_extent != ret_extent)
2395             runtime_error ("Incorrect extent in return array in"
2396                            " MATMUL intrinsic for dimension 2:"
2397                            " is %ld, should be %ld",
2398                            (long int) ret_extent, (long int) arg_extent);
2399         }
2400     }
2401
2402
2403   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2404     {
2405       /* One-dimensional result may be addressed in the code below
2406          either as a row or a column matrix. We want both cases to
2407          work. */
2408       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2409     }
2410   else
2411     {
2412       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2413       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2414     }
2415
2416
2417   if (GFC_DESCRIPTOR_RANK (a) == 1)
2418     {
2419       /* Treat it as a a row matrix A[1,count]. */
2420       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2421       aystride = 1;
2422
2423       xcount = 1;
2424       count = GFC_DESCRIPTOR_EXTENT(a,0);
2425     }
2426   else
2427     {
2428       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2429       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2430
2431       count = GFC_DESCRIPTOR_EXTENT(a,1);
2432       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2433     }
2434
2435   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2436     {
2437       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2438         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2439     }
2440
2441   if (GFC_DESCRIPTOR_RANK (b) == 1)
2442     {
2443       /* Treat it as a column matrix B[count,1] */
2444       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2445
2446       /* bystride should never be used for 1-dimensional b.
2447          in case it is we want it to cause a segfault, rather than
2448          an incorrect result. */
2449       bystride = 0xDEADBEEF;
2450       ycount = 1;
2451     }
2452   else
2453     {
2454       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2455       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2456       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2457     }
2458
2459   abase = a->base_addr;
2460   bbase = b->base_addr;
2461   dest = retarray->base_addr;
2462
2463   /* Now that everything is set up, we perform the multiplication
2464      itself.  */
2465
2466 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2467 #define min(a,b) ((a) <= (b) ? (a) : (b))
2468 #define max(a,b) ((a) >= (b) ? (a) : (b))
2469
2470   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2471       && (bxstride == 1 || bystride == 1)
2472       && (((float) xcount) * ((float) ycount) * ((float) count)
2473           > POW3(blas_limit)))
2474     {
2475       const int m = xcount, n = ycount, k = count, ldc = rystride;
2476       const GFC_REAL_16 one = 1, zero = 0;
2477       const int lda = (axstride == 1) ? aystride : axstride,
2478                 ldb = (bxstride == 1) ? bystride : bxstride;
2479
2480       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2481         {
2482           assert (gemm != NULL);
2483           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2484                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2485                 &ldc, 1, 1);
2486           return;
2487         }
2488     }
2489
2490   if (rxstride == 1 && axstride == 1 && bxstride == 1)
2491     {
2492       /* This block of code implements a tuned matmul, derived from
2493          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2494
2495                Bo Kagstrom and Per Ling
2496                Department of Computing Science
2497                Umea University
2498                S-901 87 Umea, Sweden
2499
2500          from netlib.org, translated to C, and modified for matmul.m4.  */
2501
2502       const GFC_REAL_16 *a, *b;
2503       GFC_REAL_16 *c;
2504       const index_type m = xcount, n = ycount, k = count;
2505
2506       /* System generated locals */
2507       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2508                  i1, i2, i3, i4, i5, i6;
2509
2510       /* Local variables */
2511       GFC_REAL_16 t1[65536], /* was [256][256] */
2512                  f11, f12, f21, f22, f31, f32, f41, f42,
2513                  f13, f14, f23, f24, f33, f34, f43, f44;
2514       index_type i, j, l, ii, jj, ll;
2515       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2516
2517       a = abase;
2518       b = bbase;
2519       c = retarray->base_addr;
2520
2521       /* Parameter adjustments */
2522       c_dim1 = rystride;
2523       c_offset = 1 + c_dim1;
2524       c -= c_offset;
2525       a_dim1 = aystride;
2526       a_offset = 1 + a_dim1;
2527       a -= a_offset;
2528       b_dim1 = bystride;
2529       b_offset = 1 + b_dim1;
2530       b -= b_offset;
2531
2532       /* Early exit if possible */
2533       if (m == 0 || n == 0 || k == 0)
2534         return;
2535
2536       /* Empty c first.  */
2537       for (j=1; j<=n; j++)
2538         for (i=1; i<=m; i++)
2539           c[i + j * c_dim1] = (GFC_REAL_16)0;
2540
2541       /* Start turning the crank. */
2542       i1 = n;
2543       for (jj = 1; jj <= i1; jj += 512)
2544         {
2545           /* Computing MIN */
2546           i2 = 512;
2547           i3 = n - jj + 1;
2548           jsec = min(i2,i3);
2549           ujsec = jsec - jsec % 4;
2550           i2 = k;
2551           for (ll = 1; ll <= i2; ll += 256)
2552             {
2553               /* Computing MIN */
2554               i3 = 256;
2555               i4 = k - ll + 1;
2556               lsec = min(i3,i4);
2557               ulsec = lsec - lsec % 2;
2558
2559               i3 = m;
2560               for (ii = 1; ii <= i3; ii += 256)
2561                 {
2562                   /* Computing MIN */
2563                   i4 = 256;
2564                   i5 = m - ii + 1;
2565                   isec = min(i4,i5);
2566                   uisec = isec - isec % 2;
2567                   i4 = ll + ulsec - 1;
2568                   for (l = ll; l <= i4; l += 2)
2569                     {
2570                       i5 = ii + uisec - 1;
2571                       for (i = ii; i <= i5; i += 2)
2572                         {
2573                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2574                                         a[i + l * a_dim1];
2575                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2576                                         a[i + (l + 1) * a_dim1];
2577                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2578                                         a[i + 1 + l * a_dim1];
2579                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2580                                         a[i + 1 + (l + 1) * a_dim1];
2581                         }
2582                       if (uisec < isec)
2583                         {
2584                           t1[l - ll + 1 + (isec << 8) - 257] =
2585                                     a[ii + isec - 1 + l * a_dim1];
2586                           t1[l - ll + 2 + (isec << 8) - 257] =
2587                                     a[ii + isec - 1 + (l + 1) * a_dim1];
2588                         }
2589                     }
2590                   if (ulsec < lsec)
2591                     {
2592                       i4 = ii + isec - 1;
2593                       for (i = ii; i<= i4; ++i)
2594                         {
2595                           t1[lsec + ((i - ii + 1) << 8) - 257] =
2596                                     a[i + (ll + lsec - 1) * a_dim1];
2597                         }
2598                     }
2599
2600                   uisec = isec - isec % 4;
2601                   i4 = jj + ujsec - 1;
2602                   for (j = jj; j <= i4; j += 4)
2603                     {
2604                       i5 = ii + uisec - 1;
2605                       for (i = ii; i <= i5; i += 4)
2606                         {
2607                           f11 = c[i + j * c_dim1];
2608                           f21 = c[i + 1 + j * c_dim1];
2609                           f12 = c[i + (j + 1) * c_dim1];
2610                           f22 = c[i + 1 + (j + 1) * c_dim1];
2611                           f13 = c[i + (j + 2) * c_dim1];
2612                           f23 = c[i + 1 + (j + 2) * c_dim1];
2613                           f14 = c[i + (j + 3) * c_dim1];
2614                           f24 = c[i + 1 + (j + 3) * c_dim1];
2615                           f31 = c[i + 2 + j * c_dim1];
2616                           f41 = c[i + 3 + j * c_dim1];
2617                           f32 = c[i + 2 + (j + 1) * c_dim1];
2618                           f42 = c[i + 3 + (j + 1) * c_dim1];
2619                           f33 = c[i + 2 + (j + 2) * c_dim1];
2620                           f43 = c[i + 3 + (j + 2) * c_dim1];
2621                           f34 = c[i + 2 + (j + 3) * c_dim1];
2622                           f44 = c[i + 3 + (j + 3) * c_dim1];
2623                           i6 = ll + lsec - 1;
2624                           for (l = ll; l <= i6; ++l)
2625                             {
2626                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2627                                       * b[l + j * b_dim1];
2628                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2629                                       * b[l + j * b_dim1];
2630                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2631                                       * b[l + (j + 1) * b_dim1];
2632                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2633                                       * b[l + (j + 1) * b_dim1];
2634                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2635                                       * b[l + (j + 2) * b_dim1];
2636                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2637                                       * b[l + (j + 2) * b_dim1];
2638                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2639                                       * b[l + (j + 3) * b_dim1];
2640                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2641                                       * b[l + (j + 3) * b_dim1];
2642                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2643                                       * b[l + j * b_dim1];
2644                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2645                                       * b[l + j * b_dim1];
2646                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2647                                       * b[l + (j + 1) * b_dim1];
2648                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2649                                       * b[l + (j + 1) * b_dim1];
2650                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2651                                       * b[l + (j + 2) * b_dim1];
2652                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2653                                       * b[l + (j + 2) * b_dim1];
2654                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2655                                       * b[l + (j + 3) * b_dim1];
2656                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2657                                       * b[l + (j + 3) * b_dim1];
2658                             }
2659                           c[i + j * c_dim1] = f11;
2660                           c[i + 1 + j * c_dim1] = f21;
2661                           c[i + (j + 1) * c_dim1] = f12;
2662                           c[i + 1 + (j + 1) * c_dim1] = f22;
2663                           c[i + (j + 2) * c_dim1] = f13;
2664                           c[i + 1 + (j + 2) * c_dim1] = f23;
2665                           c[i + (j + 3) * c_dim1] = f14;
2666                           c[i + 1 + (j + 3) * c_dim1] = f24;
2667                           c[i + 2 + j * c_dim1] = f31;
2668                           c[i + 3 + j * c_dim1] = f41;
2669                           c[i + 2 + (j + 1) * c_dim1] = f32;
2670                           c[i + 3 + (j + 1) * c_dim1] = f42;
2671                           c[i + 2 + (j + 2) * c_dim1] = f33;
2672                           c[i + 3 + (j + 2) * c_dim1] = f43;
2673                           c[i + 2 + (j + 3) * c_dim1] = f34;
2674                           c[i + 3 + (j + 3) * c_dim1] = f44;
2675                         }
2676                       if (uisec < isec)
2677                         {
2678                           i5 = ii + isec - 1;
2679                           for (i = ii + uisec; i <= i5; ++i)
2680                             {
2681                               f11 = c[i + j * c_dim1];
2682                               f12 = c[i + (j + 1) * c_dim1];
2683                               f13 = c[i + (j + 2) * c_dim1];
2684                               f14 = c[i + (j + 3) * c_dim1];
2685                               i6 = ll + lsec - 1;
2686                               for (l = ll; l <= i6; ++l)
2687                                 {
2688                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2689                                           257] * b[l + j * b_dim1];
2690                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2691                                           257] * b[l + (j + 1) * b_dim1];
2692                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2693                                           257] * b[l + (j + 2) * b_dim1];
2694                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2695                                           257] * b[l + (j + 3) * b_dim1];
2696                                 }
2697                               c[i + j * c_dim1] = f11;
2698                               c[i + (j + 1) * c_dim1] = f12;
2699                               c[i + (j + 2) * c_dim1] = f13;
2700                               c[i + (j + 3) * c_dim1] = f14;
2701                             }
2702                         }
2703                     }
2704                   if (ujsec < jsec)
2705                     {
2706                       i4 = jj + jsec - 1;
2707                       for (j = jj + ujsec; j <= i4; ++j)
2708                         {
2709                           i5 = ii + uisec - 1;
2710                           for (i = ii; i <= i5; i += 4)
2711                             {
2712                               f11 = c[i + j * c_dim1];
2713                               f21 = c[i + 1 + j * c_dim1];
2714                               f31 = c[i + 2 + j * c_dim1];
2715                               f41 = c[i + 3 + j * c_dim1];
2716                               i6 = ll + lsec - 1;
2717                               for (l = ll; l <= i6; ++l)
2718                                 {
2719                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2720                                           257] * b[l + j * b_dim1];
2721                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2722                                           257] * b[l + j * b_dim1];
2723                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2724                                           257] * b[l + j * b_dim1];
2725                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2726                                           257] * b[l + j * b_dim1];
2727                                 }
2728                               c[i + j * c_dim1] = f11;
2729                               c[i + 1 + j * c_dim1] = f21;
2730                               c[i + 2 + j * c_dim1] = f31;
2731                               c[i + 3 + j * c_dim1] = f41;
2732                             }
2733                           i5 = ii + isec - 1;
2734                           for (i = ii + uisec; i <= i5; ++i)
2735                             {
2736                               f11 = c[i + j * c_dim1];
2737                               i6 = ll + lsec - 1;
2738                               for (l = ll; l <= i6; ++l)
2739                                 {
2740                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2741                                           257] * b[l + j * b_dim1];
2742                                 }
2743                               c[i + j * c_dim1] = f11;
2744                             }
2745                         }
2746                     }
2747                 }
2748             }
2749         }
2750       return;
2751     }
2752   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2753     {
2754       if (GFC_DESCRIPTOR_RANK (a) != 1)
2755         {
2756           const GFC_REAL_16 *restrict abase_x;
2757           const GFC_REAL_16 *restrict bbase_y;
2758           GFC_REAL_16 *restrict dest_y;
2759           GFC_REAL_16 s;
2760
2761           for (y = 0; y < ycount; y++)
2762             {
2763               bbase_y = &bbase[y*bystride];
2764               dest_y = &dest[y*rystride];
2765               for (x = 0; x < xcount; x++)
2766                 {
2767                   abase_x = &abase[x*axstride];
2768                   s = (GFC_REAL_16) 0;
2769                   for (n = 0; n < count; n++)
2770                     s += abase_x[n] * bbase_y[n];
2771                   dest_y[x] = s;
2772                 }
2773             }
2774         }
2775       else
2776         {
2777           const GFC_REAL_16 *restrict bbase_y;
2778           GFC_REAL_16 s;
2779
2780           for (y = 0; y < ycount; y++)
2781             {
2782               bbase_y = &bbase[y*bystride];
2783               s = (GFC_REAL_16) 0;
2784               for (n = 0; n < count; n++)
2785                 s += abase[n*axstride] * bbase_y[n];
2786               dest[y*rystride] = s;
2787             }
2788         }
2789     }
2790   else if (axstride < aystride)
2791     {
2792       for (y = 0; y < ycount; y++)
2793         for (x = 0; x < xcount; x++)
2794           dest[x*rxstride + y*rystride] = (GFC_REAL_16)0;
2795
2796       for (y = 0; y < ycount; y++)
2797         for (n = 0; n < count; n++)
2798           for (x = 0; x < xcount; x++)
2799             /* dest[x,y] += a[x,n] * b[n,y] */
2800             dest[x*rxstride + y*rystride] +=
2801                                         abase[x*axstride + n*aystride] *
2802                                         bbase[n*bxstride + y*bystride];
2803     }
2804   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2805     {
2806       const GFC_REAL_16 *restrict bbase_y;
2807       GFC_REAL_16 s;
2808
2809       for (y = 0; y < ycount; y++)
2810         {
2811           bbase_y = &bbase[y*bystride];
2812           s = (GFC_REAL_16) 0;
2813           for (n = 0; n < count; n++)
2814             s += abase[n*axstride] * bbase_y[n*bxstride];
2815           dest[y*rxstride] = s;
2816         }
2817     }
2818   else
2819     {
2820       const GFC_REAL_16 *restrict abase_x;
2821       const GFC_REAL_16 *restrict bbase_y;
2822       GFC_REAL_16 *restrict dest_y;
2823       GFC_REAL_16 s;
2824
2825       for (y = 0; y < ycount; y++)
2826         {
2827           bbase_y = &bbase[y*bystride];
2828           dest_y = &dest[y*rystride];
2829           for (x = 0; x < xcount; x++)
2830             {
2831               abase_x = &abase[x*axstride];
2832               s = (GFC_REAL_16) 0;
2833               for (n = 0; n < count; n++)
2834                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2835               dest_y[x*rxstride] = s;
2836             }
2837         }
2838     }
2839 }
2840 #undef POW3
2841 #undef min
2842 #undef max
2843
2844 #endif
2845 #endif
2846