gcc: libgfortran/generated/matmul_c10.c
blob 462d71e23f5aa95d656aee8771f2aa9327964406
1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
5 This file is part of the GNU Fortran runtime library (libgfortran).
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
31 #if defined (HAVE_GFC_COMPLEX_10)
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34 passed to us by the front-end, in which case we call it for large
35 matrices. */
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_COMPLEX_10 *, const GFC_COMPLEX_10 *,
39 const int *, const GFC_COMPLEX_10 *, const int *,
40 const GFC_COMPLEX_10 *, GFC_COMPLEX_10 *, const int *,
41 int, int);
43 /* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
49 The equivalent Fortran pseudo-code is:
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
53 C = 0
54 DO J=1,N
55 DO K=1,COUNT
56 DO I=1,M
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
58 ELSE
59 DO J=1,N
60 DO I=1,M
61 S = 0
62 DO K=1,COUNT
63 S = S+A(I,K)*B(K,J)
64 C(I,J) = S
 65        ENDIF  */
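/* For illustration only (not part of the generated code): a minimal C sketch
   of the two loop orders described above, written for plain column-major
   double matrices with leading dimensions lda, ldb and ldc.  The name
   matmul_sketch is hypothetical; the real code below operates on
   GFC_COMPLEX_10 array descriptors instead.  */
#if 0
static void
matmul_sketch (int m, int n, int count, int a_is_transposed_temp,
               const double *a, int lda, const double *b, int ldb,
               double *c, int ldc)
{
  if (!a_is_transposed_temp)
    {
      /* Plain case: clear C, then accumulate with the row index innermost.  */
      for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++)
          c[i + j * ldc] = 0.0;
      for (int j = 0; j < n; j++)
        for (int k = 0; k < count; k++)
          for (int i = 0; i < m; i++)
            c[i + j * ldc] += a[i + k * lda] * b[k + j * ldb];
    }
  else
    {
      /* When A is a TRANSPOSE temporary, its strides make the K direction
         the contiguous one, so each element of C is computed as a single
         dot product with K innermost.  */
      for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++)
          {
            double s = 0.0;
            for (int k = 0; k < count; k++)
              s += a[i + k * lda] * b[k + j * ldb];
            c[i + j * ldc] = s;
          }
    }
}
#endif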
68 /* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
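/* (Concretely, the code below only calls gemm when rxstride is 1, A and B
   each have a unit stride, and xcount*ycount*count exceeds blas_limit cubed
   -- the POW3(blas_limit) test -- plus a few sanity checks on the leading
   dimensions.)  */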
72 extern void matmul_c10 (gfc_array_c10 * const restrict retarray,
73 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
74 int blas_limit, blas_call gemm);
75 export_proto(matmul_c10);
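/* A statement such as C = MATMUL(A, B) on COMPLEX(10) arrays is lowered by
   the front-end into a call to this routine, passing descriptors for the
   result and both operands; try_blas, blas_limit and gemm reflect the
   -fexternal-blas and -fblas-matmul-limit= options.  */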
 77 /* Put an exhaustive list of possible architectures here, ORed together. */
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
81 #ifdef HAVE_AVX
82 static void
83 matmul_c10_avx (gfc_array_c10 * const restrict retarray,
84 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
85 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86 static void
87 matmul_c10_avx (gfc_array_c10 * const restrict retarray,
88 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
89 int blas_limit, blas_call gemm)
91 const GFC_COMPLEX_10 * restrict abase;
92 const GFC_COMPLEX_10 * restrict bbase;
93 GFC_COMPLEX_10 * restrict dest;
95 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96 index_type x, y, n, count, xcount, ycount;
98 assert (GFC_DESCRIPTOR_RANK (a) == 2
99 || GFC_DESCRIPTOR_RANK (b) == 2);
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
103 Either A or B (but not both) can be rank 1:
105 o One-dimensional argument A is implicitly treated as a row matrix
106 dimensioned [1,count], so xcount=1.
108 o One-dimensional argument B is implicitly treated as a column matrix
 109        dimensioned [count, 1], so ycount=1.  */
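  /* Example (illustrative): MATMUL of a rank-1 A with extent 3 and a 3x4 B
     gives xcount=1, count=3, ycount=4 and a rank-1 result of extent 4; with
     a 3x4 A and a rank-1 B of extent 4 the result has extent 3, i.e.
     GFC_DESCRIPTOR_EXTENT(a,0).  */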
112 if (retarray->base_addr == NULL)
114 if (GFC_DESCRIPTOR_RANK (a) == 1)
116 GFC_DIMENSION_SET(retarray->dim[0], 0,
117 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
119 else if (GFC_DESCRIPTOR_RANK (b) == 1)
121 GFC_DIMENSION_SET(retarray->dim[0], 0,
122 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
124 else
126 GFC_DIMENSION_SET(retarray->dim[0], 0,
127 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
129 GFC_DIMENSION_SET(retarray->dim[1], 0,
130 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131 GFC_DESCRIPTOR_EXTENT(retarray,0));
134 retarray->base_addr
135 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
136 retarray->offset = 0;
138 else if (unlikely (compile_options.bounds_check))
140 index_type ret_extent, arg_extent;
142 if (GFC_DESCRIPTOR_RANK (a) == 1)
144 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146 if (arg_extent != ret_extent)
147 runtime_error ("Incorrect extent in return array in"
148 " MATMUL intrinsic: is %ld, should be %ld",
149 (long int) ret_extent, (long int) arg_extent);
151 else if (GFC_DESCRIPTOR_RANK (b) == 1)
153 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155 if (arg_extent != ret_extent)
156 runtime_error ("Incorrect extent in return array in"
157 " MATMUL intrinsic: is %ld, should be %ld",
158 (long int) ret_extent, (long int) arg_extent);
160 else
162 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164 if (arg_extent != ret_extent)
165 runtime_error ("Incorrect extent in return array in"
166 " MATMUL intrinsic for dimension 1:"
167 " is %ld, should be %ld",
168 (long int) ret_extent, (long int) arg_extent);
170 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
171 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
172 if (arg_extent != ret_extent)
173 runtime_error ("Incorrect extent in return array in"
174 " MATMUL intrinsic for dimension 2:"
175 " is %ld, should be %ld",
176 (long int) ret_extent, (long int) arg_extent);
181 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
183 /* One-dimensional result may be addressed in the code below
184 either as a row or a column matrix. We want both cases to
185 work. */
186 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
188 else
190 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
191 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
195 if (GFC_DESCRIPTOR_RANK (a) == 1)
 197       /* Treat it as a row matrix A[1,count]. */
198 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
199 aystride = 1;
201 xcount = 1;
202 count = GFC_DESCRIPTOR_EXTENT(a,0);
204 else
206 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
207 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
209 count = GFC_DESCRIPTOR_EXTENT(a,1);
210 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
213 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
215 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
216 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
219 if (GFC_DESCRIPTOR_RANK (b) == 1)
 221       /* Treat it as a column matrix B[count,1] */
222 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 224       /* bystride should never be used for 1-dimensional b.
 225          The value is only used for calculation of the
 226          memory needed for the buffer.  */
227 bystride = 256;
228 ycount = 1;
230 else
232 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
237 abase = a->base_addr;
238 bbase = b->base_addr;
239 dest = retarray->base_addr;
241 /* Now that everything is set up, we perform the multiplication
242 itself. */
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
248 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249 && (bxstride == 1 || bystride == 1)
250 && (((float) xcount) * ((float) ycount) * ((float) count)
251 > POW3(blas_limit)))
253 const int m = xcount, n = ycount, k = count, ldc = rystride;
254 const GFC_COMPLEX_10 one = 1, zero = 0;
255 const int lda = (axstride == 1) ? aystride : axstride,
256 ldb = (bxstride == 1) ? bystride : bxstride;
258 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
260 assert (gemm != NULL);
261 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
262 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
263 &ldc, 1, 1);
264 return;
268 if (rxstride == 1 && axstride == 1 && bxstride == 1)
270 /* This block of code implements a tuned matmul, derived from
271 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
273 Bo Kagstrom and Per Ling
274 Department of Computing Science
275 Umea University
276 S-901 87 Umea, Sweden
278 from netlib.org, translated to C, and modified for matmul.m4. */
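      /* Overview of the blocking below: columns of B/C are processed in
         blocks of 512 (jj), the inner dimension in blocks of 256 (ll) and
         rows of A/C in blocks of 256 (ii).  Each panel of A is packed into
         the contiguous buffer t1, and the innermost kernel keeps a 4x4
         block of C in the scalars f11..f44.  */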
280 const GFC_COMPLEX_10 *a, *b;
281 GFC_COMPLEX_10 *c;
282 const index_type m = xcount, n = ycount, k = count;
284 /* System generated locals */
285 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
286 i1, i2, i3, i4, i5, i6;
288 /* Local variables */
289 GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
290 f13, f14, f23, f24, f33, f34, f43, f44;
291 index_type i, j, l, ii, jj, ll;
292 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
293 GFC_COMPLEX_10 *t1;
295 a = abase;
296 b = bbase;
297 c = retarray->base_addr;
299 /* Parameter adjustments */
300 c_dim1 = rystride;
301 c_offset = 1 + c_dim1;
302 c -= c_offset;
303 a_dim1 = aystride;
304 a_offset = 1 + a_dim1;
305 a -= a_offset;
306 b_dim1 = bystride;
307 b_offset = 1 + b_dim1;
308 b -= b_offset;
310 /* Empty c first. */
311 for (j=1; j<=n; j++)
312 for (i=1; i<=m; i++)
313 c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
315 /* Early exit if possible */
316 if (m == 0 || n == 0 || k == 0)
317 return;
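      /* Note that c was already cleared above, so returning here for a zero
         inner dimension (k == 0) still leaves the zero result that MATMUL
         requires.  */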
319 /* Adjust size of t1 to what is needed. */
320 index_type t1_dim, a_sz;
321 if (aystride == 1)
322 a_sz = rystride;
323 else
324 a_sz = a_dim1;
326 t1_dim = a_sz * 256 + b_dim1;
327 if (t1_dim > 65536)
328 t1_dim = 65536;
330 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
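      /* t1 holds a packed copy of the current panel of a, laid out so that
         the l (inner dimension) index is contiguous: the element for
         (l - ll, i - ii) sits at t1[(l - ll) + 256*(i - ii)], which the
         1-based expressions below spell as
         t1[l - ll + 1 + ((i - ii + 1) << 8) - 257].  */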
332 /* Start turning the crank. */
333 i1 = n;
334 for (jj = 1; jj <= i1; jj += 512)
336 /* Computing MIN */
337 i2 = 512;
338 i3 = n - jj + 1;
339 jsec = min(i2,i3);
340 ujsec = jsec - jsec % 4;
341 i2 = k;
342 for (ll = 1; ll <= i2; ll += 256)
344 /* Computing MIN */
345 i3 = 256;
346 i4 = k - ll + 1;
347 lsec = min(i3,i4);
348 ulsec = lsec - lsec % 2;
350 i3 = m;
351 for (ii = 1; ii <= i3; ii += 256)
353 /* Computing MIN */
354 i4 = 256;
355 i5 = m - ii + 1;
356 isec = min(i4,i5);
357 uisec = isec - isec % 2;
358 i4 = ll + ulsec - 1;
359 for (l = ll; l <= i4; l += 2)
361 i5 = ii + uisec - 1;
362 for (i = ii; i <= i5; i += 2)
364 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
365 a[i + l * a_dim1];
366 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
367 a[i + (l + 1) * a_dim1];
368 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
369 a[i + 1 + l * a_dim1];
370 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
371 a[i + 1 + (l + 1) * a_dim1];
373 if (uisec < isec)
375 t1[l - ll + 1 + (isec << 8) - 257] =
376 a[ii + isec - 1 + l * a_dim1];
377 t1[l - ll + 2 + (isec << 8) - 257] =
378 a[ii + isec - 1 + (l + 1) * a_dim1];
381 if (ulsec < lsec)
383 i4 = ii + isec - 1;
384 for (i = ii; i<= i4; ++i)
386 t1[lsec + ((i - ii + 1) << 8) - 257] =
387 a[i + (ll + lsec - 1) * a_dim1];
391 uisec = isec - isec % 4;
392 i4 = jj + ujsec - 1;
393 for (j = jj; j <= i4; j += 4)
395 i5 = ii + uisec - 1;
396 for (i = ii; i <= i5; i += 4)
398 f11 = c[i + j * c_dim1];
399 f21 = c[i + 1 + j * c_dim1];
400 f12 = c[i + (j + 1) * c_dim1];
401 f22 = c[i + 1 + (j + 1) * c_dim1];
402 f13 = c[i + (j + 2) * c_dim1];
403 f23 = c[i + 1 + (j + 2) * c_dim1];
404 f14 = c[i + (j + 3) * c_dim1];
405 f24 = c[i + 1 + (j + 3) * c_dim1];
406 f31 = c[i + 2 + j * c_dim1];
407 f41 = c[i + 3 + j * c_dim1];
408 f32 = c[i + 2 + (j + 1) * c_dim1];
409 f42 = c[i + 3 + (j + 1) * c_dim1];
410 f33 = c[i + 2 + (j + 2) * c_dim1];
411 f43 = c[i + 3 + (j + 2) * c_dim1];
412 f34 = c[i + 2 + (j + 3) * c_dim1];
413 f44 = c[i + 3 + (j + 3) * c_dim1];
414 i6 = ll + lsec - 1;
415 for (l = ll; l <= i6; ++l)
417 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
418 * b[l + j * b_dim1];
419 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
420 * b[l + j * b_dim1];
421 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
422 * b[l + (j + 1) * b_dim1];
423 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
424 * b[l + (j + 1) * b_dim1];
425 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
426 * b[l + (j + 2) * b_dim1];
427 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
428 * b[l + (j + 2) * b_dim1];
429 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
430 * b[l + (j + 3) * b_dim1];
431 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
432 * b[l + (j + 3) * b_dim1];
433 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
434 * b[l + j * b_dim1];
435 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
436 * b[l + j * b_dim1];
437 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
438 * b[l + (j + 1) * b_dim1];
439 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
440 * b[l + (j + 1) * b_dim1];
441 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
442 * b[l + (j + 2) * b_dim1];
443 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
444 * b[l + (j + 2) * b_dim1];
445 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
446 * b[l + (j + 3) * b_dim1];
447 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
448 * b[l + (j + 3) * b_dim1];
450 c[i + j * c_dim1] = f11;
451 c[i + 1 + j * c_dim1] = f21;
452 c[i + (j + 1) * c_dim1] = f12;
453 c[i + 1 + (j + 1) * c_dim1] = f22;
454 c[i + (j + 2) * c_dim1] = f13;
455 c[i + 1 + (j + 2) * c_dim1] = f23;
456 c[i + (j + 3) * c_dim1] = f14;
457 c[i + 1 + (j + 3) * c_dim1] = f24;
458 c[i + 2 + j * c_dim1] = f31;
459 c[i + 3 + j * c_dim1] = f41;
460 c[i + 2 + (j + 1) * c_dim1] = f32;
461 c[i + 3 + (j + 1) * c_dim1] = f42;
462 c[i + 2 + (j + 2) * c_dim1] = f33;
463 c[i + 3 + (j + 2) * c_dim1] = f43;
464 c[i + 2 + (j + 3) * c_dim1] = f34;
465 c[i + 3 + (j + 3) * c_dim1] = f44;
467 if (uisec < isec)
469 i5 = ii + isec - 1;
470 for (i = ii + uisec; i <= i5; ++i)
472 f11 = c[i + j * c_dim1];
473 f12 = c[i + (j + 1) * c_dim1];
474 f13 = c[i + (j + 2) * c_dim1];
475 f14 = c[i + (j + 3) * c_dim1];
476 i6 = ll + lsec - 1;
477 for (l = ll; l <= i6; ++l)
479 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
480 257] * b[l + j * b_dim1];
481 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
482 257] * b[l + (j + 1) * b_dim1];
483 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
484 257] * b[l + (j + 2) * b_dim1];
485 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
486 257] * b[l + (j + 3) * b_dim1];
488 c[i + j * c_dim1] = f11;
489 c[i + (j + 1) * c_dim1] = f12;
490 c[i + (j + 2) * c_dim1] = f13;
491 c[i + (j + 3) * c_dim1] = f14;
495 if (ujsec < jsec)
497 i4 = jj + jsec - 1;
498 for (j = jj + ujsec; j <= i4; ++j)
500 i5 = ii + uisec - 1;
501 for (i = ii; i <= i5; i += 4)
503 f11 = c[i + j * c_dim1];
504 f21 = c[i + 1 + j * c_dim1];
505 f31 = c[i + 2 + j * c_dim1];
506 f41 = c[i + 3 + j * c_dim1];
507 i6 = ll + lsec - 1;
508 for (l = ll; l <= i6; ++l)
510 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
511 257] * b[l + j * b_dim1];
512 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
513 257] * b[l + j * b_dim1];
514 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
515 257] * b[l + j * b_dim1];
516 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
517 257] * b[l + j * b_dim1];
519 c[i + j * c_dim1] = f11;
520 c[i + 1 + j * c_dim1] = f21;
521 c[i + 2 + j * c_dim1] = f31;
522 c[i + 3 + j * c_dim1] = f41;
524 i5 = ii + isec - 1;
525 for (i = ii + uisec; i <= i5; ++i)
527 f11 = c[i + j * c_dim1];
528 i6 = ll + lsec - 1;
529 for (l = ll; l <= i6; ++l)
531 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
532 257] * b[l + j * b_dim1];
534 c[i + j * c_dim1] = f11;
541 free(t1);
542 return;
544 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
546 if (GFC_DESCRIPTOR_RANK (a) != 1)
548 const GFC_COMPLEX_10 *restrict abase_x;
549 const GFC_COMPLEX_10 *restrict bbase_y;
550 GFC_COMPLEX_10 *restrict dest_y;
551 GFC_COMPLEX_10 s;
553 for (y = 0; y < ycount; y++)
555 bbase_y = &bbase[y*bystride];
556 dest_y = &dest[y*rystride];
557 for (x = 0; x < xcount; x++)
559 abase_x = &abase[x*axstride];
560 s = (GFC_COMPLEX_10) 0;
561 for (n = 0; n < count; n++)
562 s += abase_x[n] * bbase_y[n];
563 dest_y[x] = s;
567 else
569 const GFC_COMPLEX_10 *restrict bbase_y;
570 GFC_COMPLEX_10 s;
572 for (y = 0; y < ycount; y++)
574 bbase_y = &bbase[y*bystride];
575 s = (GFC_COMPLEX_10) 0;
576 for (n = 0; n < count; n++)
577 s += abase[n*axstride] * bbase_y[n];
578 dest[y*rystride] = s;
582 else if (axstride < aystride)
584 for (y = 0; y < ycount; y++)
585 for (x = 0; x < xcount; x++)
586 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
588 for (y = 0; y < ycount; y++)
589 for (n = 0; n < count; n++)
590 for (x = 0; x < xcount; x++)
591 /* dest[x,y] += a[x,n] * b[n,y] */
592 dest[x*rxstride + y*rystride] +=
593 abase[x*axstride + n*aystride] *
594 bbase[n*bxstride + y*bystride];
596 else if (GFC_DESCRIPTOR_RANK (a) == 1)
598 const GFC_COMPLEX_10 *restrict bbase_y;
599 GFC_COMPLEX_10 s;
601 for (y = 0; y < ycount; y++)
603 bbase_y = &bbase[y*bystride];
604 s = (GFC_COMPLEX_10) 0;
605 for (n = 0; n < count; n++)
606 s += abase[n*axstride] * bbase_y[n*bxstride];
607 dest[y*rxstride] = s;
610 else
612 const GFC_COMPLEX_10 *restrict abase_x;
613 const GFC_COMPLEX_10 *restrict bbase_y;
614 GFC_COMPLEX_10 *restrict dest_y;
615 GFC_COMPLEX_10 s;
617 for (y = 0; y < ycount; y++)
619 bbase_y = &bbase[y*bystride];
620 dest_y = &dest[y*rystride];
621 for (x = 0; x < xcount; x++)
623 abase_x = &abase[x*axstride];
624 s = (GFC_COMPLEX_10) 0;
625 for (n = 0; n < count; n++)
626 s += abase_x[n*aystride] * bbase_y[n*bxstride];
627 dest_y[x*rxstride] = s;
632 #undef POW3
633 #undef min
634 #undef max
636 #endif /* HAVE_AVX */
638 #ifdef HAVE_AVX2
639 static void
640 matmul_c10_avx2 (gfc_array_c10 * const restrict retarray,
641 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
642 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
643 static void
644 matmul_c10_avx2 (gfc_array_c10 * const restrict retarray,
645 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
646 int blas_limit, blas_call gemm)
648 const GFC_COMPLEX_10 * restrict abase;
649 const GFC_COMPLEX_10 * restrict bbase;
650 GFC_COMPLEX_10 * restrict dest;
652 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
653 index_type x, y, n, count, xcount, ycount;
655 assert (GFC_DESCRIPTOR_RANK (a) == 2
656 || GFC_DESCRIPTOR_RANK (b) == 2);
658 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
660 Either A or B (but not both) can be rank 1:
662 o One-dimensional argument A is implicitly treated as a row matrix
663 dimensioned [1,count], so xcount=1.
665 o One-dimensional argument B is implicitly treated as a column matrix
 666        dimensioned [count, 1], so ycount=1.  */
669 if (retarray->base_addr == NULL)
671 if (GFC_DESCRIPTOR_RANK (a) == 1)
673 GFC_DIMENSION_SET(retarray->dim[0], 0,
674 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
676 else if (GFC_DESCRIPTOR_RANK (b) == 1)
678 GFC_DIMENSION_SET(retarray->dim[0], 0,
679 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
681 else
683 GFC_DIMENSION_SET(retarray->dim[0], 0,
684 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
686 GFC_DIMENSION_SET(retarray->dim[1], 0,
687 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
688 GFC_DESCRIPTOR_EXTENT(retarray,0));
691 retarray->base_addr
692 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
693 retarray->offset = 0;
695 else if (unlikely (compile_options.bounds_check))
697 index_type ret_extent, arg_extent;
699 if (GFC_DESCRIPTOR_RANK (a) == 1)
701 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
702 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
703 if (arg_extent != ret_extent)
704 runtime_error ("Incorrect extent in return array in"
705 " MATMUL intrinsic: is %ld, should be %ld",
706 (long int) ret_extent, (long int) arg_extent);
708 else if (GFC_DESCRIPTOR_RANK (b) == 1)
710 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
711 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
712 if (arg_extent != ret_extent)
713 runtime_error ("Incorrect extent in return array in"
714 " MATMUL intrinsic: is %ld, should be %ld",
715 (long int) ret_extent, (long int) arg_extent);
717 else
719 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
720 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
721 if (arg_extent != ret_extent)
722 runtime_error ("Incorrect extent in return array in"
723 " MATMUL intrinsic for dimension 1:"
724 " is %ld, should be %ld",
725 (long int) ret_extent, (long int) arg_extent);
727 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
728 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
729 if (arg_extent != ret_extent)
730 runtime_error ("Incorrect extent in return array in"
731 " MATMUL intrinsic for dimension 2:"
732 " is %ld, should be %ld",
733 (long int) ret_extent, (long int) arg_extent);
738 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
740 /* One-dimensional result may be addressed in the code below
741 either as a row or a column matrix. We want both cases to
742 work. */
743 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
745 else
747 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
748 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
752 if (GFC_DESCRIPTOR_RANK (a) == 1)
 754       /* Treat it as a row matrix A[1,count]. */
755 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
756 aystride = 1;
758 xcount = 1;
759 count = GFC_DESCRIPTOR_EXTENT(a,0);
761 else
763 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
764 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
766 count = GFC_DESCRIPTOR_EXTENT(a,1);
767 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
770 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
772 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
773 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
776 if (GFC_DESCRIPTOR_RANK (b) == 1)
 778       /* Treat it as a column matrix B[count,1] */
779 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 781       /* bystride should never be used for 1-dimensional b.
 782          The value is only used for calculation of the
 783          memory needed for the buffer.  */
784 bystride = 256;
785 ycount = 1;
787 else
789 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
790 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
791 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
794 abase = a->base_addr;
795 bbase = b->base_addr;
796 dest = retarray->base_addr;
798 /* Now that everything is set up, we perform the multiplication
799 itself. */
801 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
802 #define min(a,b) ((a) <= (b) ? (a) : (b))
803 #define max(a,b) ((a) >= (b) ? (a) : (b))
805 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
806 && (bxstride == 1 || bystride == 1)
807 && (((float) xcount) * ((float) ycount) * ((float) count)
808 > POW3(blas_limit)))
810 const int m = xcount, n = ycount, k = count, ldc = rystride;
811 const GFC_COMPLEX_10 one = 1, zero = 0;
812 const int lda = (axstride == 1) ? aystride : axstride,
813 ldb = (bxstride == 1) ? bystride : bxstride;
815 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
817 assert (gemm != NULL);
818 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
819 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
820 &ldc, 1, 1);
821 return;
825 if (rxstride == 1 && axstride == 1 && bxstride == 1)
827 /* This block of code implements a tuned matmul, derived from
828 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
830 Bo Kagstrom and Per Ling
831 Department of Computing Science
832 Umea University
833 S-901 87 Umea, Sweden
835 from netlib.org, translated to C, and modified for matmul.m4. */
837 const GFC_COMPLEX_10 *a, *b;
838 GFC_COMPLEX_10 *c;
839 const index_type m = xcount, n = ycount, k = count;
841 /* System generated locals */
842 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
843 i1, i2, i3, i4, i5, i6;
845 /* Local variables */
846 GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
847 f13, f14, f23, f24, f33, f34, f43, f44;
848 index_type i, j, l, ii, jj, ll;
849 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
850 GFC_COMPLEX_10 *t1;
852 a = abase;
853 b = bbase;
854 c = retarray->base_addr;
856 /* Parameter adjustments */
857 c_dim1 = rystride;
858 c_offset = 1 + c_dim1;
859 c -= c_offset;
860 a_dim1 = aystride;
861 a_offset = 1 + a_dim1;
862 a -= a_offset;
863 b_dim1 = bystride;
864 b_offset = 1 + b_dim1;
865 b -= b_offset;
867 /* Empty c first. */
868 for (j=1; j<=n; j++)
869 for (i=1; i<=m; i++)
870 c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
872 /* Early exit if possible */
873 if (m == 0 || n == 0 || k == 0)
874 return;
876 /* Adjust size of t1 to what is needed. */
877 index_type t1_dim, a_sz;
878 if (aystride == 1)
879 a_sz = rystride;
880 else
881 a_sz = a_dim1;
883 t1_dim = a_sz * 256 + b_dim1;
884 if (t1_dim > 65536)
885 t1_dim = 65536;
887 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
889 /* Start turning the crank. */
890 i1 = n;
891 for (jj = 1; jj <= i1; jj += 512)
893 /* Computing MIN */
894 i2 = 512;
895 i3 = n - jj + 1;
896 jsec = min(i2,i3);
897 ujsec = jsec - jsec % 4;
898 i2 = k;
899 for (ll = 1; ll <= i2; ll += 256)
901 /* Computing MIN */
902 i3 = 256;
903 i4 = k - ll + 1;
904 lsec = min(i3,i4);
905 ulsec = lsec - lsec % 2;
907 i3 = m;
908 for (ii = 1; ii <= i3; ii += 256)
910 /* Computing MIN */
911 i4 = 256;
912 i5 = m - ii + 1;
913 isec = min(i4,i5);
914 uisec = isec - isec % 2;
915 i4 = ll + ulsec - 1;
916 for (l = ll; l <= i4; l += 2)
918 i5 = ii + uisec - 1;
919 for (i = ii; i <= i5; i += 2)
921 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
922 a[i + l * a_dim1];
923 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
924 a[i + (l + 1) * a_dim1];
925 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
926 a[i + 1 + l * a_dim1];
927 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
928 a[i + 1 + (l + 1) * a_dim1];
930 if (uisec < isec)
932 t1[l - ll + 1 + (isec << 8) - 257] =
933 a[ii + isec - 1 + l * a_dim1];
934 t1[l - ll + 2 + (isec << 8) - 257] =
935 a[ii + isec - 1 + (l + 1) * a_dim1];
938 if (ulsec < lsec)
940 i4 = ii + isec - 1;
941 for (i = ii; i<= i4; ++i)
943 t1[lsec + ((i - ii + 1) << 8) - 257] =
944 a[i + (ll + lsec - 1) * a_dim1];
948 uisec = isec - isec % 4;
949 i4 = jj + ujsec - 1;
950 for (j = jj; j <= i4; j += 4)
952 i5 = ii + uisec - 1;
953 for (i = ii; i <= i5; i += 4)
955 f11 = c[i + j * c_dim1];
956 f21 = c[i + 1 + j * c_dim1];
957 f12 = c[i + (j + 1) * c_dim1];
958 f22 = c[i + 1 + (j + 1) * c_dim1];
959 f13 = c[i + (j + 2) * c_dim1];
960 f23 = c[i + 1 + (j + 2) * c_dim1];
961 f14 = c[i + (j + 3) * c_dim1];
962 f24 = c[i + 1 + (j + 3) * c_dim1];
963 f31 = c[i + 2 + j * c_dim1];
964 f41 = c[i + 3 + j * c_dim1];
965 f32 = c[i + 2 + (j + 1) * c_dim1];
966 f42 = c[i + 3 + (j + 1) * c_dim1];
967 f33 = c[i + 2 + (j + 2) * c_dim1];
968 f43 = c[i + 3 + (j + 2) * c_dim1];
969 f34 = c[i + 2 + (j + 3) * c_dim1];
970 f44 = c[i + 3 + (j + 3) * c_dim1];
971 i6 = ll + lsec - 1;
972 for (l = ll; l <= i6; ++l)
974 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
975 * b[l + j * b_dim1];
976 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
977 * b[l + j * b_dim1];
978 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
979 * b[l + (j + 1) * b_dim1];
980 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
981 * b[l + (j + 1) * b_dim1];
982 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
983 * b[l + (j + 2) * b_dim1];
984 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
985 * b[l + (j + 2) * b_dim1];
986 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
987 * b[l + (j + 3) * b_dim1];
988 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
989 * b[l + (j + 3) * b_dim1];
990 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
991 * b[l + j * b_dim1];
992 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
993 * b[l + j * b_dim1];
994 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
995 * b[l + (j + 1) * b_dim1];
996 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
997 * b[l + (j + 1) * b_dim1];
998 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
999 * b[l + (j + 2) * b_dim1];
1000 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1001 * b[l + (j + 2) * b_dim1];
1002 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1003 * b[l + (j + 3) * b_dim1];
1004 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1005 * b[l + (j + 3) * b_dim1];
1007 c[i + j * c_dim1] = f11;
1008 c[i + 1 + j * c_dim1] = f21;
1009 c[i + (j + 1) * c_dim1] = f12;
1010 c[i + 1 + (j + 1) * c_dim1] = f22;
1011 c[i + (j + 2) * c_dim1] = f13;
1012 c[i + 1 + (j + 2) * c_dim1] = f23;
1013 c[i + (j + 3) * c_dim1] = f14;
1014 c[i + 1 + (j + 3) * c_dim1] = f24;
1015 c[i + 2 + j * c_dim1] = f31;
1016 c[i + 3 + j * c_dim1] = f41;
1017 c[i + 2 + (j + 1) * c_dim1] = f32;
1018 c[i + 3 + (j + 1) * c_dim1] = f42;
1019 c[i + 2 + (j + 2) * c_dim1] = f33;
1020 c[i + 3 + (j + 2) * c_dim1] = f43;
1021 c[i + 2 + (j + 3) * c_dim1] = f34;
1022 c[i + 3 + (j + 3) * c_dim1] = f44;
1024 if (uisec < isec)
1026 i5 = ii + isec - 1;
1027 for (i = ii + uisec; i <= i5; ++i)
1029 f11 = c[i + j * c_dim1];
1030 f12 = c[i + (j + 1) * c_dim1];
1031 f13 = c[i + (j + 2) * c_dim1];
1032 f14 = c[i + (j + 3) * c_dim1];
1033 i6 = ll + lsec - 1;
1034 for (l = ll; l <= i6; ++l)
1036 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1037 257] * b[l + j * b_dim1];
1038 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1039 257] * b[l + (j + 1) * b_dim1];
1040 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1041 257] * b[l + (j + 2) * b_dim1];
1042 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1043 257] * b[l + (j + 3) * b_dim1];
1045 c[i + j * c_dim1] = f11;
1046 c[i + (j + 1) * c_dim1] = f12;
1047 c[i + (j + 2) * c_dim1] = f13;
1048 c[i + (j + 3) * c_dim1] = f14;
1052 if (ujsec < jsec)
1054 i4 = jj + jsec - 1;
1055 for (j = jj + ujsec; j <= i4; ++j)
1057 i5 = ii + uisec - 1;
1058 for (i = ii; i <= i5; i += 4)
1060 f11 = c[i + j * c_dim1];
1061 f21 = c[i + 1 + j * c_dim1];
1062 f31 = c[i + 2 + j * c_dim1];
1063 f41 = c[i + 3 + j * c_dim1];
1064 i6 = ll + lsec - 1;
1065 for (l = ll; l <= i6; ++l)
1067 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1068 257] * b[l + j * b_dim1];
1069 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1070 257] * b[l + j * b_dim1];
1071 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1072 257] * b[l + j * b_dim1];
1073 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1074 257] * b[l + j * b_dim1];
1076 c[i + j * c_dim1] = f11;
1077 c[i + 1 + j * c_dim1] = f21;
1078 c[i + 2 + j * c_dim1] = f31;
1079 c[i + 3 + j * c_dim1] = f41;
1081 i5 = ii + isec - 1;
1082 for (i = ii + uisec; i <= i5; ++i)
1084 f11 = c[i + j * c_dim1];
1085 i6 = ll + lsec - 1;
1086 for (l = ll; l <= i6; ++l)
1088 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1089 257] * b[l + j * b_dim1];
1091 c[i + j * c_dim1] = f11;
1098 free(t1);
1099 return;
1101 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1103 if (GFC_DESCRIPTOR_RANK (a) != 1)
1105 const GFC_COMPLEX_10 *restrict abase_x;
1106 const GFC_COMPLEX_10 *restrict bbase_y;
1107 GFC_COMPLEX_10 *restrict dest_y;
1108 GFC_COMPLEX_10 s;
1110 for (y = 0; y < ycount; y++)
1112 bbase_y = &bbase[y*bystride];
1113 dest_y = &dest[y*rystride];
1114 for (x = 0; x < xcount; x++)
1116 abase_x = &abase[x*axstride];
1117 s = (GFC_COMPLEX_10) 0;
1118 for (n = 0; n < count; n++)
1119 s += abase_x[n] * bbase_y[n];
1120 dest_y[x] = s;
1124 else
1126 const GFC_COMPLEX_10 *restrict bbase_y;
1127 GFC_COMPLEX_10 s;
1129 for (y = 0; y < ycount; y++)
1131 bbase_y = &bbase[y*bystride];
1132 s = (GFC_COMPLEX_10) 0;
1133 for (n = 0; n < count; n++)
1134 s += abase[n*axstride] * bbase_y[n];
1135 dest[y*rystride] = s;
1139 else if (axstride < aystride)
1141 for (y = 0; y < ycount; y++)
1142 for (x = 0; x < xcount; x++)
1143 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
1145 for (y = 0; y < ycount; y++)
1146 for (n = 0; n < count; n++)
1147 for (x = 0; x < xcount; x++)
1148 /* dest[x,y] += a[x,n] * b[n,y] */
1149 dest[x*rxstride + y*rystride] +=
1150 abase[x*axstride + n*aystride] *
1151 bbase[n*bxstride + y*bystride];
1153 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1155 const GFC_COMPLEX_10 *restrict bbase_y;
1156 GFC_COMPLEX_10 s;
1158 for (y = 0; y < ycount; y++)
1160 bbase_y = &bbase[y*bystride];
1161 s = (GFC_COMPLEX_10) 0;
1162 for (n = 0; n < count; n++)
1163 s += abase[n*axstride] * bbase_y[n*bxstride];
1164 dest[y*rxstride] = s;
1167 else
1169 const GFC_COMPLEX_10 *restrict abase_x;
1170 const GFC_COMPLEX_10 *restrict bbase_y;
1171 GFC_COMPLEX_10 *restrict dest_y;
1172 GFC_COMPLEX_10 s;
1174 for (y = 0; y < ycount; y++)
1176 bbase_y = &bbase[y*bystride];
1177 dest_y = &dest[y*rystride];
1178 for (x = 0; x < xcount; x++)
1180 abase_x = &abase[x*axstride];
1181 s = (GFC_COMPLEX_10) 0;
1182 for (n = 0; n < count; n++)
1183 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1184 dest_y[x*rxstride] = s;
1189 #undef POW3
1190 #undef min
1191 #undef max
1193 #endif /* HAVE_AVX2 */
1195 #ifdef HAVE_AVX512F
1196 static void
1197 matmul_c10_avx512f (gfc_array_c10 * const restrict retarray,
1198 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
1199 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1200 static void
1201 matmul_c10_avx512f (gfc_array_c10 * const restrict retarray,
1202 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
1203 int blas_limit, blas_call gemm)
1205 const GFC_COMPLEX_10 * restrict abase;
1206 const GFC_COMPLEX_10 * restrict bbase;
1207 GFC_COMPLEX_10 * restrict dest;
1209 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1210 index_type x, y, n, count, xcount, ycount;
1212 assert (GFC_DESCRIPTOR_RANK (a) == 2
1213 || GFC_DESCRIPTOR_RANK (b) == 2);
1215 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1217 Either A or B (but not both) can be rank 1:
1219 o One-dimensional argument A is implicitly treated as a row matrix
1220 dimensioned [1,count], so xcount=1.
1222 o One-dimensional argument B is implicitly treated as a column matrix
 1223        dimensioned [count, 1], so ycount=1.  */
1226 if (retarray->base_addr == NULL)
1228 if (GFC_DESCRIPTOR_RANK (a) == 1)
1230 GFC_DIMENSION_SET(retarray->dim[0], 0,
1231 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1233 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1235 GFC_DIMENSION_SET(retarray->dim[0], 0,
1236 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1238 else
1240 GFC_DIMENSION_SET(retarray->dim[0], 0,
1241 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1243 GFC_DIMENSION_SET(retarray->dim[1], 0,
1244 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1245 GFC_DESCRIPTOR_EXTENT(retarray,0));
1248 retarray->base_addr
1249 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
1250 retarray->offset = 0;
1252 else if (unlikely (compile_options.bounds_check))
1254 index_type ret_extent, arg_extent;
1256 if (GFC_DESCRIPTOR_RANK (a) == 1)
1258 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1259 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1260 if (arg_extent != ret_extent)
1261 runtime_error ("Incorrect extent in return array in"
1262 " MATMUL intrinsic: is %ld, should be %ld",
1263 (long int) ret_extent, (long int) arg_extent);
1265 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1267 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1268 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1269 if (arg_extent != ret_extent)
1270 runtime_error ("Incorrect extent in return array in"
1271 " MATMUL intrinsic: is %ld, should be %ld",
1272 (long int) ret_extent, (long int) arg_extent);
1274 else
1276 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1277 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1278 if (arg_extent != ret_extent)
1279 runtime_error ("Incorrect extent in return array in"
1280 " MATMUL intrinsic for dimension 1:"
1281 " is %ld, should be %ld",
1282 (long int) ret_extent, (long int) arg_extent);
1284 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1285 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1286 if (arg_extent != ret_extent)
1287 runtime_error ("Incorrect extent in return array in"
1288 " MATMUL intrinsic for dimension 2:"
1289 " is %ld, should be %ld",
1290 (long int) ret_extent, (long int) arg_extent);
1295 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1297 /* One-dimensional result may be addressed in the code below
1298 either as a row or a column matrix. We want both cases to
1299 work. */
1300 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1302 else
1304 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1305 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1309 if (GFC_DESCRIPTOR_RANK (a) == 1)
 1311       /* Treat it as a row matrix A[1,count]. */
1312 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1313 aystride = 1;
1315 xcount = 1;
1316 count = GFC_DESCRIPTOR_EXTENT(a,0);
1318 else
1320 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1321 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1323 count = GFC_DESCRIPTOR_EXTENT(a,1);
1324 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1327 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1329 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1330 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1333 if (GFC_DESCRIPTOR_RANK (b) == 1)
 1335       /* Treat it as a column matrix B[count,1] */
1336 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 1338       /* bystride should never be used for 1-dimensional b.
 1339          The value is only used for calculation of the
 1340          memory needed for the buffer.  */
1341 bystride = 256;
1342 ycount = 1;
1344 else
1346 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1347 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1348 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1351 abase = a->base_addr;
1352 bbase = b->base_addr;
1353 dest = retarray->base_addr;
1355 /* Now that everything is set up, we perform the multiplication
1356 itself. */
1358 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1359 #define min(a,b) ((a) <= (b) ? (a) : (b))
1360 #define max(a,b) ((a) >= (b) ? (a) : (b))
1362 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1363 && (bxstride == 1 || bystride == 1)
1364 && (((float) xcount) * ((float) ycount) * ((float) count)
1365 > POW3(blas_limit)))
1367 const int m = xcount, n = ycount, k = count, ldc = rystride;
1368 const GFC_COMPLEX_10 one = 1, zero = 0;
1369 const int lda = (axstride == 1) ? aystride : axstride,
1370 ldb = (bxstride == 1) ? bystride : bxstride;
1372 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1374 assert (gemm != NULL);
1375 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1376 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1377 &ldc, 1, 1);
1378 return;
1382 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1384 /* This block of code implements a tuned matmul, derived from
1385 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1387 Bo Kagstrom and Per Ling
1388 Department of Computing Science
1389 Umea University
1390 S-901 87 Umea, Sweden
1392 from netlib.org, translated to C, and modified for matmul.m4. */
1394 const GFC_COMPLEX_10 *a, *b;
1395 GFC_COMPLEX_10 *c;
1396 const index_type m = xcount, n = ycount, k = count;
1398 /* System generated locals */
1399 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1400 i1, i2, i3, i4, i5, i6;
1402 /* Local variables */
1403 GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
1404 f13, f14, f23, f24, f33, f34, f43, f44;
1405 index_type i, j, l, ii, jj, ll;
1406 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1407 GFC_COMPLEX_10 *t1;
1409 a = abase;
1410 b = bbase;
1411 c = retarray->base_addr;
1413 /* Parameter adjustments */
1414 c_dim1 = rystride;
1415 c_offset = 1 + c_dim1;
1416 c -= c_offset;
1417 a_dim1 = aystride;
1418 a_offset = 1 + a_dim1;
1419 a -= a_offset;
1420 b_dim1 = bystride;
1421 b_offset = 1 + b_dim1;
1422 b -= b_offset;
1424 /* Empty c first. */
1425 for (j=1; j<=n; j++)
1426 for (i=1; i<=m; i++)
1427 c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
1429 /* Early exit if possible */
1430 if (m == 0 || n == 0 || k == 0)
1431 return;
1433 /* Adjust size of t1 to what is needed. */
1434 index_type t1_dim, a_sz;
1435 if (aystride == 1)
1436 a_sz = rystride;
1437 else
1438 a_sz = a_dim1;
1440 t1_dim = a_sz * 256 + b_dim1;
1441 if (t1_dim > 65536)
1442 t1_dim = 65536;
1444 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
1446 /* Start turning the crank. */
1447 i1 = n;
1448 for (jj = 1; jj <= i1; jj += 512)
1450 /* Computing MIN */
1451 i2 = 512;
1452 i3 = n - jj + 1;
1453 jsec = min(i2,i3);
1454 ujsec = jsec - jsec % 4;
1455 i2 = k;
1456 for (ll = 1; ll <= i2; ll += 256)
1458 /* Computing MIN */
1459 i3 = 256;
1460 i4 = k - ll + 1;
1461 lsec = min(i3,i4);
1462 ulsec = lsec - lsec % 2;
1464 i3 = m;
1465 for (ii = 1; ii <= i3; ii += 256)
1467 /* Computing MIN */
1468 i4 = 256;
1469 i5 = m - ii + 1;
1470 isec = min(i4,i5);
1471 uisec = isec - isec % 2;
1472 i4 = ll + ulsec - 1;
1473 for (l = ll; l <= i4; l += 2)
1475 i5 = ii + uisec - 1;
1476 for (i = ii; i <= i5; i += 2)
1478 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1479 a[i + l * a_dim1];
1480 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1481 a[i + (l + 1) * a_dim1];
1482 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1483 a[i + 1 + l * a_dim1];
1484 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1485 a[i + 1 + (l + 1) * a_dim1];
1487 if (uisec < isec)
1489 t1[l - ll + 1 + (isec << 8) - 257] =
1490 a[ii + isec - 1 + l * a_dim1];
1491 t1[l - ll + 2 + (isec << 8) - 257] =
1492 a[ii + isec - 1 + (l + 1) * a_dim1];
1495 if (ulsec < lsec)
1497 i4 = ii + isec - 1;
1498 for (i = ii; i<= i4; ++i)
1500 t1[lsec + ((i - ii + 1) << 8) - 257] =
1501 a[i + (ll + lsec - 1) * a_dim1];
1505 uisec = isec - isec % 4;
1506 i4 = jj + ujsec - 1;
1507 for (j = jj; j <= i4; j += 4)
1509 i5 = ii + uisec - 1;
1510 for (i = ii; i <= i5; i += 4)
1512 f11 = c[i + j * c_dim1];
1513 f21 = c[i + 1 + j * c_dim1];
1514 f12 = c[i + (j + 1) * c_dim1];
1515 f22 = c[i + 1 + (j + 1) * c_dim1];
1516 f13 = c[i + (j + 2) * c_dim1];
1517 f23 = c[i + 1 + (j + 2) * c_dim1];
1518 f14 = c[i + (j + 3) * c_dim1];
1519 f24 = c[i + 1 + (j + 3) * c_dim1];
1520 f31 = c[i + 2 + j * c_dim1];
1521 f41 = c[i + 3 + j * c_dim1];
1522 f32 = c[i + 2 + (j + 1) * c_dim1];
1523 f42 = c[i + 3 + (j + 1) * c_dim1];
1524 f33 = c[i + 2 + (j + 2) * c_dim1];
1525 f43 = c[i + 3 + (j + 2) * c_dim1];
1526 f34 = c[i + 2 + (j + 3) * c_dim1];
1527 f44 = c[i + 3 + (j + 3) * c_dim1];
1528 i6 = ll + lsec - 1;
1529 for (l = ll; l <= i6; ++l)
1531 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1532 * b[l + j * b_dim1];
1533 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1534 * b[l + j * b_dim1];
1535 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1536 * b[l + (j + 1) * b_dim1];
1537 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1538 * b[l + (j + 1) * b_dim1];
1539 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1540 * b[l + (j + 2) * b_dim1];
1541 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1542 * b[l + (j + 2) * b_dim1];
1543 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1544 * b[l + (j + 3) * b_dim1];
1545 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1546 * b[l + (j + 3) * b_dim1];
1547 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1548 * b[l + j * b_dim1];
1549 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1550 * b[l + j * b_dim1];
1551 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1552 * b[l + (j + 1) * b_dim1];
1553 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1554 * b[l + (j + 1) * b_dim1];
1555 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1556 * b[l + (j + 2) * b_dim1];
1557 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1558 * b[l + (j + 2) * b_dim1];
1559 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1560 * b[l + (j + 3) * b_dim1];
1561 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1562 * b[l + (j + 3) * b_dim1];
1564 c[i + j * c_dim1] = f11;
1565 c[i + 1 + j * c_dim1] = f21;
1566 c[i + (j + 1) * c_dim1] = f12;
1567 c[i + 1 + (j + 1) * c_dim1] = f22;
1568 c[i + (j + 2) * c_dim1] = f13;
1569 c[i + 1 + (j + 2) * c_dim1] = f23;
1570 c[i + (j + 3) * c_dim1] = f14;
1571 c[i + 1 + (j + 3) * c_dim1] = f24;
1572 c[i + 2 + j * c_dim1] = f31;
1573 c[i + 3 + j * c_dim1] = f41;
1574 c[i + 2 + (j + 1) * c_dim1] = f32;
1575 c[i + 3 + (j + 1) * c_dim1] = f42;
1576 c[i + 2 + (j + 2) * c_dim1] = f33;
1577 c[i + 3 + (j + 2) * c_dim1] = f43;
1578 c[i + 2 + (j + 3) * c_dim1] = f34;
1579 c[i + 3 + (j + 3) * c_dim1] = f44;
1581 if (uisec < isec)
1583 i5 = ii + isec - 1;
1584 for (i = ii + uisec; i <= i5; ++i)
1586 f11 = c[i + j * c_dim1];
1587 f12 = c[i + (j + 1) * c_dim1];
1588 f13 = c[i + (j + 2) * c_dim1];
1589 f14 = c[i + (j + 3) * c_dim1];
1590 i6 = ll + lsec - 1;
1591 for (l = ll; l <= i6; ++l)
1593 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1594 257] * b[l + j * b_dim1];
1595 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1596 257] * b[l + (j + 1) * b_dim1];
1597 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1598 257] * b[l + (j + 2) * b_dim1];
1599 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1600 257] * b[l + (j + 3) * b_dim1];
1602 c[i + j * c_dim1] = f11;
1603 c[i + (j + 1) * c_dim1] = f12;
1604 c[i + (j + 2) * c_dim1] = f13;
1605 c[i + (j + 3) * c_dim1] = f14;
1609 if (ujsec < jsec)
1611 i4 = jj + jsec - 1;
1612 for (j = jj + ujsec; j <= i4; ++j)
1614 i5 = ii + uisec - 1;
1615 for (i = ii; i <= i5; i += 4)
1617 f11 = c[i + j * c_dim1];
1618 f21 = c[i + 1 + j * c_dim1];
1619 f31 = c[i + 2 + j * c_dim1];
1620 f41 = c[i + 3 + j * c_dim1];
1621 i6 = ll + lsec - 1;
1622 for (l = ll; l <= i6; ++l)
1624 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1625 257] * b[l + j * b_dim1];
1626 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1627 257] * b[l + j * b_dim1];
1628 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1629 257] * b[l + j * b_dim1];
1630 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1631 257] * b[l + j * b_dim1];
1633 c[i + j * c_dim1] = f11;
1634 c[i + 1 + j * c_dim1] = f21;
1635 c[i + 2 + j * c_dim1] = f31;
1636 c[i + 3 + j * c_dim1] = f41;
1638 i5 = ii + isec - 1;
1639 for (i = ii + uisec; i <= i5; ++i)
1641 f11 = c[i + j * c_dim1];
1642 i6 = ll + lsec - 1;
1643 for (l = ll; l <= i6; ++l)
1645 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1646 257] * b[l + j * b_dim1];
1648 c[i + j * c_dim1] = f11;
1655 free(t1);
1656 return;
1658 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1660 if (GFC_DESCRIPTOR_RANK (a) != 1)
1662 const GFC_COMPLEX_10 *restrict abase_x;
1663 const GFC_COMPLEX_10 *restrict bbase_y;
1664 GFC_COMPLEX_10 *restrict dest_y;
1665 GFC_COMPLEX_10 s;
1667 for (y = 0; y < ycount; y++)
1669 bbase_y = &bbase[y*bystride];
1670 dest_y = &dest[y*rystride];
1671 for (x = 0; x < xcount; x++)
1673 abase_x = &abase[x*axstride];
1674 s = (GFC_COMPLEX_10) 0;
1675 for (n = 0; n < count; n++)
1676 s += abase_x[n] * bbase_y[n];
1677 dest_y[x] = s;
1681 else
1683 const GFC_COMPLEX_10 *restrict bbase_y;
1684 GFC_COMPLEX_10 s;
1686 for (y = 0; y < ycount; y++)
1688 bbase_y = &bbase[y*bystride];
1689 s = (GFC_COMPLEX_10) 0;
1690 for (n = 0; n < count; n++)
1691 s += abase[n*axstride] * bbase_y[n];
1692 dest[y*rystride] = s;
1696 else if (axstride < aystride)
1698 for (y = 0; y < ycount; y++)
1699 for (x = 0; x < xcount; x++)
1700 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
1702 for (y = 0; y < ycount; y++)
1703 for (n = 0; n < count; n++)
1704 for (x = 0; x < xcount; x++)
1705 /* dest[x,y] += a[x,n] * b[n,y] */
1706 dest[x*rxstride + y*rystride] +=
1707 abase[x*axstride + n*aystride] *
1708 bbase[n*bxstride + y*bystride];
1710 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1712 const GFC_COMPLEX_10 *restrict bbase_y;
1713 GFC_COMPLEX_10 s;
1715 for (y = 0; y < ycount; y++)
1717 bbase_y = &bbase[y*bystride];
1718 s = (GFC_COMPLEX_10) 0;
1719 for (n = 0; n < count; n++)
1720 s += abase[n*axstride] * bbase_y[n*bxstride];
1721 dest[y*rxstride] = s;
1724 else
1726 const GFC_COMPLEX_10 *restrict abase_x;
1727 const GFC_COMPLEX_10 *restrict bbase_y;
1728 GFC_COMPLEX_10 *restrict dest_y;
1729 GFC_COMPLEX_10 s;
1731 for (y = 0; y < ycount; y++)
1733 bbase_y = &bbase[y*bystride];
1734 dest_y = &dest[y*rystride];
1735 for (x = 0; x < xcount; x++)
1737 abase_x = &abase[x*axstride];
1738 s = (GFC_COMPLEX_10) 0;
1739 for (n = 0; n < count; n++)
1740 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1741 dest_y[x*rxstride] = s;
1746 #undef POW3
1747 #undef min
1748 #undef max
1750 #endif /* HAVE_AVX512F */
 1752 /* AMD-specific functions with AVX128 and FMA3/FMA4. */
1754 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1755 void
1756 matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray,
1757 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
1758 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1759 internal_proto(matmul_c10_avx128_fma3);
1760 #endif
1762 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1763 void
1764 matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray,
1765 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
1766 int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1767 internal_proto(matmul_c10_avx128_fma4);
1768 #endif
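/* Only prototypes appear here: these AVX128/FMA variants are declared with
   internal_proto and are presumably defined in a separate, AVX128-specific
   source file, so that the processor-specific dispatch can select them at
   runtime.  */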
1770 /* Function to fall back to if there is no special processor-specific version. */
1771 static void
1772 matmul_c10_vanilla (gfc_array_c10 * const restrict retarray,
1773 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
1774 int blas_limit, blas_call gemm)
1776 const GFC_COMPLEX_10 * restrict abase;
1777 const GFC_COMPLEX_10 * restrict bbase;
1778 GFC_COMPLEX_10 * restrict dest;
1780 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1781 index_type x, y, n, count, xcount, ycount;
1783 assert (GFC_DESCRIPTOR_RANK (a) == 2
1784 || GFC_DESCRIPTOR_RANK (b) == 2);
1786 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1788 Either A or B (but not both) can be rank 1:
1790 o One-dimensional argument A is implicitly treated as a row matrix
1791 dimensioned [1,count], so xcount=1.
1793 o One-dimensional argument B is implicitly treated as a column matrix
 1794        dimensioned [count, 1], so ycount=1.  */
1797 if (retarray->base_addr == NULL)
1799 if (GFC_DESCRIPTOR_RANK (a) == 1)
1801 GFC_DIMENSION_SET(retarray->dim[0], 0,
1802 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1804 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1806 GFC_DIMENSION_SET(retarray->dim[0], 0,
1807 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1809 else
1811 GFC_DIMENSION_SET(retarray->dim[0], 0,
1812 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1814 GFC_DIMENSION_SET(retarray->dim[1], 0,
1815 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1816 GFC_DESCRIPTOR_EXTENT(retarray,0));
1819 retarray->base_addr
1820 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
1821 retarray->offset = 0;
1823 else if (unlikely (compile_options.bounds_check))
1825 index_type ret_extent, arg_extent;
1827 if (GFC_DESCRIPTOR_RANK (a) == 1)
1829 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1830 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1831 if (arg_extent != ret_extent)
1832 runtime_error ("Incorrect extent in return array in"
1833 " MATMUL intrinsic: is %ld, should be %ld",
1834 (long int) ret_extent, (long int) arg_extent);
1836 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1838 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1839 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1840 if (arg_extent != ret_extent)
1841 runtime_error ("Incorrect extent in return array in"
1842 " MATMUL intrinsic: is %ld, should be %ld",
1843 (long int) ret_extent, (long int) arg_extent);
1845 else
1847 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1848 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1849 if (arg_extent != ret_extent)
1850 runtime_error ("Incorrect extent in return array in"
1851 " MATMUL intrinsic for dimension 1:"
1852 " is %ld, should be %ld",
1853 (long int) ret_extent, (long int) arg_extent);
1855 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1856 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1857 if (arg_extent != ret_extent)
1858 runtime_error ("Incorrect extent in return array in"
1859 " MATMUL intrinsic for dimension 2:"
1860 " is %ld, should be %ld",
1861 (long int) ret_extent, (long int) arg_extent);
1866 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1868 /* One-dimensional result may be addressed in the code below
1869 either as a row or a column matrix. We want both cases to
1870 work. */
1871 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1873 else
1875 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1876 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1880 if (GFC_DESCRIPTOR_RANK (a) == 1)
 1882       /* Treat it as a row matrix A[1,count]. */
1883 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1884 aystride = 1;
1886 xcount = 1;
1887 count = GFC_DESCRIPTOR_EXTENT(a,0);
1889 else
1891 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1892 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1894 count = GFC_DESCRIPTOR_EXTENT(a,1);
1895 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1898 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1900 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1901 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1904 if (GFC_DESCRIPTOR_RANK (b) == 1)
 1906       /* Treat it as a column matrix B[count,1] */
1907 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 1909       /* bystride should never be used for 1-dimensional b.
 1910          The value is only used for calculation of the
 1911          memory needed for the buffer.  */
1912 bystride = 256;
1913 ycount = 1;
1915 else
1917 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1918 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1919 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1922 abase = a->base_addr;
1923 bbase = b->base_addr;
1924 dest = retarray->base_addr;
1926 /* Now that everything is set up, we perform the multiplication
1927 itself. */
1929 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1930 #define min(a,b) ((a) <= (b) ? (a) : (b))
1931 #define max(a,b) ((a) >= (b) ? (a) : (b))
1933 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1934 && (bxstride == 1 || bystride == 1)
1935 && (((float) xcount) * ((float) ycount) * ((float) count)
1936 > POW3(blas_limit)))
1938 const int m = xcount, n = ycount, k = count, ldc = rystride;
1939 const GFC_COMPLEX_10 one = 1, zero = 0;
1940 const int lda = (axstride == 1) ? aystride : axstride,
1941 ldb = (bxstride == 1) ? bystride : bxstride;
1943 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1945 assert (gemm != NULL);
1946 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1947 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1948 &ldc, 1, 1);
1949 return;
1953 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1955 /* This block of code implements a tuned matmul, derived from
1956 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1958 Bo Kagstrom and Per Ling
1959 Department of Computing Science
1960 Umea University
1961 S-901 87 Umea, Sweden
1963 from netlib.org, translated to C, and modified for matmul.m4. */
1965 const GFC_COMPLEX_10 *a, *b;
1966 GFC_COMPLEX_10 *c;
1967 const index_type m = xcount, n = ycount, k = count;
1969 /* System generated locals */
1970 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1971 i1, i2, i3, i4, i5, i6;
1973 /* Local variables */
1974 GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
1975 f13, f14, f23, f24, f33, f34, f43, f44;
1976 index_type i, j, l, ii, jj, ll;
1977 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1978 GFC_COMPLEX_10 *t1;
1980 a = abase;
1981 b = bbase;
1982 c = retarray->base_addr;
1984 /* Parameter adjustments */
1985 c_dim1 = rystride;
1986 c_offset = 1 + c_dim1;
1987 c -= c_offset;
1988 a_dim1 = aystride;
1989 a_offset = 1 + a_dim1;
1990 a -= a_offset;
1991 b_dim1 = bystride;
1992 b_offset = 1 + b_dim1;
1993 b -= b_offset;
1995 /* Empty c first. */
1996 for (j=1; j<=n; j++)
1997 for (i=1; i<=m; i++)
1998 c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
2000 /* Early exit if possible */
2001 if (m == 0 || n == 0 || k == 0)
2002 return;
2004 /* Adjust size of t1 to what is needed. */
2005 index_type t1_dim, a_sz;
2006 if (aystride == 1)
2007 a_sz = rystride;
2008 else
2009 a_sz = a_dim1;
2011 t1_dim = a_sz * 256 + b_dim1;
2012 if (t1_dim > 65536)
2013 t1_dim = 65536;
2015 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
2017 /* Start turning the crank. */
2018 i1 = n;
2019 for (jj = 1; jj <= i1; jj += 512)
2021 /* Computing MIN */
2022 i2 = 512;
2023 i3 = n - jj + 1;
2024 jsec = min(i2,i3);
2025 ujsec = jsec - jsec % 4;
2026 i2 = k;
2027 for (ll = 1; ll <= i2; ll += 256)
2029 /* Computing MIN */
2030 i3 = 256;
2031 i4 = k - ll + 1;
2032 lsec = min(i3,i4);
2033 ulsec = lsec - lsec % 2;
2035 i3 = m;
2036 for (ii = 1; ii <= i3; ii += 256)
2038 /* Computing MIN */
2039 i4 = 256;
2040 i5 = m - ii + 1;
2041 isec = min(i4,i5);
2042 uisec = isec - isec % 2;
2043 i4 = ll + ulsec - 1;
2044 for (l = ll; l <= i4; l += 2)
2046 i5 = ii + uisec - 1;
2047 for (i = ii; i <= i5; i += 2)
2049 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2050 a[i + l * a_dim1];
2051 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2052 a[i + (l + 1) * a_dim1];
2053 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2054 a[i + 1 + l * a_dim1];
2055 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2056 a[i + 1 + (l + 1) * a_dim1];
2058 if (uisec < isec)
2060 t1[l - ll + 1 + (isec << 8) - 257] =
2061 a[ii + isec - 1 + l * a_dim1];
2062 t1[l - ll + 2 + (isec << 8) - 257] =
2063 a[ii + isec - 1 + (l + 1) * a_dim1];
2066 if (ulsec < lsec)
2068 i4 = ii + isec - 1;
2069 for (i = ii; i<= i4; ++i)
2071 t1[lsec + ((i - ii + 1) << 8) - 257] =
2072 a[i + (ll + lsec - 1) * a_dim1];
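/* Main kernel for this block: uisec and ujsec are the parts of the
   block that are multiples of 4; within them f11..f44 hold the 4x4
   tile c(i..i+3, j..j+3) in registers across the l loop, while the
   uisec < isec and ujsec < jsec branches handle the leftover rows
   and columns.  */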
2076 uisec = isec - isec % 4;
2077 i4 = jj + ujsec - 1;
2078 for (j = jj; j <= i4; j += 4)
2080 i5 = ii + uisec - 1;
2081 for (i = ii; i <= i5; i += 4)
2083 f11 = c[i + j * c_dim1];
2084 f21 = c[i + 1 + j * c_dim1];
2085 f12 = c[i + (j + 1) * c_dim1];
2086 f22 = c[i + 1 + (j + 1) * c_dim1];
2087 f13 = c[i + (j + 2) * c_dim1];
2088 f23 = c[i + 1 + (j + 2) * c_dim1];
2089 f14 = c[i + (j + 3) * c_dim1];
2090 f24 = c[i + 1 + (j + 3) * c_dim1];
2091 f31 = c[i + 2 + j * c_dim1];
2092 f41 = c[i + 3 + j * c_dim1];
2093 f32 = c[i + 2 + (j + 1) * c_dim1];
2094 f42 = c[i + 3 + (j + 1) * c_dim1];
2095 f33 = c[i + 2 + (j + 2) * c_dim1];
2096 f43 = c[i + 3 + (j + 2) * c_dim1];
2097 f34 = c[i + 2 + (j + 3) * c_dim1];
2098 f44 = c[i + 3 + (j + 3) * c_dim1];
2099 i6 = ll + lsec - 1;
2100 for (l = ll; l <= i6; ++l)
2102 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2103 * b[l + j * b_dim1];
2104 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2105 * b[l + j * b_dim1];
2106 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2107 * b[l + (j + 1) * b_dim1];
2108 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2109 * b[l + (j + 1) * b_dim1];
2110 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2111 * b[l + (j + 2) * b_dim1];
2112 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2113 * b[l + (j + 2) * b_dim1];
2114 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2115 * b[l + (j + 3) * b_dim1];
2116 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2117 * b[l + (j + 3) * b_dim1];
2118 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2119 * b[l + j * b_dim1];
2120 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2121 * b[l + j * b_dim1];
2122 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2123 * b[l + (j + 1) * b_dim1];
2124 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2125 * b[l + (j + 1) * b_dim1];
2126 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2127 * b[l + (j + 2) * b_dim1];
2128 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2129 * b[l + (j + 2) * b_dim1];
2130 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2131 * b[l + (j + 3) * b_dim1];
2132 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2133 * b[l + (j + 3) * b_dim1];
2135 c[i + j * c_dim1] = f11;
2136 c[i + 1 + j * c_dim1] = f21;
2137 c[i + (j + 1) * c_dim1] = f12;
2138 c[i + 1 + (j + 1) * c_dim1] = f22;
2139 c[i + (j + 2) * c_dim1] = f13;
2140 c[i + 1 + (j + 2) * c_dim1] = f23;
2141 c[i + (j + 3) * c_dim1] = f14;
2142 c[i + 1 + (j + 3) * c_dim1] = f24;
2143 c[i + 2 + j * c_dim1] = f31;
2144 c[i + 3 + j * c_dim1] = f41;
2145 c[i + 2 + (j + 1) * c_dim1] = f32;
2146 c[i + 3 + (j + 1) * c_dim1] = f42;
2147 c[i + 2 + (j + 2) * c_dim1] = f33;
2148 c[i + 3 + (j + 2) * c_dim1] = f43;
2149 c[i + 2 + (j + 3) * c_dim1] = f34;
2150 c[i + 3 + (j + 3) * c_dim1] = f44;
2152 if (uisec < isec)
2154 i5 = ii + isec - 1;
2155 for (i = ii + uisec; i <= i5; ++i)
2157 f11 = c[i + j * c_dim1];
2158 f12 = c[i + (j + 1) * c_dim1];
2159 f13 = c[i + (j + 2) * c_dim1];
2160 f14 = c[i + (j + 3) * c_dim1];
2161 i6 = ll + lsec - 1;
2162 for (l = ll; l <= i6; ++l)
2164 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2165 257] * b[l + j * b_dim1];
2166 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2167 257] * b[l + (j + 1) * b_dim1];
2168 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2169 257] * b[l + (j + 2) * b_dim1];
2170 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2171 257] * b[l + (j + 3) * b_dim1];
2173 c[i + j * c_dim1] = f11;
2174 c[i + (j + 1) * c_dim1] = f12;
2175 c[i + (j + 2) * c_dim1] = f13;
2176 c[i + (j + 3) * c_dim1] = f14;
2180 if (ujsec < jsec)
2182 i4 = jj + jsec - 1;
2183 for (j = jj + ujsec; j <= i4; ++j)
2185 i5 = ii + uisec - 1;
2186 for (i = ii; i <= i5; i += 4)
2188 f11 = c[i + j * c_dim1];
2189 f21 = c[i + 1 + j * c_dim1];
2190 f31 = c[i + 2 + j * c_dim1];
2191 f41 = c[i + 3 + j * c_dim1];
2192 i6 = ll + lsec - 1;
2193 for (l = ll; l <= i6; ++l)
2195 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2196 257] * b[l + j * b_dim1];
2197 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2198 257] * b[l + j * b_dim1];
2199 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2200 257] * b[l + j * b_dim1];
2201 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2202 257] * b[l + j * b_dim1];
2204 c[i + j * c_dim1] = f11;
2205 c[i + 1 + j * c_dim1] = f21;
2206 c[i + 2 + j * c_dim1] = f31;
2207 c[i + 3 + j * c_dim1] = f41;
2209 i5 = ii + isec - 1;
2210 for (i = ii + uisec; i <= i5; ++i)
2212 f11 = c[i + j * c_dim1];
2213 i6 = ll + lsec - 1;
2214 for (l = ll; l <= i6; ++l)
2216 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2217 257] * b[l + j * b_dim1];
2219 c[i + j * c_dim1] = f11;
2226 free(t1);
2227 return;
2229 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2231 if (GFC_DESCRIPTOR_RANK (a) != 1)
2233 const GFC_COMPLEX_10 *restrict abase_x;
2234 const GFC_COMPLEX_10 *restrict bbase_y;
2235 GFC_COMPLEX_10 *restrict dest_y;
2236 GFC_COMPLEX_10 s;
2238 for (y = 0; y < ycount; y++)
2240 bbase_y = &bbase[y*bystride];
2241 dest_y = &dest[y*rystride];
2242 for (x = 0; x < xcount; x++)
2244 abase_x = &abase[x*axstride];
2245 s = (GFC_COMPLEX_10) 0;
2246 for (n = 0; n < count; n++)
2247 s += abase_x[n] * bbase_y[n];
2248 dest_y[x] = s;
2252 else
2254 const GFC_COMPLEX_10 *restrict bbase_y;
2255 GFC_COMPLEX_10 s;
2257 for (y = 0; y < ycount; y++)
2259 bbase_y = &bbase[y*bystride];
2260 s = (GFC_COMPLEX_10) 0;
2261 for (n = 0; n < count; n++)
2262 s += abase[n*axstride] * bbase_y[n];
2263 dest[y*rystride] = s;
2267 else if (axstride < aystride)
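/* Here the first dimension of a varies fastest in memory, so after
   zeroing dest the x loop is kept innermost: consecutive iterations
   step through a by its smaller stride (axstride).  */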
2269 for (y = 0; y < ycount; y++)
2270 for (x = 0; x < xcount; x++)
2271 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
2273 for (y = 0; y < ycount; y++)
2274 for (n = 0; n < count; n++)
2275 for (x = 0; x < xcount; x++)
2276 /* dest[x,y] += a[x,n] * b[n,y] */
2277 dest[x*rxstride + y*rystride] +=
2278 abase[x*axstride + n*aystride] *
2279 bbase[n*bxstride + y*bystride];
2281 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2283 const GFC_COMPLEX_10 *restrict bbase_y;
2284 GFC_COMPLEX_10 s;
2286 for (y = 0; y < ycount; y++)
2288 bbase_y = &bbase[y*bystride];
2289 s = (GFC_COMPLEX_10) 0;
2290 for (n = 0; n < count; n++)
2291 s += abase[n*axstride] * bbase_y[n*bxstride];
2292 dest[y*rxstride] = s;
2295 else
2297 const GFC_COMPLEX_10 *restrict abase_x;
2298 const GFC_COMPLEX_10 *restrict bbase_y;
2299 GFC_COMPLEX_10 *restrict dest_y;
2300 GFC_COMPLEX_10 s;
2302 for (y = 0; y < ycount; y++)
2304 bbase_y = &bbase[y*bystride];
2305 dest_y = &dest[y*rystride];
2306 for (x = 0; x < xcount; x++)
2308 abase_x = &abase[x*axstride];
2309 s = (GFC_COMPLEX_10) 0;
2310 for (n = 0; n < count; n++)
2311 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2312 dest_y[x*rxstride] = s;
2317 #undef POW3
2318 #undef min
2319 #undef max
2322 /* Compile the main function, with selection code for the processor. */
2324 /* Currently, this is i386 only. Adjust for other architectures. */
2326 #include <config/i386/cpuinfo.h>
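/* The exported matmul_c10 below picks an implementation on first use:
   it consults __cpu_model (provided via libgcc's cpuinfo) for the best
   ISA-specific variant that was compiled in, defaults to
   matmul_c10_vanilla, and caches the result in matmul_p.  Relaxed
   atomics suffice here because any thread racing through the selection
   computes the same function pointer.  */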
2327 void matmul_c10 (gfc_array_c10 * const restrict retarray,
2328 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
2329 int blas_limit, blas_call gemm)
2331 static void (*matmul_p) (gfc_array_c10 * const restrict retarray,
2332 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
2333 int blas_limit, blas_call gemm);
2335 void (*matmul_fn) (gfc_array_c10 * const restrict retarray,
2336 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
2337 int blas_limit, blas_call gemm);
2339 matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2340 if (matmul_fn == NULL)
2342 matmul_fn = matmul_c10_vanilla;
2343 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2345 /* Run down the available processors in order of preference. */
2346 #ifdef HAVE_AVX512F
2347 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2349 matmul_fn = matmul_c10_avx512f;
2350 goto store;
2353 #endif /* HAVE_AVX512F */
2355 #ifdef HAVE_AVX2
2356 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2357 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2359 matmul_fn = matmul_c10_avx2;
2360 goto store;
2363 #endif
2365 #ifdef HAVE_AVX
2366 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2368 matmul_fn = matmul_c10_avx;
2369 goto store;
2371 #endif /* HAVE_AVX */
2373 else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
2375 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2376 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2377 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2379 matmul_fn = matmul_c10_avx128_fma3;
2380 goto store;
2382 #endif
2383 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2384 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2385 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
2387 matmul_fn = matmul_c10_avx128_fma4;
2388 goto store;
2390 #endif
2393 store:
2394 __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2397 (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2400 #else /* Just the vanilla function. */
2402 void
2403 matmul_c10 (gfc_array_c10 * const restrict retarray,
2404 gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
2405 int blas_limit, blas_call gemm)
2407 const GFC_COMPLEX_10 * restrict abase;
2408 const GFC_COMPLEX_10 * restrict bbase;
2409 GFC_COMPLEX_10 * restrict dest;
2411 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2412 index_type x, y, n, count, xcount, ycount;
2414 assert (GFC_DESCRIPTOR_RANK (a) == 2
2415 || GFC_DESCRIPTOR_RANK (b) == 2);
2417 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2419 Either A or B (but not both) can be rank 1:
2421 o One-dimensional argument A is implicitly treated as a row matrix
2422 dimensioned [1,count], so xcount=1.
2424 o One-dimensional argument B is implicitly treated as a column matrix
2425 dimensioned [count, 1], so ycount=1.
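/* For example, C = MATMUL(A, B) with A of shape (count) and B of
   shape (count, n) is computed as if A were the 1 x count matrix,
   i.e. xcount = 1, ycount = n and, in Fortran terms,
   C(J) = SUM(A(:) * B(:,J)) for J = 1, n.  */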
2428 if (retarray->base_addr == NULL)
2430 if (GFC_DESCRIPTOR_RANK (a) == 1)
2432 GFC_DIMENSION_SET(retarray->dim[0], 0,
2433 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2435 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2437 GFC_DIMENSION_SET(retarray->dim[0], 0,
2438 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2440 else
2442 GFC_DIMENSION_SET(retarray->dim[0], 0,
2443 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2445 GFC_DIMENSION_SET(retarray->dim[1], 0,
2446 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2447 GFC_DESCRIPTOR_EXTENT(retarray,0));
2450 retarray->base_addr
2451 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10));
2452 retarray->offset = 0;
2454 else if (unlikely (compile_options.bounds_check))
2456 index_type ret_extent, arg_extent;
2458 if (GFC_DESCRIPTOR_RANK (a) == 1)
2460 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2461 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2462 if (arg_extent != ret_extent)
2463 runtime_error ("Incorrect extent in return array in"
2464 " MATMUL intrinsic: is %ld, should be %ld",
2465 (long int) ret_extent, (long int) arg_extent);
2467 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2469 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2470 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2471 if (arg_extent != ret_extent)
2472 runtime_error ("Incorrect extent in return array in"
2473 " MATMUL intrinsic: is %ld, should be %ld",
2474 (long int) ret_extent, (long int) arg_extent);
2476 else
2478 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2479 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2480 if (arg_extent != ret_extent)
2481 runtime_error ("Incorrect extent in return array in"
2482 " MATMUL intrinsic for dimension 1:"
2483 " is %ld, should be %ld",
2484 (long int) ret_extent, (long int) arg_extent);
2486 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2487 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2488 if (arg_extent != ret_extent)
2489 runtime_error ("Incorrect extent in return array in"
2490 " MATMUL intrinsic for dimension 2:"
2491 " is %ld, should be %ld",
2492 (long int) ret_extent, (long int) arg_extent);
2497 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2499 /* A one-dimensional result may be addressed in the code below
2500 either as a row or a column matrix. We want both cases to
2501 work. */
2502 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2504 else
2506 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2507 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2511 if (GFC_DESCRIPTOR_RANK (a) == 1)
2513 /* Treat it as a row matrix A[1,count]. */
2514 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2515 aystride = 1;
2517 xcount = 1;
2518 count = GFC_DESCRIPTOR_EXTENT(a,0);
2520 else
2522 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2523 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2525 count = GFC_DESCRIPTOR_EXTENT(a,1);
2526 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2529 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2531 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2532 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2535 if (GFC_DESCRIPTOR_RANK (b) == 1)
2537 /* Treat it as a column matrix B[count,1] */
2538 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2540 /* bystride should never be used for 1-dimensional b.
2541 The value is only used in the calculation of memory
2542 needed for the buffer. */
2543 bystride = 256;
2544 ycount = 1;
2546 else
2548 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2549 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2550 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2553 abase = a->base_addr;
2554 bbase = b->base_addr;
2555 dest = retarray->base_addr;
2557 /* Now that everything is set up, we perform the multiplication
2558 itself. */
2560 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2561 #define min(a,b) ((a) <= (b) ? (a) : (b))
2562 #define max(a,b) ((a) >= (b) ? (a) : (b))
2564 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2565 && (bxstride == 1 || bystride == 1)
2566 && (((float) xcount) * ((float) ycount) * ((float) count)
2567 > POW3(blas_limit)))
2569 const int m = xcount, n = ycount, k = count, ldc = rystride;
2570 const GFC_COMPLEX_10 one = 1, zero = 0;
2571 const int lda = (axstride == 1) ? aystride : axstride,
2572 ldb = (bxstride == 1) ? bystride : bxstride;
2574 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2576 assert (gemm != NULL);
2577 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2578 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2579 &ldc, 1, 1);
2580 return;
2584 if (rxstride == 1 && axstride == 1 && bxstride == 1)
2586 /* This block of code implements a tuned matmul, derived from
2587 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2589 Bo Kagstrom and Per Ling
2590 Department of Computing Science
2591 Umea University
2592 S-901 87 Umea, Sweden
2594 from netlib.org, translated to C, and modified for matmul.m4. */
2596 const GFC_COMPLEX_10 *a, *b;
2597 GFC_COMPLEX_10 *c;
2598 const index_type m = xcount, n = ycount, k = count;
2600 /* System generated locals */
2601 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2602 i1, i2, i3, i4, i5, i6;
2604 /* Local variables */
2605 GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
2606 f13, f14, f23, f24, f33, f34, f43, f44;
2607 index_type i, j, l, ii, jj, ll;
2608 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2609 GFC_COMPLEX_10 *t1;
2611 a = abase;
2612 b = bbase;
2613 c = retarray->base_addr;
2615 /* Parameter adjustments */
2616 c_dim1 = rystride;
2617 c_offset = 1 + c_dim1;
2618 c -= c_offset;
2619 a_dim1 = aystride;
2620 a_offset = 1 + a_dim1;
2621 a -= a_offset;
2622 b_dim1 = bystride;
2623 b_offset = 1 + b_dim1;
2624 b -= b_offset;
2626 /* Empty c first. */
2627 for (j=1; j<=n; j++)
2628 for (i=1; i<=m; i++)
2629 c[i + j * c_dim1] = (GFC_COMPLEX_10)0;
2631 /* Early exit if possible */
2632 if (m == 0 || n == 0 || k == 0)
2633 return;
2635 /* Adjust size of t1 to what is needed. */
2636 index_type t1_dim, a_sz;
2637 if (aystride == 1)
2638 a_sz = rystride;
2639 else
2640 a_sz = a_dim1;
2642 t1_dim = a_sz * 256 + b_dim1;
2643 if (t1_dim > 65536)
2644 t1_dim = 65536;
2646 t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_10));
2648 /* Start turning the crank. */
2649 i1 = n;
2650 for (jj = 1; jj <= i1; jj += 512)
2652 /* Computing MIN */
2653 i2 = 512;
2654 i3 = n - jj + 1;
2655 jsec = min(i2,i3);
2656 ujsec = jsec - jsec % 4;
2657 i2 = k;
2658 for (ll = 1; ll <= i2; ll += 256)
2660 /* Computing MIN */
2661 i3 = 256;
2662 i4 = k - ll + 1;
2663 lsec = min(i3,i4);
2664 ulsec = lsec - lsec % 2;
2666 i3 = m;
2667 for (ii = 1; ii <= i3; ii += 256)
2669 /* Computing MIN */
2670 i4 = 256;
2671 i5 = m - ii + 1;
2672 isec = min(i4,i5);
2673 uisec = isec - isec % 2;
2674 i4 = ll + ulsec - 1;
2675 for (l = ll; l <= i4; l += 2)
2677 i5 = ii + uisec - 1;
2678 for (i = ii; i <= i5; i += 2)
2680 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2681 a[i + l * a_dim1];
2682 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2683 a[i + (l + 1) * a_dim1];
2684 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2685 a[i + 1 + l * a_dim1];
2686 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2687 a[i + 1 + (l + 1) * a_dim1];
2689 if (uisec < isec)
2691 t1[l - ll + 1 + (isec << 8) - 257] =
2692 a[ii + isec - 1 + l * a_dim1];
2693 t1[l - ll + 2 + (isec << 8) - 257] =
2694 a[ii + isec - 1 + (l + 1) * a_dim1];
2697 if (ulsec < lsec)
2699 i4 = ii + isec - 1;
2700 for (i = ii; i<= i4; ++i)
2702 t1[lsec + ((i - ii + 1) << 8) - 257] =
2703 a[i + (ll + lsec - 1) * a_dim1];
2707 uisec = isec - isec % 4;
2708 i4 = jj + ujsec - 1;
2709 for (j = jj; j <= i4; j += 4)
2711 i5 = ii + uisec - 1;
2712 for (i = ii; i <= i5; i += 4)
2714 f11 = c[i + j * c_dim1];
2715 f21 = c[i + 1 + j * c_dim1];
2716 f12 = c[i + (j + 1) * c_dim1];
2717 f22 = c[i + 1 + (j + 1) * c_dim1];
2718 f13 = c[i + (j + 2) * c_dim1];
2719 f23 = c[i + 1 + (j + 2) * c_dim1];
2720 f14 = c[i + (j + 3) * c_dim1];
2721 f24 = c[i + 1 + (j + 3) * c_dim1];
2722 f31 = c[i + 2 + j * c_dim1];
2723 f41 = c[i + 3 + j * c_dim1];
2724 f32 = c[i + 2 + (j + 1) * c_dim1];
2725 f42 = c[i + 3 + (j + 1) * c_dim1];
2726 f33 = c[i + 2 + (j + 2) * c_dim1];
2727 f43 = c[i + 3 + (j + 2) * c_dim1];
2728 f34 = c[i + 2 + (j + 3) * c_dim1];
2729 f44 = c[i + 3 + (j + 3) * c_dim1];
2730 i6 = ll + lsec - 1;
2731 for (l = ll; l <= i6; ++l)
2733 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2734 * b[l + j * b_dim1];
2735 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2736 * b[l + j * b_dim1];
2737 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2738 * b[l + (j + 1) * b_dim1];
2739 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2740 * b[l + (j + 1) * b_dim1];
2741 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2742 * b[l + (j + 2) * b_dim1];
2743 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2744 * b[l + (j + 2) * b_dim1];
2745 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2746 * b[l + (j + 3) * b_dim1];
2747 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2748 * b[l + (j + 3) * b_dim1];
2749 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2750 * b[l + j * b_dim1];
2751 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2752 * b[l + j * b_dim1];
2753 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2754 * b[l + (j + 1) * b_dim1];
2755 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2756 * b[l + (j + 1) * b_dim1];
2757 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2758 * b[l + (j + 2) * b_dim1];
2759 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2760 * b[l + (j + 2) * b_dim1];
2761 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2762 * b[l + (j + 3) * b_dim1];
2763 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2764 * b[l + (j + 3) * b_dim1];
2766 c[i + j * c_dim1] = f11;
2767 c[i + 1 + j * c_dim1] = f21;
2768 c[i + (j + 1) * c_dim1] = f12;
2769 c[i + 1 + (j + 1) * c_dim1] = f22;
2770 c[i + (j + 2) * c_dim1] = f13;
2771 c[i + 1 + (j + 2) * c_dim1] = f23;
2772 c[i + (j + 3) * c_dim1] = f14;
2773 c[i + 1 + (j + 3) * c_dim1] = f24;
2774 c[i + 2 + j * c_dim1] = f31;
2775 c[i + 3 + j * c_dim1] = f41;
2776 c[i + 2 + (j + 1) * c_dim1] = f32;
2777 c[i + 3 + (j + 1) * c_dim1] = f42;
2778 c[i + 2 + (j + 2) * c_dim1] = f33;
2779 c[i + 3 + (j + 2) * c_dim1] = f43;
2780 c[i + 2 + (j + 3) * c_dim1] = f34;
2781 c[i + 3 + (j + 3) * c_dim1] = f44;
2783 if (uisec < isec)
2785 i5 = ii + isec - 1;
2786 for (i = ii + uisec; i <= i5; ++i)
2788 f11 = c[i + j * c_dim1];
2789 f12 = c[i + (j + 1) * c_dim1];
2790 f13 = c[i + (j + 2) * c_dim1];
2791 f14 = c[i + (j + 3) * c_dim1];
2792 i6 = ll + lsec - 1;
2793 for (l = ll; l <= i6; ++l)
2795 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2796 257] * b[l + j * b_dim1];
2797 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2798 257] * b[l + (j + 1) * b_dim1];
2799 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2800 257] * b[l + (j + 2) * b_dim1];
2801 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2802 257] * b[l + (j + 3) * b_dim1];
2804 c[i + j * c_dim1] = f11;
2805 c[i + (j + 1) * c_dim1] = f12;
2806 c[i + (j + 2) * c_dim1] = f13;
2807 c[i + (j + 3) * c_dim1] = f14;
2811 if (ujsec < jsec)
2813 i4 = jj + jsec - 1;
2814 for (j = jj + ujsec; j <= i4; ++j)
2816 i5 = ii + uisec - 1;
2817 for (i = ii; i <= i5; i += 4)
2819 f11 = c[i + j * c_dim1];
2820 f21 = c[i + 1 + j * c_dim1];
2821 f31 = c[i + 2 + j * c_dim1];
2822 f41 = c[i + 3 + j * c_dim1];
2823 i6 = ll + lsec - 1;
2824 for (l = ll; l <= i6; ++l)
2826 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2827 257] * b[l + j * b_dim1];
2828 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2829 257] * b[l + j * b_dim1];
2830 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2831 257] * b[l + j * b_dim1];
2832 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2833 257] * b[l + j * b_dim1];
2835 c[i + j * c_dim1] = f11;
2836 c[i + 1 + j * c_dim1] = f21;
2837 c[i + 2 + j * c_dim1] = f31;
2838 c[i + 3 + j * c_dim1] = f41;
2840 i5 = ii + isec - 1;
2841 for (i = ii + uisec; i <= i5; ++i)
2843 f11 = c[i + j * c_dim1];
2844 i6 = ll + lsec - 1;
2845 for (l = ll; l <= i6; ++l)
2847 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2848 257] * b[l + j * b_dim1];
2850 c[i + j * c_dim1] = f11;
2857 free(t1);
2858 return;
2860 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2862 if (GFC_DESCRIPTOR_RANK (a) != 1)
2864 const GFC_COMPLEX_10 *restrict abase_x;
2865 const GFC_COMPLEX_10 *restrict bbase_y;
2866 GFC_COMPLEX_10 *restrict dest_y;
2867 GFC_COMPLEX_10 s;
2869 for (y = 0; y < ycount; y++)
2871 bbase_y = &bbase[y*bystride];
2872 dest_y = &dest[y*rystride];
2873 for (x = 0; x < xcount; x++)
2875 abase_x = &abase[x*axstride];
2876 s = (GFC_COMPLEX_10) 0;
2877 for (n = 0; n < count; n++)
2878 s += abase_x[n] * bbase_y[n];
2879 dest_y[x] = s;
2883 else
2885 const GFC_COMPLEX_10 *restrict bbase_y;
2886 GFC_COMPLEX_10 s;
2888 for (y = 0; y < ycount; y++)
2890 bbase_y = &bbase[y*bystride];
2891 s = (GFC_COMPLEX_10) 0;
2892 for (n = 0; n < count; n++)
2893 s += abase[n*axstride] * bbase_y[n];
2894 dest[y*rystride] = s;
2898 else if (axstride < aystride)
2900 for (y = 0; y < ycount; y++)
2901 for (x = 0; x < xcount; x++)
2902 dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0;
2904 for (y = 0; y < ycount; y++)
2905 for (n = 0; n < count; n++)
2906 for (x = 0; x < xcount; x++)
2907 /* dest[x,y] += a[x,n] * b[n,y] */
2908 dest[x*rxstride + y*rystride] +=
2909 abase[x*axstride + n*aystride] *
2910 bbase[n*bxstride + y*bystride];
2912 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2914 const GFC_COMPLEX_10 *restrict bbase_y;
2915 GFC_COMPLEX_10 s;
2917 for (y = 0; y < ycount; y++)
2919 bbase_y = &bbase[y*bystride];
2920 s = (GFC_COMPLEX_10) 0;
2921 for (n = 0; n < count; n++)
2922 s += abase[n*axstride] * bbase_y[n*bxstride];
2923 dest[y*rxstride] = s;
2926 else
2928 const GFC_COMPLEX_10 *restrict abase_x;
2929 const GFC_COMPLEX_10 *restrict bbase_y;
2930 GFC_COMPLEX_10 *restrict dest_y;
2931 GFC_COMPLEX_10 s;
2933 for (y = 0; y < ycount; y++)
2935 bbase_y = &bbase[y*bystride];
2936 dest_y = &dest[y*rystride];
2937 for (x = 0; x < xcount; x++)
2939 abase_x = &abase[x*axstride];
2940 s = (GFC_COMPLEX_10) 0;
2941 for (n = 0; n < count; n++)
2942 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2943 dest_y[x*rxstride] = s;
2948 #undef POW3
2949 #undef min
2950 #undef max
2952 #endif
2953 #endif