libgfortran/generated/matmul_c4.c

   1 /* Implementation of the MATMUL intrinsic
   2    Copyright (C) 2002-2018 Free Software Foundation, Inc.
   3    Contributed by Paul Brook <paul@nowt.org>
   4
   5 This file is part of the GNU Fortran runtime library (libgfortran).
   6
   7 Libgfortran is free software; you can redistribute it and/or
   8 modify it under the terms of the GNU General Public
   9 License as published by the Free Software Foundation; either
  10 version 3 of the License, or (at your option) any later version.
  11
  12 Libgfortran is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 Under Section 7 of GPL version 3, you are granted additional
  18 permissions described in the GCC Runtime Library Exception, version
  19 3.1, as published by the Free Software Foundation.
  20
  21 You should have received a copy of the GNU General Public License and
  22 a copy of the GCC Runtime Library Exception along with this program;
  23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24 <http://www.gnu.org/licenses/>.  */
  25
  26 #include "libgfortran.h"
  27 #include <string.h>
  28 #include <assert.h>
  29
  30
  31 #if defined (HAVE_GFC_COMPLEX_4)
  32
  33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
  34    passed to us by the front-end, in which case we call it for large
  35    matrices.  */
  36
  37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
  38                           const int *, const GFC_COMPLEX_4 *, const GFC_COMPLEX_4 *,
  39                           const int *, const GFC_COMPLEX_4 *, const int *,
  40                           const GFC_COMPLEX_4 *, GFC_COMPLEX_4 *, const int *,
  41                           int, int);
  42
  43 /* The order of loops is different in the case of plain matrix
  44    multiplication C=MATMUL(A,B), and in the frequent special case where
  45    the argument A is the temporary result of a TRANSPOSE intrinsic:
  46    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
  47    looking at their strides.
  48
  49    The equivalent Fortran pseudo-code is:
  50
  51    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
  52    IF (.NOT.IS_TRANSPOSED(A)) THEN
  53      C = 0
  54      DO J=1,N
  55        DO K=1,COUNT
  56          DO I=1,M
  57            C(I,J) = C(I,J)+A(I,K)*B(K,J)
  58    ELSE
  59      DO J=1,N
  60        DO I=1,M
  61          S = 0
  62          DO K=1,COUNT
  63            S = S+A(I,K)*B(K,J)
  64          C(I,J) = S
  65    ENDIF
  66 */
  67
  68 /* If try_blas is set to a nonzero value, then the matmul function will
  69    see if there is a way to perform the matrix multiplication by a call
  70    to the BLAS gemm function.  */
  71
  72 extern void matmul_c4 (gfc_array_c4 * const restrict retarray,
  73         gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
  74         int blas_limit, blas_call gemm);
  75 export_proto(matmul_c4);
  76
  77 /* Put exhaustive list of possible architectures here here, ORed together.  */
  78
  79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
  80
  81 #ifdef HAVE_AVX
  82 static void
  83 matmul_c4_avx (gfc_array_c4 * const restrict retarray,
  84         gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
  85         int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
  86 static void
  87 matmul_c4_avx (gfc_array_c4 * const restrict retarray,
  88         gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
  89         int blas_limit, blas_call gemm)
  90 {
  91   const GFC_COMPLEX_4 * restrict abase;
  92   const GFC_COMPLEX_4 * restrict bbase;
  93   GFC_COMPLEX_4 * restrict dest;
  94
  95   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  96   index_type x, y, n, count, xcount, ycount;
  97
  98   assert (GFC_DESCRIPTOR_RANK (a) == 2
  99           || GFC_DESCRIPTOR_RANK (b) == 2);
 100
 101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
 102
 103    Either A or B (but not both) can be rank 1:
 104
 105    o One-dimensional argument A is implicitly treated as a row matrix
 106      dimensioned [1,count], so xcount=1.
 107
 108    o One-dimensional argument B is implicitly treated as a column matrix
 109      dimensioned [count, 1], so ycount=1.
 110 */
 111
 112   if (retarray->base_addr == NULL)
 113     {
 114       if (GFC_DESCRIPTOR_RANK (a) == 1)
 115         {
 116           GFC_DIMENSION_SET(retarray->dim[0], 0,
 117                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 118         }
 119       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 120         {
 121           GFC_DIMENSION_SET(retarray->dim[0], 0,
 122                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 123         }
 124       else
 125         {
 126           GFC_DIMENSION_SET(retarray->dim[0], 0,
 127                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 128
 129           GFC_DIMENSION_SET(retarray->dim[1], 0,
 130                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 131                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 132         }
 133
 134       retarray->base_addr
 135         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
 136       retarray->offset = 0;
 137     }
 138   else if (unlikely (compile_options.bounds_check))
 139     {
 140       index_type ret_extent, arg_extent;
 141
 142       if (GFC_DESCRIPTOR_RANK (a) == 1)
 143         {
 144           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 145           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 146           if (arg_extent != ret_extent)
 147             runtime_error ("Incorrect extent in return array in"
 148                            " MATMUL intrinsic: is %ld, should be %ld",
 149                            (long int) ret_extent, (long int) arg_extent);
 150         }
 151       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 152         {
 153           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 154           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 155           if (arg_extent != ret_extent)
 156             runtime_error ("Incorrect extent in return array in"
 157                            " MATMUL intrinsic: is %ld, should be %ld",
 158                            (long int) ret_extent, (long int) arg_extent);
 159         }
 160       else
 161         {
 162           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 163           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 164           if (arg_extent != ret_extent)
 165             runtime_error ("Incorrect extent in return array in"
 166                            " MATMUL intrinsic for dimension 1:"
 167                            " is %ld, should be %ld",
 168                            (long int) ret_extent, (long int) arg_extent);
 169
 170           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 171           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 172           if (arg_extent != ret_extent)
 173             runtime_error ("Incorrect extent in return array in"
 174                            " MATMUL intrinsic for dimension 2:"
 175                            " is %ld, should be %ld",
 176                            (long int) ret_extent, (long int) arg_extent);
 177         }
 178     }
 179
 180
 181   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 182     {
 183       /* One-dimensional result may be addressed in the code below
 184          either as a row or a column matrix. We want both cases to
 185          work. */
 186       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 187     }
 188   else
 189     {
 190       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 191       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 192     }
 193
 194
 195   if (GFC_DESCRIPTOR_RANK (a) == 1)
 196     {
 197       /* Treat it as a a row matrix A[1,count]. */
 198       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 199       aystride = 1;
 200
 201       xcount = 1;
 202       count = GFC_DESCRIPTOR_EXTENT(a,0);
 203     }
 204   else
 205     {
 206       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 207       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 208
 209       count = GFC_DESCRIPTOR_EXTENT(a,1);
 210       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 211     }
 212
 213   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 214     {
 215       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 216         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
 217     }
 218
 219   if (GFC_DESCRIPTOR_RANK (b) == 1)
 220     {
 221       /* Treat it as a column matrix B[count,1] */
 222       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 223
 224       /* bystride should never be used for 1-dimensional b.
 225          The value is only used for calculation of the
 226          memory by the buffer.  */
 227       bystride = 256;
 228       ycount = 1;
 229     }
 230   else
 231     {
 232       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 233       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 234       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 235     }
 236
 237   abase = a->base_addr;
 238   bbase = b->base_addr;
 239   dest = retarray->base_addr;
 240
 241   /* Now that everything is set up, we perform the multiplication
 242      itself.  */
 243
 244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 245 #define min(a,b) ((a) <= (b) ? (a) : (b))
 246 #define max(a,b) ((a) >= (b) ? (a) : (b))
 247
 248   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 249       && (bxstride == 1 || bystride == 1)
 250       && (((float) xcount) * ((float) ycount) * ((float) count)
 251           > POW3(blas_limit)))
 252     {
 253       const int m = xcount, n = ycount, k = count, ldc = rystride;
 254       const GFC_COMPLEX_4 one = 1, zero = 0;
 255       const int lda = (axstride == 1) ? aystride : axstride,
 256                 ldb = (bxstride == 1) ? bystride : bxstride;
 257
 258       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 259         {
 260           assert (gemm != NULL);
 261           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
 262                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 263                 &ldc, 1, 1);
 264           return;
 265         }
 266     }
 267
 268   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 269     {
 270       /* This block of code implements a tuned matmul, derived from
 271          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 272
 273                Bo Kagstrom and Per Ling
 274                Department of Computing Science
 275                Umea University
 276                S-901 87 Umea, Sweden
 277
 278          from netlib.org, translated to C, and modified for matmul.m4.  */
 279
 280       const GFC_COMPLEX_4 *a, *b;
 281       GFC_COMPLEX_4 *c;
 282       const index_type m = xcount, n = ycount, k = count;
 283
 284       /* System generated locals */
 285       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 286                  i1, i2, i3, i4, i5, i6;
 287
 288       /* Local variables */
 289       GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
 290                  f13, f14, f23, f24, f33, f34, f43, f44;
 291       index_type i, j, l, ii, jj, ll;
 292       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 293       GFC_COMPLEX_4 *t1;
 294
 295       a = abase;
 296       b = bbase;
 297       c = retarray->base_addr;
 298
 299       /* Parameter adjustments */
 300       c_dim1 = rystride;
 301       c_offset = 1 + c_dim1;
 302       c -= c_offset;
 303       a_dim1 = aystride;
 304       a_offset = 1 + a_dim1;
 305       a -= a_offset;
 306       b_dim1 = bystride;
 307       b_offset = 1 + b_dim1;
 308       b -= b_offset;
 309
 310       /* Empty c first.  */
 311       for (j=1; j<=n; j++)
 312         for (i=1; i<=m; i++)
 313           c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
 314
 315       /* Early exit if possible */
 316       if (m == 0 || n == 0 || k == 0)
 317         return;
 318
 319       /* Adjust size of t1 to what is needed.  */
 320       index_type t1_dim;
 321       t1_dim = (a_dim1-1) * 256 + b_dim1;
 322       if (t1_dim > 65536)
 323         t1_dim = 65536;
 324
 325       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
 326
 327       /* Start turning the crank. */
 328       i1 = n;
 329       for (jj = 1; jj <= i1; jj += 512)
 330         {
 331           /* Computing MIN */
 332           i2 = 512;
 333           i3 = n - jj + 1;
 334           jsec = min(i2,i3);
 335           ujsec = jsec - jsec % 4;
 336           i2 = k;
 337           for (ll = 1; ll <= i2; ll += 256)
 338             {
 339               /* Computing MIN */
 340               i3 = 256;
 341               i4 = k - ll + 1;
 342               lsec = min(i3,i4);
 343               ulsec = lsec - lsec % 2;
 344
 345               i3 = m;
 346               for (ii = 1; ii <= i3; ii += 256)
 347                 {
 348                   /* Computing MIN */
 349                   i4 = 256;
 350                   i5 = m - ii + 1;
 351                   isec = min(i4,i5);
 352                   uisec = isec - isec % 2;
 353                   i4 = ll + ulsec - 1;
 354                   for (l = ll; l <= i4; l += 2)
 355                     {
 356                       i5 = ii + uisec - 1;
 357                       for (i = ii; i <= i5; i += 2)
 358                         {
 359                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 360                                         a[i + l * a_dim1];
 361                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 362                                         a[i + (l + 1) * a_dim1];
 363                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 364                                         a[i + 1 + l * a_dim1];
 365                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 366                                         a[i + 1 + (l + 1) * a_dim1];
 367                         }
 368                       if (uisec < isec)
 369                         {
 370                           t1[l - ll + 1 + (isec << 8) - 257] =
 371                                     a[ii + isec - 1 + l * a_dim1];
 372                           t1[l - ll + 2 + (isec << 8) - 257] =
 373                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 374                         }
 375                     }
 376                   if (ulsec < lsec)
 377                     {
 378                       i4 = ii + isec - 1;
 379                       for (i = ii; i<= i4; ++i)
 380                         {
 381                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 382                                     a[i + (ll + lsec - 1) * a_dim1];
 383                         }
 384                     }
 385
 386                   uisec = isec - isec % 4;
 387                   i4 = jj + ujsec - 1;
 388                   for (j = jj; j <= i4; j += 4)
 389                     {
 390                       i5 = ii + uisec - 1;
 391                       for (i = ii; i <= i5; i += 4)
 392                         {
 393                           f11 = c[i + j * c_dim1];
 394                           f21 = c[i + 1 + j * c_dim1];
 395                           f12 = c[i + (j + 1) * c_dim1];
 396                           f22 = c[i + 1 + (j + 1) * c_dim1];
 397                           f13 = c[i + (j + 2) * c_dim1];
 398                           f23 = c[i + 1 + (j + 2) * c_dim1];
 399                           f14 = c[i + (j + 3) * c_dim1];
 400                           f24 = c[i + 1 + (j + 3) * c_dim1];
 401                           f31 = c[i + 2 + j * c_dim1];
 402                           f41 = c[i + 3 + j * c_dim1];
 403                           f32 = c[i + 2 + (j + 1) * c_dim1];
 404                           f42 = c[i + 3 + (j + 1) * c_dim1];
 405                           f33 = c[i + 2 + (j + 2) * c_dim1];
 406                           f43 = c[i + 3 + (j + 2) * c_dim1];
 407                           f34 = c[i + 2 + (j + 3) * c_dim1];
 408                           f44 = c[i + 3 + (j + 3) * c_dim1];
 409                           i6 = ll + lsec - 1;
 410                           for (l = ll; l <= i6; ++l)
 411                             {
 412                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 413                                       * b[l + j * b_dim1];
 414                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 415                                       * b[l + j * b_dim1];
 416                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 417                                       * b[l + (j + 1) * b_dim1];
 418                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 419                                       * b[l + (j + 1) * b_dim1];
 420                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 421                                       * b[l + (j + 2) * b_dim1];
 422                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 423                                       * b[l + (j + 2) * b_dim1];
 424                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 425                                       * b[l + (j + 3) * b_dim1];
 426                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 427                                       * b[l + (j + 3) * b_dim1];
 428                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 429                                       * b[l + j * b_dim1];
 430                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 431                                       * b[l + j * b_dim1];
 432                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 433                                       * b[l + (j + 1) * b_dim1];
 434                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 435                                       * b[l + (j + 1) * b_dim1];
 436                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 437                                       * b[l + (j + 2) * b_dim1];
 438                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 439                                       * b[l + (j + 2) * b_dim1];
 440                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 441                                       * b[l + (j + 3) * b_dim1];
 442                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 443                                       * b[l + (j + 3) * b_dim1];
 444                             }
 445                           c[i + j * c_dim1] = f11;
 446                           c[i + 1 + j * c_dim1] = f21;
 447                           c[i + (j + 1) * c_dim1] = f12;
 448                           c[i + 1 + (j + 1) * c_dim1] = f22;
 449                           c[i + (j + 2) * c_dim1] = f13;
 450                           c[i + 1 + (j + 2) * c_dim1] = f23;
 451                           c[i + (j + 3) * c_dim1] = f14;
 452                           c[i + 1 + (j + 3) * c_dim1] = f24;
 453                           c[i + 2 + j * c_dim1] = f31;
 454                           c[i + 3 + j * c_dim1] = f41;
 455                           c[i + 2 + (j + 1) * c_dim1] = f32;
 456                           c[i + 3 + (j + 1) * c_dim1] = f42;
 457                           c[i + 2 + (j + 2) * c_dim1] = f33;
 458                           c[i + 3 + (j + 2) * c_dim1] = f43;
 459                           c[i + 2 + (j + 3) * c_dim1] = f34;
 460                           c[i + 3 + (j + 3) * c_dim1] = f44;
 461                         }
 462                       if (uisec < isec)
 463                         {
 464                           i5 = ii + isec - 1;
 465                           for (i = ii + uisec; i <= i5; ++i)
 466                             {
 467                               f11 = c[i + j * c_dim1];
 468                               f12 = c[i + (j + 1) * c_dim1];
 469                               f13 = c[i + (j + 2) * c_dim1];
 470                               f14 = c[i + (j + 3) * c_dim1];
 471                               i6 = ll + lsec - 1;
 472                               for (l = ll; l <= i6; ++l)
 473                                 {
 474                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 475                                           257] * b[l + j * b_dim1];
 476                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 477                                           257] * b[l + (j + 1) * b_dim1];
 478                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 479                                           257] * b[l + (j + 2) * b_dim1];
 480                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 481                                           257] * b[l + (j + 3) * b_dim1];
 482                                 }
 483                               c[i + j * c_dim1] = f11;
 484                               c[i + (j + 1) * c_dim1] = f12;
 485                               c[i + (j + 2) * c_dim1] = f13;
 486                               c[i + (j + 3) * c_dim1] = f14;
 487                             }
 488                         }
 489                     }
 490                   if (ujsec < jsec)
 491                     {
 492                       i4 = jj + jsec - 1;
 493                       for (j = jj + ujsec; j <= i4; ++j)
 494                         {
 495                           i5 = ii + uisec - 1;
 496                           for (i = ii; i <= i5; i += 4)
 497                             {
 498                               f11 = c[i + j * c_dim1];
 499                               f21 = c[i + 1 + j * c_dim1];
 500                               f31 = c[i + 2 + j * c_dim1];
 501                               f41 = c[i + 3 + j * c_dim1];
 502                               i6 = ll + lsec - 1;
 503                               for (l = ll; l <= i6; ++l)
 504                                 {
 505                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 506                                           257] * b[l + j * b_dim1];
 507                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
 508                                           257] * b[l + j * b_dim1];
 509                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
 510                                           257] * b[l + j * b_dim1];
 511                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
 512                                           257] * b[l + j * b_dim1];
 513                                 }
 514                               c[i + j * c_dim1] = f11;
 515                               c[i + 1 + j * c_dim1] = f21;
 516                               c[i + 2 + j * c_dim1] = f31;
 517                               c[i + 3 + j * c_dim1] = f41;
 518                             }
 519                           i5 = ii + isec - 1;
 520                           for (i = ii + uisec; i <= i5; ++i)
 521                             {
 522                               f11 = c[i + j * c_dim1];
 523                               i6 = ll + lsec - 1;
 524                               for (l = ll; l <= i6; ++l)
 525                                 {
 526                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 527                                           257] * b[l + j * b_dim1];
 528                                 }
 529                               c[i + j * c_dim1] = f11;
 530                             }
 531                         }
 532                     }
 533                 }
 534             }
 535         }
 536       free(t1);
 537       return;
 538     }
 539   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
 540     {
 541       if (GFC_DESCRIPTOR_RANK (a) != 1)
 542         {
 543           const GFC_COMPLEX_4 *restrict abase_x;
 544           const GFC_COMPLEX_4 *restrict bbase_y;
 545           GFC_COMPLEX_4 *restrict dest_y;
 546           GFC_COMPLEX_4 s;
 547
 548           for (y = 0; y < ycount; y++)
 549             {
 550               bbase_y = &bbase[y*bystride];
 551               dest_y = &dest[y*rystride];
 552               for (x = 0; x < xcount; x++)
 553                 {
 554                   abase_x = &abase[x*axstride];
 555                   s = (GFC_COMPLEX_4) 0;
 556                   for (n = 0; n < count; n++)
 557                     s += abase_x[n] * bbase_y[n];
 558                   dest_y[x] = s;
 559                 }
 560             }
 561         }
 562       else
 563         {
 564           const GFC_COMPLEX_4 *restrict bbase_y;
 565           GFC_COMPLEX_4 s;
 566
 567           for (y = 0; y < ycount; y++)
 568             {
 569               bbase_y = &bbase[y*bystride];
 570               s = (GFC_COMPLEX_4) 0;
 571               for (n = 0; n < count; n++)
 572                 s += abase[n*axstride] * bbase_y[n];
 573               dest[y*rystride] = s;
 574             }
 575         }
 576     }
 577   else if (axstride < aystride)
 578     {
 579       for (y = 0; y < ycount; y++)
 580         for (x = 0; x < xcount; x++)
 581           dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
 582
 583       for (y = 0; y < ycount; y++)
 584         for (n = 0; n < count; n++)
 585           for (x = 0; x < xcount; x++)
 586             /* dest[x,y] += a[x,n] * b[n,y] */
 587             dest[x*rxstride + y*rystride] +=
 588                                         abase[x*axstride + n*aystride] *
 589                                         bbase[n*bxstride + y*bystride];
 590     }
 591   else if (GFC_DESCRIPTOR_RANK (a) == 1)
 592     {
 593       const GFC_COMPLEX_4 *restrict bbase_y;
 594       GFC_COMPLEX_4 s;
 595
 596       for (y = 0; y < ycount; y++)
 597         {
 598           bbase_y = &bbase[y*bystride];
 599           s = (GFC_COMPLEX_4) 0;
 600           for (n = 0; n < count; n++)
 601             s += abase[n*axstride] * bbase_y[n*bxstride];
 602           dest[y*rxstride] = s;
 603         }
 604     }
 605   else
 606     {
 607       const GFC_COMPLEX_4 *restrict abase_x;
 608       const GFC_COMPLEX_4 *restrict bbase_y;
 609       GFC_COMPLEX_4 *restrict dest_y;
 610       GFC_COMPLEX_4 s;
 611
 612       for (y = 0; y < ycount; y++)
 613         {
 614           bbase_y = &bbase[y*bystride];
 615           dest_y = &dest[y*rystride];
 616           for (x = 0; x < xcount; x++)
 617             {
 618               abase_x = &abase[x*axstride];
 619               s = (GFC_COMPLEX_4) 0;
 620               for (n = 0; n < count; n++)
 621                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
 622               dest_y[x*rxstride] = s;
 623             }
 624         }
 625     }
 626 }
 627 #undef POW3
 628 #undef min
 629 #undef max
 630
 631 #endif /* HAVE_AVX */
 632
 633 #ifdef HAVE_AVX2
 634 static void
 635 matmul_c4_avx2 (gfc_array_c4 * const restrict retarray,
 636         gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
 637         int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 638 static void
 639 matmul_c4_avx2 (gfc_array_c4 * const restrict retarray,
 640         gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
 641         int blas_limit, blas_call gemm)
 642 {
 643   const GFC_COMPLEX_4 * restrict abase;
 644   const GFC_COMPLEX_4 * restrict bbase;
 645   GFC_COMPLEX_4 * restrict dest;
 646
 647   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
 648   index_type x, y, n, count, xcount, ycount;
 649
 650   assert (GFC_DESCRIPTOR_RANK (a) == 2
 651           || GFC_DESCRIPTOR_RANK (b) == 2);
 652
 653 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
 654
 655    Either A or B (but not both) can be rank 1:
 656
 657    o One-dimensional argument A is implicitly treated as a row matrix
 658      dimensioned [1,count], so xcount=1.
 659
 660    o One-dimensional argument B is implicitly treated as a column matrix
 661      dimensioned [count, 1], so ycount=1.
 662 */
 663
 664   if (retarray->base_addr == NULL)
 665     {
 666       if (GFC_DESCRIPTOR_RANK (a) == 1)
 667         {
 668           GFC_DIMENSION_SET(retarray->dim[0], 0,
 669                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 670         }
 671       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 672         {
 673           GFC_DIMENSION_SET(retarray->dim[0], 0,
 674                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 675         }
 676       else
 677         {
 678           GFC_DIMENSION_SET(retarray->dim[0], 0,
 679                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 680
 681           GFC_DIMENSION_SET(retarray->dim[1], 0,
 682                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 683                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 684         }
 685
 686       retarray->base_addr
 687         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
 688       retarray->offset = 0;
 689     }
 690   else if (unlikely (compile_options.bounds_check))
 691     {
 692       index_type ret_extent, arg_extent;
 693
 694       if (GFC_DESCRIPTOR_RANK (a) == 1)
 695         {
 696           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 697           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 698           if (arg_extent != ret_extent)
 699             runtime_error ("Incorrect extent in return array in"
 700                            " MATMUL intrinsic: is %ld, should be %ld",
 701                            (long int) ret_extent, (long int) arg_extent);
 702         }
 703       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 704         {
 705           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 706           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 707           if (arg_extent != ret_extent)
 708             runtime_error ("Incorrect extent in return array in"
 709                            " MATMUL intrinsic: is %ld, should be %ld",
 710                            (long int) ret_extent, (long int) arg_extent);
 711         }
 712       else
 713         {
 714           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 715           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 716           if (arg_extent != ret_extent)
 717             runtime_error ("Incorrect extent in return array in"
 718                            " MATMUL intrinsic for dimension 1:"
 719                            " is %ld, should be %ld",
 720                            (long int) ret_extent, (long int) arg_extent);
 721
 722           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 723           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 724           if (arg_extent != ret_extent)
 725             runtime_error ("Incorrect extent in return array in"
 726                            " MATMUL intrinsic for dimension 2:"
 727                            " is %ld, should be %ld",
 728                            (long int) ret_extent, (long int) arg_extent);
 729         }
 730     }
 731
 732
 733   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 734     {
 735       /* One-dimensional result may be addressed in the code below
 736          either as a row or a column matrix. We want both cases to
 737          work. */
 738       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 739     }
 740   else
 741     {
 742       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 743       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 744     }
 745
 746
 747   if (GFC_DESCRIPTOR_RANK (a) == 1)
 748     {
 749       /* Treat it as a a row matrix A[1,count]. */
 750       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 751       aystride = 1;
 752
 753       xcount = 1;
 754       count = GFC_DESCRIPTOR_EXTENT(a,0);
 755     }
 756   else
 757     {
 758       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 759       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 760
 761       count = GFC_DESCRIPTOR_EXTENT(a,1);
 762       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 763     }
 764
 765   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 766     {
 767       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 768         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
 769     }
 770
 771   if (GFC_DESCRIPTOR_RANK (b) == 1)
 772     {
 773       /* Treat it as a column matrix B[count,1] */
 774       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 775
 776       /* bystride should never be used for 1-dimensional b.
 777          The value is only used for calculation of the
 778          memory by the buffer.  */
 779       bystride = 256;
 780       ycount = 1;
 781     }
 782   else
 783     {
 784       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 785       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 786       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 787     }
 788
 789   abase = a->base_addr;
 790   bbase = b->base_addr;
 791   dest = retarray->base_addr;
 792
 793   /* Now that everything is set up, we perform the multiplication
 794      itself.  */
 795
 796 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 797 #define min(a,b) ((a) <= (b) ? (a) : (b))
 798 #define max(a,b) ((a) >= (b) ? (a) : (b))
 799
 800   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 801       && (bxstride == 1 || bystride == 1)
 802       && (((float) xcount) * ((float) ycount) * ((float) count)
 803           > POW3(blas_limit)))
 804     {
 805       const int m = xcount, n = ycount, k = count, ldc = rystride;
 806       const GFC_COMPLEX_4 one = 1, zero = 0;
 807       const int lda = (axstride == 1) ? aystride : axstride,
 808                 ldb = (bxstride == 1) ? bystride : bxstride;
 809
 810       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 811         {
 812           assert (gemm != NULL);
 813           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
 814                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 815                 &ldc, 1, 1);
 816           return;
 817         }
 818     }
 819
 820   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 821     {
 822       /* This block of code implements a tuned matmul, derived from
 823          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 824
 825                Bo Kagstrom and Per Ling
 826                Department of Computing Science
 827                Umea University
 828                S-901 87 Umea, Sweden
 829
 830          from netlib.org, translated to C, and modified for matmul.m4.  */
 831
 832       const GFC_COMPLEX_4 *a, *b;
 833       GFC_COMPLEX_4 *c;
 834       const index_type m = xcount, n = ycount, k = count;
 835
 836       /* System generated locals */
 837       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 838                  i1, i2, i3, i4, i5, i6;
 839
 840       /* Local variables */
 841       GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
 842                  f13, f14, f23, f24, f33, f34, f43, f44;
 843       index_type i, j, l, ii, jj, ll;
 844       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 845       GFC_COMPLEX_4 *t1;
 846
 847       a = abase;
 848       b = bbase;
 849       c = retarray->base_addr;
 850
 851       /* Parameter adjustments */
 852       c_dim1 = rystride;
 853       c_offset = 1 + c_dim1;
 854       c -= c_offset;
 855       a_dim1 = aystride;
 856       a_offset = 1 + a_dim1;
 857       a -= a_offset;
 858       b_dim1 = bystride;
 859       b_offset = 1 + b_dim1;
 860       b -= b_offset;
 861
 862       /* Empty c first.  */
 863       for (j=1; j<=n; j++)
 864         for (i=1; i<=m; i++)
 865           c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
 866
 867       /* Early exit if possible */
 868       if (m == 0 || n == 0 || k == 0)
 869         return;
 870
 871       /* Adjust size of t1 to what is needed.  */
 872       index_type t1_dim;
 873       t1_dim = (a_dim1-1) * 256 + b_dim1;
 874       if (t1_dim > 65536)
 875         t1_dim = 65536;
 876
 877       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
 878
 879       /* Start turning the crank. */
 880       i1 = n;
 881       for (jj = 1; jj <= i1; jj += 512)
 882         {
 883           /* Computing MIN */
 884           i2 = 512;
 885           i3 = n - jj + 1;
 886           jsec = min(i2,i3);
 887           ujsec = jsec - jsec % 4;
 888           i2 = k;
 889           for (ll = 1; ll <= i2; ll += 256)
 890             {
 891               /* Computing MIN */
 892               i3 = 256;
 893               i4 = k - ll + 1;
 894               lsec = min(i3,i4);
 895               ulsec = lsec - lsec % 2;
 896
 897               i3 = m;
 898               for (ii = 1; ii <= i3; ii += 256)
 899                 {
 900                   /* Computing MIN */
 901                   i4 = 256;
 902                   i5 = m - ii + 1;
 903                   isec = min(i4,i5);
 904                   uisec = isec - isec % 2;
 905                   i4 = ll + ulsec - 1;
 906                   for (l = ll; l <= i4; l += 2)
 907                     {
 908                       i5 = ii + uisec - 1;
 909                       for (i = ii; i <= i5; i += 2)
 910                         {
 911                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 912                                         a[i + l * a_dim1];
 913                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 914                                         a[i + (l + 1) * a_dim1];
 915                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 916                                         a[i + 1 + l * a_dim1];
 917                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 918                                         a[i + 1 + (l + 1) * a_dim1];
 919                         }
 920                       if (uisec < isec)
 921                         {
 922                           t1[l - ll + 1 + (isec << 8) - 257] =
 923                                     a[ii + isec - 1 + l * a_dim1];
 924                           t1[l - ll + 2 + (isec << 8) - 257] =
 925                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 926                         }
 927                     }
 928                   if (ulsec < lsec)
 929                     {
 930                       i4 = ii + isec - 1;
 931                       for (i = ii; i<= i4; ++i)
 932                         {
 933                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 934                                     a[i + (ll + lsec - 1) * a_dim1];
 935                         }
 936                     }
 937
 938                   uisec = isec - isec % 4;
 939                   i4 = jj + ujsec - 1;
 940                   for (j = jj; j <= i4; j += 4)
 941                     {
 942                       i5 = ii + uisec - 1;
 943                       for (i = ii; i <= i5; i += 4)
 944                         {
 945                           f11 = c[i + j * c_dim1];
 946                           f21 = c[i + 1 + j * c_dim1];
 947                           f12 = c[i + (j + 1) * c_dim1];
 948                           f22 = c[i + 1 + (j + 1) * c_dim1];
 949                           f13 = c[i + (j + 2) * c_dim1];
 950                           f23 = c[i + 1 + (j + 2) * c_dim1];
 951                           f14 = c[i + (j + 3) * c_dim1];
 952                           f24 = c[i + 1 + (j + 3) * c_dim1];
 953                           f31 = c[i + 2 + j * c_dim1];
 954                           f41 = c[i + 3 + j * c_dim1];
 955                           f32 = c[i + 2 + (j + 1) * c_dim1];
 956                           f42 = c[i + 3 + (j + 1) * c_dim1];
 957                           f33 = c[i + 2 + (j + 2) * c_dim1];
 958                           f43 = c[i + 3 + (j + 2) * c_dim1];
 959                           f34 = c[i + 2 + (j + 3) * c_dim1];
 960                           f44 = c[i + 3 + (j + 3) * c_dim1];
 961                           i6 = ll + lsec - 1;
 962                           for (l = ll; l <= i6; ++l)
 963                             {
 964                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 965                                       * b[l + j * b_dim1];
 966                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 967                                       * b[l + j * b_dim1];
 968                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 969                                       * b[l + (j + 1) * b_dim1];
 970                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 971                                       * b[l + (j + 1) * b_dim1];
 972                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 973                                       * b[l + (j + 2) * b_dim1];
 974                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 975                                       * b[l + (j + 2) * b_dim1];
 976                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 977                                       * b[l + (j + 3) * b_dim1];
 978                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 979                                       * b[l + (j + 3) * b_dim1];
 980                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 981                                       * b[l + j * b_dim1];
 982                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 983                                       * b[l + j * b_dim1];
 984                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 985                                       * b[l + (j + 1) * b_dim1];
 986                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 987                                       * b[l + (j + 1) * b_dim1];
 988                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 989                                       * b[l + (j + 2) * b_dim1];
 990                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 991                                       * b[l + (j + 2) * b_dim1];
 992                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 993                                       * b[l + (j + 3) * b_dim1];
 994                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 995                                       * b[l + (j + 3) * b_dim1];
 996                             }
 997                           c[i + j * c_dim1] = f11;
 998                           c[i + 1 + j * c_dim1] = f21;
 999                           c[i + (j + 1) * c_dim1] = f12;
1000                           c[i + 1 + (j + 1) * c_dim1] = f22;
1001                           c[i + (j + 2) * c_dim1] = f13;
1002                           c[i + 1 + (j + 2) * c_dim1] = f23;
1003                           c[i + (j + 3) * c_dim1] = f14;
1004                           c[i + 1 + (j + 3) * c_dim1] = f24;
1005                           c[i + 2 + j * c_dim1] = f31;
1006                           c[i + 3 + j * c_dim1] = f41;
1007                           c[i + 2 + (j + 1) * c_dim1] = f32;
1008                           c[i + 3 + (j + 1) * c_dim1] = f42;
1009                           c[i + 2 + (j + 2) * c_dim1] = f33;
1010                           c[i + 3 + (j + 2) * c_dim1] = f43;
1011                           c[i + 2 + (j + 3) * c_dim1] = f34;
1012                           c[i + 3 + (j + 3) * c_dim1] = f44;
1013                         }
1014                       if (uisec < isec)
1015                         {
1016                           i5 = ii + isec - 1;
1017                           for (i = ii + uisec; i <= i5; ++i)
1018                             {
1019                               f11 = c[i + j * c_dim1];
1020                               f12 = c[i + (j + 1) * c_dim1];
1021                               f13 = c[i + (j + 2) * c_dim1];
1022                               f14 = c[i + (j + 3) * c_dim1];
1023                               i6 = ll + lsec - 1;
1024                               for (l = ll; l <= i6; ++l)
1025                                 {
1026                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1027                                           257] * b[l + j * b_dim1];
1028                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1029                                           257] * b[l + (j + 1) * b_dim1];
1030                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1031                                           257] * b[l + (j + 2) * b_dim1];
1032                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1033                                           257] * b[l + (j + 3) * b_dim1];
1034                                 }
1035                               c[i + j * c_dim1] = f11;
1036                               c[i + (j + 1) * c_dim1] = f12;
1037                               c[i + (j + 2) * c_dim1] = f13;
1038                               c[i + (j + 3) * c_dim1] = f14;
1039                             }
1040                         }
1041                     }
1042                   if (ujsec < jsec)
1043                     {
1044                       i4 = jj + jsec - 1;
1045                       for (j = jj + ujsec; j <= i4; ++j)
1046                         {
1047                           i5 = ii + uisec - 1;
1048                           for (i = ii; i <= i5; i += 4)
1049                             {
1050                               f11 = c[i + j * c_dim1];
1051                               f21 = c[i + 1 + j * c_dim1];
1052                               f31 = c[i + 2 + j * c_dim1];
1053                               f41 = c[i + 3 + j * c_dim1];
1054                               i6 = ll + lsec - 1;
1055                               for (l = ll; l <= i6; ++l)
1056                                 {
1057                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1058                                           257] * b[l + j * b_dim1];
1059                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1060                                           257] * b[l + j * b_dim1];
1061                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1062                                           257] * b[l + j * b_dim1];
1063                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1064                                           257] * b[l + j * b_dim1];
1065                                 }
1066                               c[i + j * c_dim1] = f11;
1067                               c[i + 1 + j * c_dim1] = f21;
1068                               c[i + 2 + j * c_dim1] = f31;
1069                               c[i + 3 + j * c_dim1] = f41;
1070                             }
1071                           i5 = ii + isec - 1;
1072                           for (i = ii + uisec; i <= i5; ++i)
1073                             {
1074                               f11 = c[i + j * c_dim1];
1075                               i6 = ll + lsec - 1;
1076                               for (l = ll; l <= i6; ++l)
1077                                 {
1078                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1079                                           257] * b[l + j * b_dim1];
1080                                 }
1081                               c[i + j * c_dim1] = f11;
1082                             }
1083                         }
1084                     }
1085                 }
1086             }
1087         }
1088       free(t1);
1089       return;
1090     }
1091   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1092     {
1093       if (GFC_DESCRIPTOR_RANK (a) != 1)
1094         {
1095           const GFC_COMPLEX_4 *restrict abase_x;
1096           const GFC_COMPLEX_4 *restrict bbase_y;
1097           GFC_COMPLEX_4 *restrict dest_y;
1098           GFC_COMPLEX_4 s;
1099
1100           for (y = 0; y < ycount; y++)
1101             {
1102               bbase_y = &bbase[y*bystride];
1103               dest_y = &dest[y*rystride];
1104               for (x = 0; x < xcount; x++)
1105                 {
1106                   abase_x = &abase[x*axstride];
1107                   s = (GFC_COMPLEX_4) 0;
1108                   for (n = 0; n < count; n++)
1109                     s += abase_x[n] * bbase_y[n];
1110                   dest_y[x] = s;
1111                 }
1112             }
1113         }
1114       else
1115         {
1116           const GFC_COMPLEX_4 *restrict bbase_y;
1117           GFC_COMPLEX_4 s;
1118
1119           for (y = 0; y < ycount; y++)
1120             {
1121               bbase_y = &bbase[y*bystride];
1122               s = (GFC_COMPLEX_4) 0;
1123               for (n = 0; n < count; n++)
1124                 s += abase[n*axstride] * bbase_y[n];
1125               dest[y*rystride] = s;
1126             }
1127         }
1128     }
1129   else if (axstride < aystride)
1130     {
1131       for (y = 0; y < ycount; y++)
1132         for (x = 0; x < xcount; x++)
1133           dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
1134
1135       for (y = 0; y < ycount; y++)
1136         for (n = 0; n < count; n++)
1137           for (x = 0; x < xcount; x++)
1138             /* dest[x,y] += a[x,n] * b[n,y] */
1139             dest[x*rxstride + y*rystride] +=
1140                                         abase[x*axstride + n*aystride] *
1141                                         bbase[n*bxstride + y*bystride];
1142     }
1143   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1144     {
1145       const GFC_COMPLEX_4 *restrict bbase_y;
1146       GFC_COMPLEX_4 s;
1147
1148       for (y = 0; y < ycount; y++)
1149         {
1150           bbase_y = &bbase[y*bystride];
1151           s = (GFC_COMPLEX_4) 0;
1152           for (n = 0; n < count; n++)
1153             s += abase[n*axstride] * bbase_y[n*bxstride];
1154           dest[y*rxstride] = s;
1155         }
1156     }
1157   else
1158     {
1159       const GFC_COMPLEX_4 *restrict abase_x;
1160       const GFC_COMPLEX_4 *restrict bbase_y;
1161       GFC_COMPLEX_4 *restrict dest_y;
1162       GFC_COMPLEX_4 s;
1163
1164       for (y = 0; y < ycount; y++)
1165         {
1166           bbase_y = &bbase[y*bystride];
1167           dest_y = &dest[y*rystride];
1168           for (x = 0; x < xcount; x++)
1169             {
1170               abase_x = &abase[x*axstride];
1171               s = (GFC_COMPLEX_4) 0;
1172               for (n = 0; n < count; n++)
1173                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1174               dest_y[x*rxstride] = s;
1175             }
1176         }
1177     }
1178 }
1179 #undef POW3
1180 #undef min
1181 #undef max
1182
1183 #endif /* HAVE_AVX2 */
1184
1185 #ifdef HAVE_AVX512F
1186 static void
1187 matmul_c4_avx512f (gfc_array_c4 * const restrict retarray,
1188         gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
1189         int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1190 static void
1191 matmul_c4_avx512f (gfc_array_c4 * const restrict retarray,
1192         gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
1193         int blas_limit, blas_call gemm)
1194 {
1195   const GFC_COMPLEX_4 * restrict abase;
1196   const GFC_COMPLEX_4 * restrict bbase;
1197   GFC_COMPLEX_4 * restrict dest;
1198
1199   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1200   index_type x, y, n, count, xcount, ycount;
1201
1202   assert (GFC_DESCRIPTOR_RANK (a) == 2
1203           || GFC_DESCRIPTOR_RANK (b) == 2);
1204
1205 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1206
1207    Either A or B (but not both) can be rank 1:
1208
1209    o One-dimensional argument A is implicitly treated as a row matrix
1210      dimensioned [1,count], so xcount=1.
1211
1212    o One-dimensional argument B is implicitly treated as a column matrix
1213      dimensioned [count, 1], so ycount=1.
1214 */
1215
1216   if (retarray->base_addr == NULL)
1217     {
1218       if (GFC_DESCRIPTOR_RANK (a) == 1)
1219         {
1220           GFC_DIMENSION_SET(retarray->dim[0], 0,
1221                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1222         }
1223       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1224         {
1225           GFC_DIMENSION_SET(retarray->dim[0], 0,
1226                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1227         }
1228       else
1229         {
1230           GFC_DIMENSION_SET(retarray->dim[0], 0,
1231                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1232
1233           GFC_DIMENSION_SET(retarray->dim[1], 0,
1234                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1235                             GFC_DESCRIPTOR_EXTENT(retarray,0));
1236         }
1237
1238       retarray->base_addr
1239         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
1240       retarray->offset = 0;
1241     }
1242   else if (unlikely (compile_options.bounds_check))
1243     {
1244       index_type ret_extent, arg_extent;
1245
1246       if (GFC_DESCRIPTOR_RANK (a) == 1)
1247         {
1248           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1249           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1250           if (arg_extent != ret_extent)
1251             runtime_error ("Incorrect extent in return array in"
1252                            " MATMUL intrinsic: is %ld, should be %ld",
1253                            (long int) ret_extent, (long int) arg_extent);
1254         }
1255       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1256         {
1257           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1258           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1259           if (arg_extent != ret_extent)
1260             runtime_error ("Incorrect extent in return array in"
1261                            " MATMUL intrinsic: is %ld, should be %ld",
1262                            (long int) ret_extent, (long int) arg_extent);
1263         }
1264       else
1265         {
1266           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1267           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1268           if (arg_extent != ret_extent)
1269             runtime_error ("Incorrect extent in return array in"
1270                            " MATMUL intrinsic for dimension 1:"
1271                            " is %ld, should be %ld",
1272                            (long int) ret_extent, (long int) arg_extent);
1273
1274           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1275           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1276           if (arg_extent != ret_extent)
1277             runtime_error ("Incorrect extent in return array in"
1278                            " MATMUL intrinsic for dimension 2:"
1279                            " is %ld, should be %ld",
1280                            (long int) ret_extent, (long int) arg_extent);
1281         }
1282     }
1283
1284
1285   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1286     {
1287       /* One-dimensional result may be addressed in the code below
1288          either as a row or a column matrix. We want both cases to
1289          work. */
1290       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1291     }
1292   else
1293     {
1294       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1295       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1296     }
1297
1298
1299   if (GFC_DESCRIPTOR_RANK (a) == 1)
1300     {
1301       /* Treat it as a a row matrix A[1,count]. */
1302       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1303       aystride = 1;
1304
1305       xcount = 1;
1306       count = GFC_DESCRIPTOR_EXTENT(a,0);
1307     }
1308   else
1309     {
1310       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1311       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1312
1313       count = GFC_DESCRIPTOR_EXTENT(a,1);
1314       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1315     }
1316
1317   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1318     {
1319       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1320         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1321     }
1322
1323   if (GFC_DESCRIPTOR_RANK (b) == 1)
1324     {
1325       /* Treat it as a column matrix B[count,1] */
1326       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1327
1328       /* bystride should never be used for 1-dimensional b.
1329          The value is only used for calculation of the
1330          memory by the buffer.  */
1331       bystride = 256;
1332       ycount = 1;
1333     }
1334   else
1335     {
1336       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1337       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1338       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1339     }
1340
1341   abase = a->base_addr;
1342   bbase = b->base_addr;
1343   dest = retarray->base_addr;
1344
1345   /* Now that everything is set up, we perform the multiplication
1346      itself.  */
1347
1348 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1349 #define min(a,b) ((a) <= (b) ? (a) : (b))
1350 #define max(a,b) ((a) >= (b) ? (a) : (b))
1351
1352   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1353       && (bxstride == 1 || bystride == 1)
1354       && (((float) xcount) * ((float) ycount) * ((float) count)
1355           > POW3(blas_limit)))
1356     {
1357       const int m = xcount, n = ycount, k = count, ldc = rystride;
1358       const GFC_COMPLEX_4 one = 1, zero = 0;
1359       const int lda = (axstride == 1) ? aystride : axstride,
1360                 ldb = (bxstride == 1) ? bystride : bxstride;
1361
1362       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1363         {
1364           assert (gemm != NULL);
1365           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1366                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1367                 &ldc, 1, 1);
1368           return;
1369         }
1370     }
1371
1372   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1373     {
1374       /* This block of code implements a tuned matmul, derived from
1375          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1376
1377                Bo Kagstrom and Per Ling
1378                Department of Computing Science
1379                Umea University
1380                S-901 87 Umea, Sweden
1381
1382          from netlib.org, translated to C, and modified for matmul.m4.  */
1383
1384       const GFC_COMPLEX_4 *a, *b;
1385       GFC_COMPLEX_4 *c;
1386       const index_type m = xcount, n = ycount, k = count;
1387
1388       /* System generated locals */
1389       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1390                  i1, i2, i3, i4, i5, i6;
1391
1392       /* Local variables */
1393       GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
1394                  f13, f14, f23, f24, f33, f34, f43, f44;
1395       index_type i, j, l, ii, jj, ll;
1396       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1397       GFC_COMPLEX_4 *t1;
1398
1399       a = abase;
1400       b = bbase;
1401       c = retarray->base_addr;
1402
1403       /* Parameter adjustments */
1404       c_dim1 = rystride;
1405       c_offset = 1 + c_dim1;
1406       c -= c_offset;
1407       a_dim1 = aystride;
1408       a_offset = 1 + a_dim1;
1409       a -= a_offset;
1410       b_dim1 = bystride;
1411       b_offset = 1 + b_dim1;
1412       b -= b_offset;
1413
1414       /* Empty c first.  */
1415       for (j=1; j<=n; j++)
1416         for (i=1; i<=m; i++)
1417           c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
1418
1419       /* Early exit if possible */
1420       if (m == 0 || n == 0 || k == 0)
1421         return;
1422
1423       /* Adjust size of t1 to what is needed.  */
1424       index_type t1_dim;
1425       t1_dim = (a_dim1-1) * 256 + b_dim1;
1426       if (t1_dim > 65536)
1427         t1_dim = 65536;
1428
1429       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
1430
1431       /* Start turning the crank. */
1432       i1 = n;
1433       for (jj = 1; jj <= i1; jj += 512)
1434         {
1435           /* Computing MIN */
1436           i2 = 512;
1437           i3 = n - jj + 1;
1438           jsec = min(i2,i3);
1439           ujsec = jsec - jsec % 4;
1440           i2 = k;
1441           for (ll = 1; ll <= i2; ll += 256)
1442             {
1443               /* Computing MIN */
1444               i3 = 256;
1445               i4 = k - ll + 1;
1446               lsec = min(i3,i4);
1447               ulsec = lsec - lsec % 2;
1448
1449               i3 = m;
1450               for (ii = 1; ii <= i3; ii += 256)
1451                 {
1452                   /* Computing MIN */
1453                   i4 = 256;
1454                   i5 = m - ii + 1;
1455                   isec = min(i4,i5);
1456                   uisec = isec - isec % 2;
1457                   i4 = ll + ulsec - 1;
1458                   for (l = ll; l <= i4; l += 2)
1459                     {
1460                       i5 = ii + uisec - 1;
1461                       for (i = ii; i <= i5; i += 2)
1462                         {
1463                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1464                                         a[i + l * a_dim1];
1465                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1466                                         a[i + (l + 1) * a_dim1];
1467                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1468                                         a[i + 1 + l * a_dim1];
1469                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1470                                         a[i + 1 + (l + 1) * a_dim1];
1471                         }
1472                       if (uisec < isec)
1473                         {
1474                           t1[l - ll + 1 + (isec << 8) - 257] =
1475                                     a[ii + isec - 1 + l * a_dim1];
1476                           t1[l - ll + 2 + (isec << 8) - 257] =
1477                                     a[ii + isec - 1 + (l + 1) * a_dim1];
1478                         }
1479                     }
1480                   if (ulsec < lsec)
1481                     {
1482                       i4 = ii + isec - 1;
1483                       for (i = ii; i<= i4; ++i)
1484                         {
1485                           t1[lsec + ((i - ii + 1) << 8) - 257] =
1486                                     a[i + (ll + lsec - 1) * a_dim1];
1487                         }
1488                     }
1489
1490                   uisec = isec - isec % 4;
1491                   i4 = jj + ujsec - 1;
1492                   for (j = jj; j <= i4; j += 4)
1493                     {
1494                       i5 = ii + uisec - 1;
1495                       for (i = ii; i <= i5; i += 4)
1496                         {
1497                           f11 = c[i + j * c_dim1];
1498                           f21 = c[i + 1 + j * c_dim1];
1499                           f12 = c[i + (j + 1) * c_dim1];
1500                           f22 = c[i + 1 + (j + 1) * c_dim1];
1501                           f13 = c[i + (j + 2) * c_dim1];
1502                           f23 = c[i + 1 + (j + 2) * c_dim1];
1503                           f14 = c[i + (j + 3) * c_dim1];
1504                           f24 = c[i + 1 + (j + 3) * c_dim1];
1505                           f31 = c[i + 2 + j * c_dim1];
1506                           f41 = c[i + 3 + j * c_dim1];
1507                           f32 = c[i + 2 + (j + 1) * c_dim1];
1508                           f42 = c[i + 3 + (j + 1) * c_dim1];
1509                           f33 = c[i + 2 + (j + 2) * c_dim1];
1510                           f43 = c[i + 3 + (j + 2) * c_dim1];
1511                           f34 = c[i + 2 + (j + 3) * c_dim1];
1512                           f44 = c[i + 3 + (j + 3) * c_dim1];
1513                           i6 = ll + lsec - 1;
1514                           for (l = ll; l <= i6; ++l)
1515                             {
1516                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1517                                       * b[l + j * b_dim1];
1518                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1519                                       * b[l + j * b_dim1];
1520                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1521                                       * b[l + (j + 1) * b_dim1];
1522                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1523                                       * b[l + (j + 1) * b_dim1];
1524                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1525                                       * b[l + (j + 2) * b_dim1];
1526                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1527                                       * b[l + (j + 2) * b_dim1];
1528                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1529                                       * b[l + (j + 3) * b_dim1];
1530                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1531                                       * b[l + (j + 3) * b_dim1];
1532                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1533                                       * b[l + j * b_dim1];
1534                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1535                                       * b[l + j * b_dim1];
1536                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1537                                       * b[l + (j + 1) * b_dim1];
1538                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1539                                       * b[l + (j + 1) * b_dim1];
1540                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1541                                       * b[l + (j + 2) * b_dim1];
1542                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1543                                       * b[l + (j + 2) * b_dim1];
1544                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1545                                       * b[l + (j + 3) * b_dim1];
1546                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1547                                       * b[l + (j + 3) * b_dim1];
1548                             }
1549                           c[i + j * c_dim1] = f11;
1550                           c[i + 1 + j * c_dim1] = f21;
1551                           c[i + (j + 1) * c_dim1] = f12;
1552                           c[i + 1 + (j + 1) * c_dim1] = f22;
1553                           c[i + (j + 2) * c_dim1] = f13;
1554                           c[i + 1 + (j + 2) * c_dim1] = f23;
1555                           c[i + (j + 3) * c_dim1] = f14;
1556                           c[i + 1 + (j + 3) * c_dim1] = f24;
1557                           c[i + 2 + j * c_dim1] = f31;
1558                           c[i + 3 + j * c_dim1] = f41;
1559                           c[i + 2 + (j + 1) * c_dim1] = f32;
1560                           c[i + 3 + (j + 1) * c_dim1] = f42;
1561                           c[i + 2 + (j + 2) * c_dim1] = f33;
1562                           c[i + 3 + (j + 2) * c_dim1] = f43;
1563                           c[i + 2 + (j + 3) * c_dim1] = f34;
1564                           c[i + 3 + (j + 3) * c_dim1] = f44;
1565                         }
1566                       if (uisec < isec)
1567                         {
1568                           i5 = ii + isec - 1;
1569                           for (i = ii + uisec; i <= i5; ++i)
1570                             {
1571                               f11 = c[i + j * c_dim1];
1572                               f12 = c[i + (j + 1) * c_dim1];
1573                               f13 = c[i + (j + 2) * c_dim1];
1574                               f14 = c[i + (j + 3) * c_dim1];
1575                               i6 = ll + lsec - 1;
1576                               for (l = ll; l <= i6; ++l)
1577                                 {
1578                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1579                                           257] * b[l + j * b_dim1];
1580                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1581                                           257] * b[l + (j + 1) * b_dim1];
1582                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1583                                           257] * b[l + (j + 2) * b_dim1];
1584                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1585                                           257] * b[l + (j + 3) * b_dim1];
1586                                 }
1587                               c[i + j * c_dim1] = f11;
1588                               c[i + (j + 1) * c_dim1] = f12;
1589                               c[i + (j + 2) * c_dim1] = f13;
1590                               c[i + (j + 3) * c_dim1] = f14;
1591                             }
1592                         }
1593                     }
1594                   if (ujsec < jsec)
1595                     {
1596                       i4 = jj + jsec - 1;
1597                       for (j = jj + ujsec; j <= i4; ++j)
1598                         {
1599                           i5 = ii + uisec - 1;
1600                           for (i = ii; i <= i5; i += 4)
1601                             {
1602                               f11 = c[i + j * c_dim1];
1603                               f21 = c[i + 1 + j * c_dim1];
1604                               f31 = c[i + 2 + j * c_dim1];
1605                               f41 = c[i + 3 + j * c_dim1];
1606                               i6 = ll + lsec - 1;
1607                               for (l = ll; l <= i6; ++l)
1608                                 {
1609                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1610                                           257] * b[l + j * b_dim1];
1611                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1612                                           257] * b[l + j * b_dim1];
1613                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1614                                           257] * b[l + j * b_dim1];
1615                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1616                                           257] * b[l + j * b_dim1];
1617                                 }
1618                               c[i + j * c_dim1] = f11;
1619                               c[i + 1 + j * c_dim1] = f21;
1620                               c[i + 2 + j * c_dim1] = f31;
1621                               c[i + 3 + j * c_dim1] = f41;
1622                             }
1623                           i5 = ii + isec - 1;
1624                           for (i = ii + uisec; i <= i5; ++i)
1625                             {
1626                               f11 = c[i + j * c_dim1];
1627                               i6 = ll + lsec - 1;
1628                               for (l = ll; l <= i6; ++l)
1629                                 {
1630                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1631                                           257] * b[l + j * b_dim1];
1632                                 }
1633                               c[i + j * c_dim1] = f11;
1634                             }
1635                         }
1636                     }
1637                 }
1638             }
1639         }
1640       free(t1);
1641       return;
1642     }
1643   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1644     {
1645       if (GFC_DESCRIPTOR_RANK (a) != 1)
1646         {
1647           const GFC_COMPLEX_4 *restrict abase_x;
1648           const GFC_COMPLEX_4 *restrict bbase_y;
1649           GFC_COMPLEX_4 *restrict dest_y;
1650           GFC_COMPLEX_4 s;
1651
1652           for (y = 0; y < ycount; y++)
1653             {
1654               bbase_y = &bbase[y*bystride];
1655               dest_y = &dest[y*rystride];
1656               for (x = 0; x < xcount; x++)
1657                 {
1658                   abase_x = &abase[x*axstride];
1659                   s = (GFC_COMPLEX_4) 0;
1660                   for (n = 0; n < count; n++)
1661                     s += abase_x[n] * bbase_y[n];
1662                   dest_y[x] = s;
1663                 }
1664             }
1665         }
1666       else
1667         {
1668           const GFC_COMPLEX_4 *restrict bbase_y;
1669           GFC_COMPLEX_4 s;
1670
1671           for (y = 0; y < ycount; y++)
1672             {
1673               bbase_y = &bbase[y*bystride];
1674               s = (GFC_COMPLEX_4) 0;
1675               for (n = 0; n < count; n++)
1676                 s += abase[n*axstride] * bbase_y[n];
1677               dest[y*rystride] = s;
1678             }
1679         }
1680     }
1681   else if (axstride < aystride)
1682     {
1683       for (y = 0; y < ycount; y++)
1684         for (x = 0; x < xcount; x++)
1685           dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
1686
1687       for (y = 0; y < ycount; y++)
1688         for (n = 0; n < count; n++)
1689           for (x = 0; x < xcount; x++)
1690             /* dest[x,y] += a[x,n] * b[n,y] */
1691             dest[x*rxstride + y*rystride] +=
1692                                         abase[x*axstride + n*aystride] *
1693                                         bbase[n*bxstride + y*bystride];
1694     }
1695   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1696     {
1697       const GFC_COMPLEX_4 *restrict bbase_y;
1698       GFC_COMPLEX_4 s;
1699
1700       for (y = 0; y < ycount; y++)
1701         {
1702           bbase_y = &bbase[y*bystride];
1703           s = (GFC_COMPLEX_4) 0;
1704           for (n = 0; n < count; n++)
1705             s += abase[n*axstride] * bbase_y[n*bxstride];
1706           dest[y*rxstride] = s;
1707         }
1708     }
1709   else
1710     {
1711       const GFC_COMPLEX_4 *restrict abase_x;
1712       const GFC_COMPLEX_4 *restrict bbase_y;
1713       GFC_COMPLEX_4 *restrict dest_y;
1714       GFC_COMPLEX_4 s;
1715
1716       for (y = 0; y < ycount; y++)
1717         {
1718           bbase_y = &bbase[y*bystride];
1719           dest_y = &dest[y*rystride];
1720           for (x = 0; x < xcount; x++)
1721             {
1722               abase_x = &abase[x*axstride];
1723               s = (GFC_COMPLEX_4) 0;
1724               for (n = 0; n < count; n++)
1725                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1726               dest_y[x*rxstride] = s;
1727             }
1728         }
1729     }
1730 }
1731 #undef POW3
1732 #undef min
1733 #undef max
1734
1735 #endif  /* HAVE_AVX512F */
1736
1737 /* AMD-specifix funtions with AVX128 and FMA3/FMA4.  */
1738
1739 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
1740 void
1741 matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
1742         gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
1743         int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
1744 internal_proto(matmul_c4_avx128_fma3);
1745 #endif
1746
1747 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
1748 void
1749 matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
1750         gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
1751         int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
1752 internal_proto(matmul_c4_avx128_fma4);
1753 #endif
1754
1755 /* Function to fall back to if there is no special processor-specific version.  */
1756 static void
1757 matmul_c4_vanilla (gfc_array_c4 * const restrict retarray,
1758         gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
1759         int blas_limit, blas_call gemm)
1760 {
1761   const GFC_COMPLEX_4 * restrict abase;
1762   const GFC_COMPLEX_4 * restrict bbase;
1763   GFC_COMPLEX_4 * restrict dest;
1764
1765   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1766   index_type x, y, n, count, xcount, ycount;
1767
1768   assert (GFC_DESCRIPTOR_RANK (a) == 2
1769           || GFC_DESCRIPTOR_RANK (b) == 2);
1770
1771 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1772
1773    Either A or B (but not both) can be rank 1:
1774
1775    o One-dimensional argument A is implicitly treated as a row matrix
1776      dimensioned [1,count], so xcount=1.
1777
1778    o One-dimensional argument B is implicitly treated as a column matrix
1779      dimensioned [count, 1], so ycount=1.
1780 */
1781
1782   if (retarray->base_addr == NULL)
1783     {
1784       if (GFC_DESCRIPTOR_RANK (a) == 1)
1785         {
1786           GFC_DIMENSION_SET(retarray->dim[0], 0,
1787                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1788         }
1789       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1790         {
1791           GFC_DIMENSION_SET(retarray->dim[0], 0,
1792                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1793         }
1794       else
1795         {
1796           GFC_DIMENSION_SET(retarray->dim[0], 0,
1797                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1798
1799           GFC_DIMENSION_SET(retarray->dim[1], 0,
1800                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1801                             GFC_DESCRIPTOR_EXTENT(retarray,0));
1802         }
1803
1804       retarray->base_addr
1805         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
1806       retarray->offset = 0;
1807     }
1808   else if (unlikely (compile_options.bounds_check))
1809     {
1810       index_type ret_extent, arg_extent;
1811
1812       if (GFC_DESCRIPTOR_RANK (a) == 1)
1813         {
1814           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1815           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1816           if (arg_extent != ret_extent)
1817             runtime_error ("Incorrect extent in return array in"
1818                            " MATMUL intrinsic: is %ld, should be %ld",
1819                            (long int) ret_extent, (long int) arg_extent);
1820         }
1821       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1822         {
1823           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1824           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1825           if (arg_extent != ret_extent)
1826             runtime_error ("Incorrect extent in return array in"
1827                            " MATMUL intrinsic: is %ld, should be %ld",
1828                            (long int) ret_extent, (long int) arg_extent);
1829         }
1830       else
1831         {
1832           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1833           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1834           if (arg_extent != ret_extent)
1835             runtime_error ("Incorrect extent in return array in"
1836                            " MATMUL intrinsic for dimension 1:"
1837                            " is %ld, should be %ld",
1838                            (long int) ret_extent, (long int) arg_extent);
1839
1840           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1841           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1842           if (arg_extent != ret_extent)
1843             runtime_error ("Incorrect extent in return array in"
1844                            " MATMUL intrinsic for dimension 2:"
1845                            " is %ld, should be %ld",
1846                            (long int) ret_extent, (long int) arg_extent);
1847         }
1848     }
1849
1850
1851   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1852     {
1853       /* One-dimensional result may be addressed in the code below
1854          either as a row or a column matrix. We want both cases to
1855          work. */
1856       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1857     }
1858   else
1859     {
1860       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1861       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1862     }
1863
1864
1865   if (GFC_DESCRIPTOR_RANK (a) == 1)
1866     {
1867       /* Treat it as a a row matrix A[1,count]. */
1868       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1869       aystride = 1;
1870
1871       xcount = 1;
1872       count = GFC_DESCRIPTOR_EXTENT(a,0);
1873     }
1874   else
1875     {
1876       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1877       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1878
1879       count = GFC_DESCRIPTOR_EXTENT(a,1);
1880       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1881     }
1882
1883   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1884     {
1885       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1886         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1887     }
1888
1889   if (GFC_DESCRIPTOR_RANK (b) == 1)
1890     {
1891       /* Treat it as a column matrix B[count,1] */
1892       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1893
1894       /* bystride should never be used for 1-dimensional b.
1895          The value is only used for calculation of the
1896          memory by the buffer.  */
1897       bystride = 256;
1898       ycount = 1;
1899     }
1900   else
1901     {
1902       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1903       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1904       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1905     }
1906
1907   abase = a->base_addr;
1908   bbase = b->base_addr;
1909   dest = retarray->base_addr;
1910
1911   /* Now that everything is set up, we perform the multiplication
1912      itself.  */
1913
1914 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1915 #define min(a,b) ((a) <= (b) ? (a) : (b))
1916 #define max(a,b) ((a) >= (b) ? (a) : (b))
1917
1918   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1919       && (bxstride == 1 || bystride == 1)
1920       && (((float) xcount) * ((float) ycount) * ((float) count)
1921           > POW3(blas_limit)))
1922     {
1923       const int m = xcount, n = ycount, k = count, ldc = rystride;
1924       const GFC_COMPLEX_4 one = 1, zero = 0;
1925       const int lda = (axstride == 1) ? aystride : axstride,
1926                 ldb = (bxstride == 1) ? bystride : bxstride;
1927
1928       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1929         {
1930           assert (gemm != NULL);
1931           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1932                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1933                 &ldc, 1, 1);
1934           return;
1935         }
1936     }
1937
1938   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1939     {
1940       /* This block of code implements a tuned matmul, derived from
1941          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1942
1943                Bo Kagstrom and Per Ling
1944                Department of Computing Science
1945                Umea University
1946                S-901 87 Umea, Sweden
1947
1948          from netlib.org, translated to C, and modified for matmul.m4.  */
1949
1950       const GFC_COMPLEX_4 *a, *b;
1951       GFC_COMPLEX_4 *c;
1952       const index_type m = xcount, n = ycount, k = count;
1953
1954       /* System generated locals */
1955       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1956                  i1, i2, i3, i4, i5, i6;
1957
1958       /* Local variables */
1959       GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
1960                  f13, f14, f23, f24, f33, f34, f43, f44;
1961       index_type i, j, l, ii, jj, ll;
1962       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1963       GFC_COMPLEX_4 *t1;
1964
1965       a = abase;
1966       b = bbase;
1967       c = retarray->base_addr;
1968
1969       /* Parameter adjustments */
1970       c_dim1 = rystride;
1971       c_offset = 1 + c_dim1;
1972       c -= c_offset;
1973       a_dim1 = aystride;
1974       a_offset = 1 + a_dim1;
1975       a -= a_offset;
1976       b_dim1 = bystride;
1977       b_offset = 1 + b_dim1;
1978       b -= b_offset;
1979
1980       /* Empty c first.  */
1981       for (j=1; j<=n; j++)
1982         for (i=1; i<=m; i++)
1983           c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
1984
1985       /* Early exit if possible */
1986       if (m == 0 || n == 0 || k == 0)
1987         return;
1988
1989       /* Adjust size of t1 to what is needed.  */
1990       index_type t1_dim;
1991       t1_dim = (a_dim1-1) * 256 + b_dim1;
1992       if (t1_dim > 65536)
1993         t1_dim = 65536;
1994
1995       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
1996
1997       /* Start turning the crank. */
1998       i1 = n;
1999       for (jj = 1; jj <= i1; jj += 512)
2000         {
2001           /* Computing MIN */
2002           i2 = 512;
2003           i3 = n - jj + 1;
2004           jsec = min(i2,i3);
2005           ujsec = jsec - jsec % 4;
2006           i2 = k;
2007           for (ll = 1; ll <= i2; ll += 256)
2008             {
2009               /* Computing MIN */
2010               i3 = 256;
2011               i4 = k - ll + 1;
2012               lsec = min(i3,i4);
2013               ulsec = lsec - lsec % 2;
2014
2015               i3 = m;
2016               for (ii = 1; ii <= i3; ii += 256)
2017                 {
2018                   /* Computing MIN */
2019                   i4 = 256;
2020                   i5 = m - ii + 1;
2021                   isec = min(i4,i5);
2022                   uisec = isec - isec % 2;
2023                   i4 = ll + ulsec - 1;
2024                   for (l = ll; l <= i4; l += 2)
2025                     {
2026                       i5 = ii + uisec - 1;
2027                       for (i = ii; i <= i5; i += 2)
2028                         {
2029                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2030                                         a[i + l * a_dim1];
2031                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2032                                         a[i + (l + 1) * a_dim1];
2033                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2034                                         a[i + 1 + l * a_dim1];
2035                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2036                                         a[i + 1 + (l + 1) * a_dim1];
2037                         }
2038                       if (uisec < isec)
2039                         {
2040                           t1[l - ll + 1 + (isec << 8) - 257] =
2041                                     a[ii + isec - 1 + l * a_dim1];
2042                           t1[l - ll + 2 + (isec << 8) - 257] =
2043                                     a[ii + isec - 1 + (l + 1) * a_dim1];
2044                         }
2045                     }
2046                   if (ulsec < lsec)
2047                     {
2048                       i4 = ii + isec - 1;
2049                       for (i = ii; i<= i4; ++i)
2050                         {
2051                           t1[lsec + ((i - ii + 1) << 8) - 257] =
2052                                     a[i + (ll + lsec - 1) * a_dim1];
2053                         }
2054                     }
2055
2056                   uisec = isec - isec % 4;
2057                   i4 = jj + ujsec - 1;
2058                   for (j = jj; j <= i4; j += 4)
2059                     {
2060                       i5 = ii + uisec - 1;
2061                       for (i = ii; i <= i5; i += 4)
2062                         {
2063                           f11 = c[i + j * c_dim1];
2064                           f21 = c[i + 1 + j * c_dim1];
2065                           f12 = c[i + (j + 1) * c_dim1];
2066                           f22 = c[i + 1 + (j + 1) * c_dim1];
2067                           f13 = c[i + (j + 2) * c_dim1];
2068                           f23 = c[i + 1 + (j + 2) * c_dim1];
2069                           f14 = c[i + (j + 3) * c_dim1];
2070                           f24 = c[i + 1 + (j + 3) * c_dim1];
2071                           f31 = c[i + 2 + j * c_dim1];
2072                           f41 = c[i + 3 + j * c_dim1];
2073                           f32 = c[i + 2 + (j + 1) * c_dim1];
2074                           f42 = c[i + 3 + (j + 1) * c_dim1];
2075                           f33 = c[i + 2 + (j + 2) * c_dim1];
2076                           f43 = c[i + 3 + (j + 2) * c_dim1];
2077                           f34 = c[i + 2 + (j + 3) * c_dim1];
2078                           f44 = c[i + 3 + (j + 3) * c_dim1];
2079                           i6 = ll + lsec - 1;
2080                           for (l = ll; l <= i6; ++l)
2081                             {
2082                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2083                                       * b[l + j * b_dim1];
2084                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2085                                       * b[l + j * b_dim1];
2086                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2087                                       * b[l + (j + 1) * b_dim1];
2088                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2089                                       * b[l + (j + 1) * b_dim1];
2090                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2091                                       * b[l + (j + 2) * b_dim1];
2092                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2093                                       * b[l + (j + 2) * b_dim1];
2094                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2095                                       * b[l + (j + 3) * b_dim1];
2096                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2097                                       * b[l + (j + 3) * b_dim1];
2098                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2099                                       * b[l + j * b_dim1];
2100                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2101                                       * b[l + j * b_dim1];
2102                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2103                                       * b[l + (j + 1) * b_dim1];
2104                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2105                                       * b[l + (j + 1) * b_dim1];
2106                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2107                                       * b[l + (j + 2) * b_dim1];
2108                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2109                                       * b[l + (j + 2) * b_dim1];
2110                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2111                                       * b[l + (j + 3) * b_dim1];
2112                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2113                                       * b[l + (j + 3) * b_dim1];
2114                             }
2115                           c[i + j * c_dim1] = f11;
2116                           c[i + 1 + j * c_dim1] = f21;
2117                           c[i + (j + 1) * c_dim1] = f12;
2118                           c[i + 1 + (j + 1) * c_dim1] = f22;
2119                           c[i + (j + 2) * c_dim1] = f13;
2120                           c[i + 1 + (j + 2) * c_dim1] = f23;
2121                           c[i + (j + 3) * c_dim1] = f14;
2122                           c[i + 1 + (j + 3) * c_dim1] = f24;
2123                           c[i + 2 + j * c_dim1] = f31;
2124                           c[i + 3 + j * c_dim1] = f41;
2125                           c[i + 2 + (j + 1) * c_dim1] = f32;
2126                           c[i + 3 + (j + 1) * c_dim1] = f42;
2127                           c[i + 2 + (j + 2) * c_dim1] = f33;
2128                           c[i + 3 + (j + 2) * c_dim1] = f43;
2129                           c[i + 2 + (j + 3) * c_dim1] = f34;
2130                           c[i + 3 + (j + 3) * c_dim1] = f44;
2131                         }
2132                       if (uisec < isec)
2133                         {
2134                           i5 = ii + isec - 1;
2135                           for (i = ii + uisec; i <= i5; ++i)
2136                             {
2137                               f11 = c[i + j * c_dim1];
2138                               f12 = c[i + (j + 1) * c_dim1];
2139                               f13 = c[i + (j + 2) * c_dim1];
2140                               f14 = c[i + (j + 3) * c_dim1];
2141                               i6 = ll + lsec - 1;
2142                               for (l = ll; l <= i6; ++l)
2143                                 {
2144                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2145                                           257] * b[l + j * b_dim1];
2146                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2147                                           257] * b[l + (j + 1) * b_dim1];
2148                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2149                                           257] * b[l + (j + 2) * b_dim1];
2150                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2151                                           257] * b[l + (j + 3) * b_dim1];
2152                                 }
2153                               c[i + j * c_dim1] = f11;
2154                               c[i + (j + 1) * c_dim1] = f12;
2155                               c[i + (j + 2) * c_dim1] = f13;
2156                               c[i + (j + 3) * c_dim1] = f14;
2157                             }
2158                         }
2159                     }
2160                   if (ujsec < jsec)
2161                     {
2162                       i4 = jj + jsec - 1;
2163                       for (j = jj + ujsec; j <= i4; ++j)
2164                         {
2165                           i5 = ii + uisec - 1;
2166                           for (i = ii; i <= i5; i += 4)
2167                             {
2168                               f11 = c[i + j * c_dim1];
2169                               f21 = c[i + 1 + j * c_dim1];
2170                               f31 = c[i + 2 + j * c_dim1];
2171                               f41 = c[i + 3 + j * c_dim1];
2172                               i6 = ll + lsec - 1;
2173                               for (l = ll; l <= i6; ++l)
2174                                 {
2175                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2176                                           257] * b[l + j * b_dim1];
2177                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2178                                           257] * b[l + j * b_dim1];
2179                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2180                                           257] * b[l + j * b_dim1];
2181                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2182                                           257] * b[l + j * b_dim1];
2183                                 }
2184                               c[i + j * c_dim1] = f11;
2185                               c[i + 1 + j * c_dim1] = f21;
2186                               c[i + 2 + j * c_dim1] = f31;
2187                               c[i + 3 + j * c_dim1] = f41;
2188                             }
2189                           i5 = ii + isec - 1;
2190                           for (i = ii + uisec; i <= i5; ++i)
2191                             {
2192                               f11 = c[i + j * c_dim1];
2193                               i6 = ll + lsec - 1;
2194                               for (l = ll; l <= i6; ++l)
2195                                 {
2196                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2197                                           257] * b[l + j * b_dim1];
2198                                 }
2199                               c[i + j * c_dim1] = f11;
2200                             }
2201                         }
2202                     }
2203                 }
2204             }
2205         }
2206       free(t1);
2207       return;
2208     }
2209   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2210     {
2211       if (GFC_DESCRIPTOR_RANK (a) != 1)
2212         {
2213           const GFC_COMPLEX_4 *restrict abase_x;
2214           const GFC_COMPLEX_4 *restrict bbase_y;
2215           GFC_COMPLEX_4 *restrict dest_y;
2216           GFC_COMPLEX_4 s;
2217
2218           for (y = 0; y < ycount; y++)
2219             {
2220               bbase_y = &bbase[y*bystride];
2221               dest_y = &dest[y*rystride];
2222               for (x = 0; x < xcount; x++)
2223                 {
2224                   abase_x = &abase[x*axstride];
2225                   s = (GFC_COMPLEX_4) 0;
2226                   for (n = 0; n < count; n++)
2227                     s += abase_x[n] * bbase_y[n];
2228                   dest_y[x] = s;
2229                 }
2230             }
2231         }
2232       else
2233         {
2234           const GFC_COMPLEX_4 *restrict bbase_y;
2235           GFC_COMPLEX_4 s;
2236
2237           for (y = 0; y < ycount; y++)
2238             {
2239               bbase_y = &bbase[y*bystride];
2240               s = (GFC_COMPLEX_4) 0;
2241               for (n = 0; n < count; n++)
2242                 s += abase[n*axstride] * bbase_y[n];
2243               dest[y*rystride] = s;
2244             }
2245         }
2246     }
2247   else if (axstride < aystride)
2248     {
2249       for (y = 0; y < ycount; y++)
2250         for (x = 0; x < xcount; x++)
2251           dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
2252
2253       for (y = 0; y < ycount; y++)
2254         for (n = 0; n < count; n++)
2255           for (x = 0; x < xcount; x++)
2256             /* dest[x,y] += a[x,n] * b[n,y] */
2257             dest[x*rxstride + y*rystride] +=
2258                                         abase[x*axstride + n*aystride] *
2259                                         bbase[n*bxstride + y*bystride];
2260     }
2261   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2262     {
2263       const GFC_COMPLEX_4 *restrict bbase_y;
2264       GFC_COMPLEX_4 s;
2265
2266       for (y = 0; y < ycount; y++)
2267         {
2268           bbase_y = &bbase[y*bystride];
2269           s = (GFC_COMPLEX_4) 0;
2270           for (n = 0; n < count; n++)
2271             s += abase[n*axstride] * bbase_y[n*bxstride];
2272           dest[y*rxstride] = s;
2273         }
2274     }
2275   else
2276     {
2277       const GFC_COMPLEX_4 *restrict abase_x;
2278       const GFC_COMPLEX_4 *restrict bbase_y;
2279       GFC_COMPLEX_4 *restrict dest_y;
2280       GFC_COMPLEX_4 s;
2281
2282       for (y = 0; y < ycount; y++)
2283         {
2284           bbase_y = &bbase[y*bystride];
2285           dest_y = &dest[y*rystride];
2286           for (x = 0; x < xcount; x++)
2287             {
2288               abase_x = &abase[x*axstride];
2289               s = (GFC_COMPLEX_4) 0;
2290               for (n = 0; n < count; n++)
2291                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2292               dest_y[x*rxstride] = s;
2293             }
2294         }
2295     }
2296 }
2297 #undef POW3
2298 #undef min
2299 #undef max
2300
2301
2302 /* Compiling main function, with selection code for the processor.  */
2303
2304 /* Currently, this is i386 only.  Adjust for other architectures.  */
2305
2306 #include <config/i386/cpuinfo.h>
2307 void matmul_c4 (gfc_array_c4 * const restrict retarray,
2308         gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
2309         int blas_limit, blas_call gemm)
2310 {
2311   static void (*matmul_p) (gfc_array_c4 * const restrict retarray,
2312         gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
2313         int blas_limit, blas_call gemm);
2314
2315   void (*matmul_fn) (gfc_array_c4 * const restrict retarray,
2316         gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
2317         int blas_limit, blas_call gemm);
2318
2319   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2320   if (matmul_fn == NULL)
2321     {
2322       matmul_fn = matmul_c4_vanilla;
2323       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2324         {
2325           /* Run down the available processors in order of preference.  */
2326 #ifdef HAVE_AVX512F
2327           if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2328             {
2329               matmul_fn = matmul_c4_avx512f;
2330               goto store;
2331             }
2332
2333 #endif  /* HAVE_AVX512F */
2334
2335 #ifdef HAVE_AVX2
2336           if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2337              && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2338             {
2339               matmul_fn = matmul_c4_avx2;
2340               goto store;
2341             }
2342
2343 #endif
2344
2345 #ifdef HAVE_AVX
2346           if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2347             {
2348               matmul_fn = matmul_c4_avx;
2349               goto store;
2350             }
2351 #endif  /* HAVE_AVX */
2352         }
2353     else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
2354       {
2355 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2356         if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2357             && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2358           {
2359             matmul_fn = matmul_c4_avx128_fma3;
2360             goto store;
2361           }
2362 #endif
2363 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2364         if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2365              && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
2366           {
2367             matmul_fn = matmul_c4_avx128_fma4;
2368             goto store;
2369           }
2370 #endif
2371
2372       }
2373    store:
2374       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2375    }
2376
2377    (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2378 }
2379
2380 #else  /* Just the vanilla function.  */
2381
2382 void
2383 matmul_c4 (gfc_array_c4 * const restrict retarray,
2384         gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
2385         int blas_limit, blas_call gemm)
2386 {
2387   const GFC_COMPLEX_4 * restrict abase;
2388   const GFC_COMPLEX_4 * restrict bbase;
2389   GFC_COMPLEX_4 * restrict dest;
2390
2391   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2392   index_type x, y, n, count, xcount, ycount;
2393
2394   assert (GFC_DESCRIPTOR_RANK (a) == 2
2395           || GFC_DESCRIPTOR_RANK (b) == 2);
2396
2397 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2398
2399    Either A or B (but not both) can be rank 1:
2400
2401    o One-dimensional argument A is implicitly treated as a row matrix
2402      dimensioned [1,count], so xcount=1.
2403
2404    o One-dimensional argument B is implicitly treated as a column matrix
2405      dimensioned [count, 1], so ycount=1.
2406 */
2407
2408   if (retarray->base_addr == NULL)
2409     {
2410       if (GFC_DESCRIPTOR_RANK (a) == 1)
2411         {
2412           GFC_DIMENSION_SET(retarray->dim[0], 0,
2413                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2414         }
2415       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2416         {
2417           GFC_DIMENSION_SET(retarray->dim[0], 0,
2418                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2419         }
2420       else
2421         {
2422           GFC_DIMENSION_SET(retarray->dim[0], 0,
2423                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2424
2425           GFC_DIMENSION_SET(retarray->dim[1], 0,
2426                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2427                             GFC_DESCRIPTOR_EXTENT(retarray,0));
2428         }
2429
2430       retarray->base_addr
2431         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4));
2432       retarray->offset = 0;
2433     }
2434   else if (unlikely (compile_options.bounds_check))
2435     {
2436       index_type ret_extent, arg_extent;
2437
2438       if (GFC_DESCRIPTOR_RANK (a) == 1)
2439         {
2440           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2441           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2442           if (arg_extent != ret_extent)
2443             runtime_error ("Incorrect extent in return array in"
2444                            " MATMUL intrinsic: is %ld, should be %ld",
2445                            (long int) ret_extent, (long int) arg_extent);
2446         }
2447       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2448         {
2449           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2450           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2451           if (arg_extent != ret_extent)
2452             runtime_error ("Incorrect extent in return array in"
2453                            " MATMUL intrinsic: is %ld, should be %ld",
2454                            (long int) ret_extent, (long int) arg_extent);
2455         }
2456       else
2457         {
2458           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2459           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2460           if (arg_extent != ret_extent)
2461             runtime_error ("Incorrect extent in return array in"
2462                            " MATMUL intrinsic for dimension 1:"
2463                            " is %ld, should be %ld",
2464                            (long int) ret_extent, (long int) arg_extent);
2465
2466           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2467           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2468           if (arg_extent != ret_extent)
2469             runtime_error ("Incorrect extent in return array in"
2470                            " MATMUL intrinsic for dimension 2:"
2471                            " is %ld, should be %ld",
2472                            (long int) ret_extent, (long int) arg_extent);
2473         }
2474     }
2475
2476
2477   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2478     {
2479       /* One-dimensional result may be addressed in the code below
2480          either as a row or a column matrix. We want both cases to
2481          work. */
2482       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2483     }
2484   else
2485     {
2486       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2487       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2488     }
2489
2490
2491   if (GFC_DESCRIPTOR_RANK (a) == 1)
2492     {
2493       /* Treat it as a a row matrix A[1,count]. */
2494       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2495       aystride = 1;
2496
2497       xcount = 1;
2498       count = GFC_DESCRIPTOR_EXTENT(a,0);
2499     }
2500   else
2501     {
2502       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2503       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2504
2505       count = GFC_DESCRIPTOR_EXTENT(a,1);
2506       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2507     }
2508
2509   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2510     {
2511       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2512         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2513     }
2514
2515   if (GFC_DESCRIPTOR_RANK (b) == 1)
2516     {
2517       /* Treat it as a column matrix B[count,1] */
2518       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2519
2520       /* bystride should never be used for 1-dimensional b.
2521          The value is only used for calculation of the
2522          memory by the buffer.  */
2523       bystride = 256;
2524       ycount = 1;
2525     }
2526   else
2527     {
2528       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2529       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2530       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2531     }
2532
2533   abase = a->base_addr;
2534   bbase = b->base_addr;
2535   dest = retarray->base_addr;
2536
2537   /* Now that everything is set up, we perform the multiplication
2538      itself.  */
2539
2540 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2541 #define min(a,b) ((a) <= (b) ? (a) : (b))
2542 #define max(a,b) ((a) >= (b) ? (a) : (b))
2543
2544   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2545       && (bxstride == 1 || bystride == 1)
2546       && (((float) xcount) * ((float) ycount) * ((float) count)
2547           > POW3(blas_limit)))
2548     {
2549       const int m = xcount, n = ycount, k = count, ldc = rystride;
2550       const GFC_COMPLEX_4 one = 1, zero = 0;
2551       const int lda = (axstride == 1) ? aystride : axstride,
2552                 ldb = (bxstride == 1) ? bystride : bxstride;
2553
2554       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2555         {
2556           assert (gemm != NULL);
2557           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2558                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2559                 &ldc, 1, 1);
2560           return;
2561         }
2562     }
2563
2564   if (rxstride == 1 && axstride == 1 && bxstride == 1)
2565     {
2566       /* This block of code implements a tuned matmul, derived from
2567          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2568
2569                Bo Kagstrom and Per Ling
2570                Department of Computing Science
2571                Umea University
2572                S-901 87 Umea, Sweden
2573
2574          from netlib.org, translated to C, and modified for matmul.m4.  */
2575
2576       const GFC_COMPLEX_4 *a, *b;
2577       GFC_COMPLEX_4 *c;
2578       const index_type m = xcount, n = ycount, k = count;
2579
2580       /* System generated locals */
2581       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2582                  i1, i2, i3, i4, i5, i6;
2583
2584       /* Local variables */
2585       GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
2586                  f13, f14, f23, f24, f33, f34, f43, f44;
2587       index_type i, j, l, ii, jj, ll;
2588       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2589       GFC_COMPLEX_4 *t1;
2590
2591       a = abase;
2592       b = bbase;
2593       c = retarray->base_addr;
2594
2595       /* Parameter adjustments */
2596       c_dim1 = rystride;
2597       c_offset = 1 + c_dim1;
2598       c -= c_offset;
2599       a_dim1 = aystride;
2600       a_offset = 1 + a_dim1;
2601       a -= a_offset;
2602       b_dim1 = bystride;
2603       b_offset = 1 + b_dim1;
2604       b -= b_offset;
2605
2606       /* Empty c first.  */
2607       for (j=1; j<=n; j++)
2608         for (i=1; i<=m; i++)
2609           c[i + j * c_dim1] = (GFC_COMPLEX_4)0;
2610
2611       /* Early exit if possible */
2612       if (m == 0 || n == 0 || k == 0)
2613         return;
2614
2615       /* Adjust size of t1 to what is needed.  */
2616       index_type t1_dim;
2617       t1_dim = (a_dim1-1) * 256 + b_dim1;
2618       if (t1_dim > 65536)
2619         t1_dim = 65536;
2620
2621       t1 = malloc (t1_dim * sizeof(GFC_COMPLEX_4));
2622
2623       /* Start turning the crank. */
2624       i1 = n;
2625       for (jj = 1; jj <= i1; jj += 512)
2626         {
2627           /* Computing MIN */
2628           i2 = 512;
2629           i3 = n - jj + 1;
2630           jsec = min(i2,i3);
2631           ujsec = jsec - jsec % 4;
2632           i2 = k;
2633           for (ll = 1; ll <= i2; ll += 256)
2634             {
2635               /* Computing MIN */
2636               i3 = 256;
2637               i4 = k - ll + 1;
2638               lsec = min(i3,i4);
2639               ulsec = lsec - lsec % 2;
2640
2641               i3 = m;
2642               for (ii = 1; ii <= i3; ii += 256)
2643                 {
2644                   /* Computing MIN */
2645                   i4 = 256;
2646                   i5 = m - ii + 1;
2647                   isec = min(i4,i5);
2648                   uisec = isec - isec % 2;
2649                   i4 = ll + ulsec - 1;
2650                   for (l = ll; l <= i4; l += 2)
2651                     {
2652                       i5 = ii + uisec - 1;
2653                       for (i = ii; i <= i5; i += 2)
2654                         {
2655                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2656                                         a[i + l * a_dim1];
2657                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2658                                         a[i + (l + 1) * a_dim1];
2659                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2660                                         a[i + 1 + l * a_dim1];
2661                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2662                                         a[i + 1 + (l + 1) * a_dim1];
2663                         }
2664                       if (uisec < isec)
2665                         {
2666                           t1[l - ll + 1 + (isec << 8) - 257] =
2667                                     a[ii + isec - 1 + l * a_dim1];
2668                           t1[l - ll + 2 + (isec << 8) - 257] =
2669                                     a[ii + isec - 1 + (l + 1) * a_dim1];
2670                         }
2671                     }
2672                   if (ulsec < lsec)
2673                     {
2674                       i4 = ii + isec - 1;
2675                       for (i = ii; i<= i4; ++i)
2676                         {
2677                           t1[lsec + ((i - ii + 1) << 8) - 257] =
2678                                     a[i + (ll + lsec - 1) * a_dim1];
2679                         }
2680                     }
2681
2682                   uisec = isec - isec % 4;
2683                   i4 = jj + ujsec - 1;
2684                   for (j = jj; j <= i4; j += 4)
2685                     {
2686                       i5 = ii + uisec - 1;
2687                       for (i = ii; i <= i5; i += 4)
2688                         {
2689                           f11 = c[i + j * c_dim1];
2690                           f21 = c[i + 1 + j * c_dim1];
2691                           f12 = c[i + (j + 1) * c_dim1];
2692                           f22 = c[i + 1 + (j + 1) * c_dim1];
2693                           f13 = c[i + (j + 2) * c_dim1];
2694                           f23 = c[i + 1 + (j + 2) * c_dim1];
2695                           f14 = c[i + (j + 3) * c_dim1];
2696                           f24 = c[i + 1 + (j + 3) * c_dim1];
2697                           f31 = c[i + 2 + j * c_dim1];
2698                           f41 = c[i + 3 + j * c_dim1];
2699                           f32 = c[i + 2 + (j + 1) * c_dim1];
2700                           f42 = c[i + 3 + (j + 1) * c_dim1];
2701                           f33 = c[i + 2 + (j + 2) * c_dim1];
2702                           f43 = c[i + 3 + (j + 2) * c_dim1];
2703                           f34 = c[i + 2 + (j + 3) * c_dim1];
2704                           f44 = c[i + 3 + (j + 3) * c_dim1];
2705                           i6 = ll + lsec - 1;
2706                           for (l = ll; l <= i6; ++l)
2707                             {
2708                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2709                                       * b[l + j * b_dim1];
2710                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2711                                       * b[l + j * b_dim1];
2712                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2713                                       * b[l + (j + 1) * b_dim1];
2714                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2715                                       * b[l + (j + 1) * b_dim1];
2716                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2717                                       * b[l + (j + 2) * b_dim1];
2718                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2719                                       * b[l + (j + 2) * b_dim1];
2720                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2721                                       * b[l + (j + 3) * b_dim1];
2722                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2723                                       * b[l + (j + 3) * b_dim1];
2724                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2725                                       * b[l + j * b_dim1];
2726                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2727                                       * b[l + j * b_dim1];
2728                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2729                                       * b[l + (j + 1) * b_dim1];
2730                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2731                                       * b[l + (j + 1) * b_dim1];
2732                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2733                                       * b[l + (j + 2) * b_dim1];
2734                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2735                                       * b[l + (j + 2) * b_dim1];
2736                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2737                                       * b[l + (j + 3) * b_dim1];
2738                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2739                                       * b[l + (j + 3) * b_dim1];
2740                             }
2741                           c[i + j * c_dim1] = f11;
2742                           c[i + 1 + j * c_dim1] = f21;
2743                           c[i + (j + 1) * c_dim1] = f12;
2744                           c[i + 1 + (j + 1) * c_dim1] = f22;
2745                           c[i + (j + 2) * c_dim1] = f13;
2746                           c[i + 1 + (j + 2) * c_dim1] = f23;
2747                           c[i + (j + 3) * c_dim1] = f14;
2748                           c[i + 1 + (j + 3) * c_dim1] = f24;
2749                           c[i + 2 + j * c_dim1] = f31;
2750                           c[i + 3 + j * c_dim1] = f41;
2751                           c[i + 2 + (j + 1) * c_dim1] = f32;
2752                           c[i + 3 + (j + 1) * c_dim1] = f42;
2753                           c[i + 2 + (j + 2) * c_dim1] = f33;
2754                           c[i + 3 + (j + 2) * c_dim1] = f43;
2755                           c[i + 2 + (j + 3) * c_dim1] = f34;
2756                           c[i + 3 + (j + 3) * c_dim1] = f44;
2757                         }
2758                       if (uisec < isec)
2759                         {
2760                           i5 = ii + isec - 1;
2761                           for (i = ii + uisec; i <= i5; ++i)
2762                             {
2763                               f11 = c[i + j * c_dim1];
2764                               f12 = c[i + (j + 1) * c_dim1];
2765                               f13 = c[i + (j + 2) * c_dim1];
2766                               f14 = c[i + (j + 3) * c_dim1];
2767                               i6 = ll + lsec - 1;
2768                               for (l = ll; l <= i6; ++l)
2769                                 {
2770                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2771                                           257] * b[l + j * b_dim1];
2772                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2773                                           257] * b[l + (j + 1) * b_dim1];
2774                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2775                                           257] * b[l + (j + 2) * b_dim1];
2776                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2777                                           257] * b[l + (j + 3) * b_dim1];
2778                                 }
2779                               c[i + j * c_dim1] = f11;
2780                               c[i + (j + 1) * c_dim1] = f12;
2781                               c[i + (j + 2) * c_dim1] = f13;
2782                               c[i + (j + 3) * c_dim1] = f14;
2783                             }
2784                         }
2785                     }
2786                   if (ujsec < jsec)
2787                     {
2788                       i4 = jj + jsec - 1;
2789                       for (j = jj + ujsec; j <= i4; ++j)
2790                         {
2791                           i5 = ii + uisec - 1;
2792                           for (i = ii; i <= i5; i += 4)
2793                             {
2794                               f11 = c[i + j * c_dim1];
2795                               f21 = c[i + 1 + j * c_dim1];
2796                               f31 = c[i + 2 + j * c_dim1];
2797                               f41 = c[i + 3 + j * c_dim1];
2798                               i6 = ll + lsec - 1;
2799                               for (l = ll; l <= i6; ++l)
2800                                 {
2801                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2802                                           257] * b[l + j * b_dim1];
2803                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2804                                           257] * b[l + j * b_dim1];
2805                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2806                                           257] * b[l + j * b_dim1];
2807                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2808                                           257] * b[l + j * b_dim1];
2809                                 }
2810                               c[i + j * c_dim1] = f11;
2811                               c[i + 1 + j * c_dim1] = f21;
2812                               c[i + 2 + j * c_dim1] = f31;
2813                               c[i + 3 + j * c_dim1] = f41;
2814                             }
2815                           i5 = ii + isec - 1;
2816                           for (i = ii + uisec; i <= i5; ++i)
2817                             {
2818                               f11 = c[i + j * c_dim1];
2819                               i6 = ll + lsec - 1;
2820                               for (l = ll; l <= i6; ++l)
2821                                 {
2822                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2823                                           257] * b[l + j * b_dim1];
2824                                 }
2825                               c[i + j * c_dim1] = f11;
2826                             }
2827                         }
2828                     }
2829                 }
2830             }
2831         }
2832       free(t1);
2833       return;
2834     }
2835   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2836     {
2837       if (GFC_DESCRIPTOR_RANK (a) != 1)
2838         {
2839           const GFC_COMPLEX_4 *restrict abase_x;
2840           const GFC_COMPLEX_4 *restrict bbase_y;
2841           GFC_COMPLEX_4 *restrict dest_y;
2842           GFC_COMPLEX_4 s;
2843
2844           for (y = 0; y < ycount; y++)
2845             {
2846               bbase_y = &bbase[y*bystride];
2847               dest_y = &dest[y*rystride];
2848               for (x = 0; x < xcount; x++)
2849                 {
2850                   abase_x = &abase[x*axstride];
2851                   s = (GFC_COMPLEX_4) 0;
2852                   for (n = 0; n < count; n++)
2853                     s += abase_x[n] * bbase_y[n];
2854                   dest_y[x] = s;
2855                 }
2856             }
2857         }
2858       else
2859         {
2860           const GFC_COMPLEX_4 *restrict bbase_y;
2861           GFC_COMPLEX_4 s;
2862
2863           for (y = 0; y < ycount; y++)
2864             {
2865               bbase_y = &bbase[y*bystride];
2866               s = (GFC_COMPLEX_4) 0;
2867               for (n = 0; n < count; n++)
2868                 s += abase[n*axstride] * bbase_y[n];
2869               dest[y*rystride] = s;
2870             }
2871         }
2872     }
2873   else if (axstride < aystride)
2874     {
2875       for (y = 0; y < ycount; y++)
2876         for (x = 0; x < xcount; x++)
2877           dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0;
2878
2879       for (y = 0; y < ycount; y++)
2880         for (n = 0; n < count; n++)
2881           for (x = 0; x < xcount; x++)
2882             /* dest[x,y] += a[x,n] * b[n,y] */
2883             dest[x*rxstride + y*rystride] +=
2884                                         abase[x*axstride + n*aystride] *
2885                                         bbase[n*bxstride + y*bystride];
2886     }
2887   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2888     {
2889       const GFC_COMPLEX_4 *restrict bbase_y;
2890       GFC_COMPLEX_4 s;
2891
2892       for (y = 0; y < ycount; y++)
2893         {
2894           bbase_y = &bbase[y*bystride];
2895           s = (GFC_COMPLEX_4) 0;
2896           for (n = 0; n < count; n++)
2897             s += abase[n*axstride] * bbase_y[n*bxstride];
2898           dest[y*rxstride] = s;
2899         }
2900     }
2901   else
2902     {
2903       const GFC_COMPLEX_4 *restrict abase_x;
2904       const GFC_COMPLEX_4 *restrict bbase_y;
2905       GFC_COMPLEX_4 *restrict dest_y;
2906       GFC_COMPLEX_4 s;
2907
2908       for (y = 0; y < ycount; y++)
2909         {
2910           bbase_y = &bbase[y*bystride];
2911           dest_y = &dest[y*rystride];
2912           for (x = 0; x < xcount; x++)
2913             {
2914               abase_x = &abase[x*axstride];
2915               s = (GFC_COMPLEX_4) 0;
2916               for (n = 0; n < count; n++)
2917                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2918               dest_y[x*rxstride] = s;
2919             }
2920         }
2921     }
2922 }
2923 #undef POW3
2924 #undef min
2925 #undef max
2926
2927 #endif
2928 #endif
2929