libgfortran/generated/matmulavx128_r17.c

   1 /* Implementation of the MATMUL intrinsic
   2    Copyright (C) 2002-2024 Free Software Foundation, Inc.
   3    Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
   4
   5 This file is part of the GNU Fortran runtime library (libgfortran).
   6
   7 Libgfortran is free software; you can redistribute it and/or
   8 modify it under the terms of the GNU General Public
   9 License as published by the Free Software Foundation; either
  10 version 3 of the License, or (at your option) any later version.
  11
  12 Libgfortran is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 Under Section 7 of GPL version 3, you are granted additional
  18 permissions described in the GCC Runtime Library Exception, version
  19 3.1, as published by the Free Software Foundation.
  20
  21 You should have received a copy of the GNU General Public License and
  22 a copy of the GCC Runtime Library Exception along with this program;
  23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24 <http://www.gnu.org/licenses/>.  */
  25
  26 #include "libgfortran.h"
  27 #include <string.h>
  28 #include <assert.h>
  29
  30
  31 /* These are the specific versions of matmul with -mprefer-avx128.  */
  32
  33 #if defined (HAVE_GFC_REAL_17)
  34
  35 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
  36    passed to us by the front-end, in which case we call it for large
  37    matrices.  */
  38
  39 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
  40                           const int *, const GFC_REAL_17 *, const GFC_REAL_17 *,
  41                           const int *, const GFC_REAL_17 *, const int *,
  42                           const GFC_REAL_17 *, GFC_REAL_17 *, const int *,
  43                           int, int);
  44
  45 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
  46 void
  47 matmul_r17_avx128_fma3 (gfc_array_r17 * const restrict retarray,
  48         gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
  49         int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
  50 internal_proto(matmul_r17_avx128_fma3);
  51 void
  52 matmul_r17_avx128_fma3 (gfc_array_r17 * const restrict retarray,
  53         gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
  54         int blas_limit, blas_call gemm)
  55 {
  56   const GFC_REAL_17 * restrict abase;
  57   const GFC_REAL_17 * restrict bbase;
  58   GFC_REAL_17 * restrict dest;
  59
  60   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  61   index_type x, y, n, count, xcount, ycount;
  62
  63   assert (GFC_DESCRIPTOR_RANK (a) == 2
  64           || GFC_DESCRIPTOR_RANK (b) == 2);
  65
  66 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
  67
  68    Either A or B (but not both) can be rank 1:
  69
  70    o One-dimensional argument A is implicitly treated as a row matrix
  71      dimensioned [1,count], so xcount=1.
  72
  73    o One-dimensional argument B is implicitly treated as a column matrix
  74      dimensioned [count, 1], so ycount=1.
  75 */
  76
  77   if (retarray->base_addr == NULL)
  78     {
  79       if (GFC_DESCRIPTOR_RANK (a) == 1)
  80         {
  81           GFC_DIMENSION_SET(retarray->dim[0], 0,
  82                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
  83         }
  84       else if (GFC_DESCRIPTOR_RANK (b) == 1)
  85         {
  86           GFC_DIMENSION_SET(retarray->dim[0], 0,
  87                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
  88         }
  89       else
  90         {
  91           GFC_DIMENSION_SET(retarray->dim[0], 0,
  92                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
  93
  94           GFC_DIMENSION_SET(retarray->dim[1], 0,
  95                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
  96                             GFC_DESCRIPTOR_EXTENT(retarray,0));
  97         }
  98
  99       retarray->base_addr
 100         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_17));
 101       retarray->offset = 0;
 102     }
 103   else if (unlikely (compile_options.bounds_check))
 104     {
 105       index_type ret_extent, arg_extent;
 106
 107       if (GFC_DESCRIPTOR_RANK (a) == 1)
 108         {
 109           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 110           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 111           if (arg_extent != ret_extent)
 112             runtime_error ("Array bound mismatch for dimension 1 of "
 113                            "array (%ld/%ld) ",
 114                            (long int) ret_extent, (long int) arg_extent);
 115         }
 116       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 117         {
 118           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 119           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 120           if (arg_extent != ret_extent)
 121             runtime_error ("Array bound mismatch for dimension 1 of "
 122                            "array (%ld/%ld) ",
 123                            (long int) ret_extent, (long int) arg_extent);
 124         }
 125       else
 126         {
 127           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 128           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 129           if (arg_extent != ret_extent)
 130             runtime_error ("Array bound mismatch for dimension 1 of "
 131                            "array (%ld/%ld) ",
 132                            (long int) ret_extent, (long int) arg_extent);
 133
 134           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 135           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 136           if (arg_extent != ret_extent)
 137             runtime_error ("Array bound mismatch for dimension 2 of "
 138                            "array (%ld/%ld) ",
 139                            (long int) ret_extent, (long int) arg_extent);
 140         }
 141     }
 142
 143
 144   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 145     {
 146       /* One-dimensional result may be addressed in the code below
 147          either as a row or a column matrix. We want both cases to
 148          work. */
 149       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 150     }
 151   else
 152     {
 153       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 154       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 155     }
 156
 157
 158   if (GFC_DESCRIPTOR_RANK (a) == 1)
 159     {
 160       /* Treat it as a a row matrix A[1,count]. */
 161       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 162       aystride = 1;
 163
 164       xcount = 1;
 165       count = GFC_DESCRIPTOR_EXTENT(a,0);
 166     }
 167   else
 168     {
 169       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 170       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 171
 172       count = GFC_DESCRIPTOR_EXTENT(a,1);
 173       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 174     }
 175
 176   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 177     {
 178       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 179         runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
 180                        "in dimension 1: is %ld, should be %ld",
 181                        (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
 182     }
 183
 184   if (GFC_DESCRIPTOR_RANK (b) == 1)
 185     {
 186       /* Treat it as a column matrix B[count,1] */
 187       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 188
 189       /* bystride should never be used for 1-dimensional b.
 190          The value is only used for calculation of the
 191          memory by the buffer.  */
 192       bystride = 256;
 193       ycount = 1;
 194     }
 195   else
 196     {
 197       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 198       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 199       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 200     }
 201
 202   abase = a->base_addr;
 203   bbase = b->base_addr;
 204   dest = retarray->base_addr;
 205
 206   /* Now that everything is set up, we perform the multiplication
 207      itself.  */
 208
 209 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 210 #define min(a,b) ((a) <= (b) ? (a) : (b))
 211 #define max(a,b) ((a) >= (b) ? (a) : (b))
 212
 213   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 214       && (bxstride == 1 || bystride == 1)
 215       && (((float) xcount) * ((float) ycount) * ((float) count)
 216           > POW3(blas_limit)))
 217     {
 218       const int m = xcount, n = ycount, k = count, ldc = rystride;
 219       const GFC_REAL_17 one = 1, zero = 0;
 220       const int lda = (axstride == 1) ? aystride : axstride,
 221                 ldb = (bxstride == 1) ? bystride : bxstride;
 222
 223       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 224         {
 225           assert (gemm != NULL);
 226           const char *transa, *transb;
 227           if (try_blas & 2)
 228             transa = "C";
 229           else
 230             transa = axstride == 1 ? "N" : "T";
 231
 232           if (try_blas & 4)
 233             transb = "C";
 234           else
 235             transb = bxstride == 1 ? "N" : "T";
 236
 237           gemm (transa, transb , &m,
 238                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 239                 &ldc, 1, 1);
 240           return;
 241         }
 242     }
 243
 244   if (rxstride == 1 && axstride == 1 && bxstride == 1
 245       && GFC_DESCRIPTOR_RANK (b) != 1)
 246     {
 247       /* This block of code implements a tuned matmul, derived from
 248          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 249
 250                Bo Kagstrom and Per Ling
 251                Department of Computing Science
 252                Umea University
 253                S-901 87 Umea, Sweden
 254
 255          from netlib.org, translated to C, and modified for matmul.m4.  */
 256
 257       const GFC_REAL_17 *a, *b;
 258       GFC_REAL_17 *c;
 259       const index_type m = xcount, n = ycount, k = count;
 260
 261       /* System generated locals */
 262       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 263                  i1, i2, i3, i4, i5, i6;
 264
 265       /* Local variables */
 266       GFC_REAL_17 f11, f12, f21, f22, f31, f32, f41, f42,
 267                  f13, f14, f23, f24, f33, f34, f43, f44;
 268       index_type i, j, l, ii, jj, ll;
 269       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 270       GFC_REAL_17 *t1;
 271
 272       a = abase;
 273       b = bbase;
 274       c = retarray->base_addr;
 275
 276       /* Parameter adjustments */
 277       c_dim1 = rystride;
 278       c_offset = 1 + c_dim1;
 279       c -= c_offset;
 280       a_dim1 = aystride;
 281       a_offset = 1 + a_dim1;
 282       a -= a_offset;
 283       b_dim1 = bystride;
 284       b_offset = 1 + b_dim1;
 285       b -= b_offset;
 286
 287       /* Empty c first.  */
 288       for (j=1; j<=n; j++)
 289         for (i=1; i<=m; i++)
 290           c[i + j * c_dim1] = (GFC_REAL_17)0;
 291
 292       /* Early exit if possible */
 293       if (m == 0 || n == 0 || k == 0)
 294         return;
 295
 296       /* Adjust size of t1 to what is needed.  */
 297       index_type t1_dim, a_sz;
 298       if (aystride == 1)
 299         a_sz = rystride;
 300       else
 301         a_sz = a_dim1;
 302
 303       t1_dim = a_sz * 256 + b_dim1;
 304       if (t1_dim > 65536)
 305         t1_dim = 65536;
 306
 307       t1 = malloc (t1_dim * sizeof(GFC_REAL_17));
 308
 309       /* Start turning the crank. */
 310       i1 = n;
 311       for (jj = 1; jj <= i1; jj += 512)
 312         {
 313           /* Computing MIN */
 314           i2 = 512;
 315           i3 = n - jj + 1;
 316           jsec = min(i2,i3);
 317           ujsec = jsec - jsec % 4;
 318           i2 = k;
 319           for (ll = 1; ll <= i2; ll += 256)
 320             {
 321               /* Computing MIN */
 322               i3 = 256;
 323               i4 = k - ll + 1;
 324               lsec = min(i3,i4);
 325               ulsec = lsec - lsec % 2;
 326
 327               i3 = m;
 328               for (ii = 1; ii <= i3; ii += 256)
 329                 {
 330                   /* Computing MIN */
 331                   i4 = 256;
 332                   i5 = m - ii + 1;
 333                   isec = min(i4,i5);
 334                   uisec = isec - isec % 2;
 335                   i4 = ll + ulsec - 1;
 336                   for (l = ll; l <= i4; l += 2)
 337                     {
 338                       i5 = ii + uisec - 1;
 339                       for (i = ii; i <= i5; i += 2)
 340                         {
 341                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 342                                         a[i + l * a_dim1];
 343                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 344                                         a[i + (l + 1) * a_dim1];
 345                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 346                                         a[i + 1 + l * a_dim1];
 347                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 348                                         a[i + 1 + (l + 1) * a_dim1];
 349                         }
 350                       if (uisec < isec)
 351                         {
 352                           t1[l - ll + 1 + (isec << 8) - 257] =
 353                                     a[ii + isec - 1 + l * a_dim1];
 354                           t1[l - ll + 2 + (isec << 8) - 257] =
 355                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 356                         }
 357                     }
 358                   if (ulsec < lsec)
 359                     {
 360                       i4 = ii + isec - 1;
 361                       for (i = ii; i<= i4; ++i)
 362                         {
 363                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 364                                     a[i + (ll + lsec - 1) * a_dim1];
 365                         }
 366                     }
 367
 368                   uisec = isec - isec % 4;
 369                   i4 = jj + ujsec - 1;
 370                   for (j = jj; j <= i4; j += 4)
 371                     {
 372                       i5 = ii + uisec - 1;
 373                       for (i = ii; i <= i5; i += 4)
 374                         {
 375                           f11 = c[i + j * c_dim1];
 376                           f21 = c[i + 1 + j * c_dim1];
 377                           f12 = c[i + (j + 1) * c_dim1];
 378                           f22 = c[i + 1 + (j + 1) * c_dim1];
 379                           f13 = c[i + (j + 2) * c_dim1];
 380                           f23 = c[i + 1 + (j + 2) * c_dim1];
 381                           f14 = c[i + (j + 3) * c_dim1];
 382                           f24 = c[i + 1 + (j + 3) * c_dim1];
 383                           f31 = c[i + 2 + j * c_dim1];
 384                           f41 = c[i + 3 + j * c_dim1];
 385                           f32 = c[i + 2 + (j + 1) * c_dim1];
 386                           f42 = c[i + 3 + (j + 1) * c_dim1];
 387                           f33 = c[i + 2 + (j + 2) * c_dim1];
 388                           f43 = c[i + 3 + (j + 2) * c_dim1];
 389                           f34 = c[i + 2 + (j + 3) * c_dim1];
 390                           f44 = c[i + 3 + (j + 3) * c_dim1];
 391                           i6 = ll + lsec - 1;
 392                           for (l = ll; l <= i6; ++l)
 393                             {
 394                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 395                                       * b[l + j * b_dim1];
 396                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 397                                       * b[l + j * b_dim1];
 398                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 399                                       * b[l + (j + 1) * b_dim1];
 400                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 401                                       * b[l + (j + 1) * b_dim1];
 402                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 403                                       * b[l + (j + 2) * b_dim1];
 404                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 405                                       * b[l + (j + 2) * b_dim1];
 406                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 407                                       * b[l + (j + 3) * b_dim1];
 408                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 409                                       * b[l + (j + 3) * b_dim1];
 410                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 411                                       * b[l + j * b_dim1];
 412                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 413                                       * b[l + j * b_dim1];
 414                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 415                                       * b[l + (j + 1) * b_dim1];
 416                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 417                                       * b[l + (j + 1) * b_dim1];
 418                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 419                                       * b[l + (j + 2) * b_dim1];
 420                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 421                                       * b[l + (j + 2) * b_dim1];
 422                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 423                                       * b[l + (j + 3) * b_dim1];
 424                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 425                                       * b[l + (j + 3) * b_dim1];
 426                             }
 427                           c[i + j * c_dim1] = f11;
 428                           c[i + 1 + j * c_dim1] = f21;
 429                           c[i + (j + 1) * c_dim1] = f12;
 430                           c[i + 1 + (j + 1) * c_dim1] = f22;
 431                           c[i + (j + 2) * c_dim1] = f13;
 432                           c[i + 1 + (j + 2) * c_dim1] = f23;
 433                           c[i + (j + 3) * c_dim1] = f14;
 434                           c[i + 1 + (j + 3) * c_dim1] = f24;
 435                           c[i + 2 + j * c_dim1] = f31;
 436                           c[i + 3 + j * c_dim1] = f41;
 437                           c[i + 2 + (j + 1) * c_dim1] = f32;
 438                           c[i + 3 + (j + 1) * c_dim1] = f42;
 439                           c[i + 2 + (j + 2) * c_dim1] = f33;
 440                           c[i + 3 + (j + 2) * c_dim1] = f43;
 441                           c[i + 2 + (j + 3) * c_dim1] = f34;
 442                           c[i + 3 + (j + 3) * c_dim1] = f44;
 443                         }
 444                       if (uisec < isec)
 445                         {
 446                           i5 = ii + isec - 1;
 447                           for (i = ii + uisec; i <= i5; ++i)
 448                             {
 449                               f11 = c[i + j * c_dim1];
 450                               f12 = c[i + (j + 1) * c_dim1];
 451                               f13 = c[i + (j + 2) * c_dim1];
 452                               f14 = c[i + (j + 3) * c_dim1];
 453                               i6 = ll + lsec - 1;
 454                               for (l = ll; l <= i6; ++l)
 455                                 {
 456                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 457                                           257] * b[l + j * b_dim1];
 458                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 459                                           257] * b[l + (j + 1) * b_dim1];
 460                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 461                                           257] * b[l + (j + 2) * b_dim1];
 462                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 463                                           257] * b[l + (j + 3) * b_dim1];
 464                                 }
 465                               c[i + j * c_dim1] = f11;
 466                               c[i + (j + 1) * c_dim1] = f12;
 467                               c[i + (j + 2) * c_dim1] = f13;
 468                               c[i + (j + 3) * c_dim1] = f14;
 469                             }
 470                         }
 471                     }
 472                   if (ujsec < jsec)
 473                     {
 474                       i4 = jj + jsec - 1;
 475                       for (j = jj + ujsec; j <= i4; ++j)
 476                         {
 477                           i5 = ii + uisec - 1;
 478                           for (i = ii; i <= i5; i += 4)
 479                             {
 480                               f11 = c[i + j * c_dim1];
 481                               f21 = c[i + 1 + j * c_dim1];
 482                               f31 = c[i + 2 + j * c_dim1];
 483                               f41 = c[i + 3 + j * c_dim1];
 484                               i6 = ll + lsec - 1;
 485                               for (l = ll; l <= i6; ++l)
 486                                 {
 487                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 488                                           257] * b[l + j * b_dim1];
 489                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
 490                                           257] * b[l + j * b_dim1];
 491                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
 492                                           257] * b[l + j * b_dim1];
 493                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
 494                                           257] * b[l + j * b_dim1];
 495                                 }
 496                               c[i + j * c_dim1] = f11;
 497                               c[i + 1 + j * c_dim1] = f21;
 498                               c[i + 2 + j * c_dim1] = f31;
 499                               c[i + 3 + j * c_dim1] = f41;
 500                             }
 501                           i5 = ii + isec - 1;
 502                           for (i = ii + uisec; i <= i5; ++i)
 503                             {
 504                               f11 = c[i + j * c_dim1];
 505                               i6 = ll + lsec - 1;
 506                               for (l = ll; l <= i6; ++l)
 507                                 {
 508                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 509                                           257] * b[l + j * b_dim1];
 510                                 }
 511                               c[i + j * c_dim1] = f11;
 512                             }
 513                         }
 514                     }
 515                 }
 516             }
 517         }
 518       free(t1);
 519       return;
 520     }
 521   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
 522     {
 523       if (GFC_DESCRIPTOR_RANK (a) != 1)
 524         {
 525           const GFC_REAL_17 *restrict abase_x;
 526           const GFC_REAL_17 *restrict bbase_y;
 527           GFC_REAL_17 *restrict dest_y;
 528           GFC_REAL_17 s;
 529
 530           for (y = 0; y < ycount; y++)
 531             {
 532               bbase_y = &bbase[y*bystride];
 533               dest_y = &dest[y*rystride];
 534               for (x = 0; x < xcount; x++)
 535                 {
 536                   abase_x = &abase[x*axstride];
 537                   s = (GFC_REAL_17) 0;
 538                   for (n = 0; n < count; n++)
 539                     s += abase_x[n] * bbase_y[n];
 540                   dest_y[x] = s;
 541                 }
 542             }
 543         }
 544       else
 545         {
 546           const GFC_REAL_17 *restrict bbase_y;
 547           GFC_REAL_17 s;
 548
 549           for (y = 0; y < ycount; y++)
 550             {
 551               bbase_y = &bbase[y*bystride];
 552               s = (GFC_REAL_17) 0;
 553               for (n = 0; n < count; n++)
 554                 s += abase[n*axstride] * bbase_y[n];
 555               dest[y*rystride] = s;
 556             }
 557         }
 558     }
 559   else if (GFC_DESCRIPTOR_RANK (a) == 1)
 560     {
 561       const GFC_REAL_17 *restrict bbase_y;
 562       GFC_REAL_17 s;
 563
 564       for (y = 0; y < ycount; y++)
 565         {
 566           bbase_y = &bbase[y*bystride];
 567           s = (GFC_REAL_17) 0;
 568           for (n = 0; n < count; n++)
 569             s += abase[n*axstride] * bbase_y[n*bxstride];
 570           dest[y*rxstride] = s;
 571         }
 572     }
 573   else if (axstride < aystride)
 574     {
 575       for (y = 0; y < ycount; y++)
 576         for (x = 0; x < xcount; x++)
 577           dest[x*rxstride + y*rystride] = (GFC_REAL_17)0;
 578
 579       for (y = 0; y < ycount; y++)
 580         for (n = 0; n < count; n++)
 581           for (x = 0; x < xcount; x++)
 582             /* dest[x,y] += a[x,n] * b[n,y] */
 583             dest[x*rxstride + y*rystride] +=
 584                                         abase[x*axstride + n*aystride] *
 585                                         bbase[n*bxstride + y*bystride];
 586     }
 587   else
 588     {
 589       const GFC_REAL_17 *restrict abase_x;
 590       const GFC_REAL_17 *restrict bbase_y;
 591       GFC_REAL_17 *restrict dest_y;
 592       GFC_REAL_17 s;
 593
 594       for (y = 0; y < ycount; y++)
 595         {
 596           bbase_y = &bbase[y*bystride];
 597           dest_y = &dest[y*rystride];
 598           for (x = 0; x < xcount; x++)
 599             {
 600               abase_x = &abase[x*axstride];
 601               s = (GFC_REAL_17) 0;
 602               for (n = 0; n < count; n++)
 603                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
 604               dest_y[x*rxstride] = s;
 605             }
 606         }
 607     }
 608 }
 609 #undef POW3
 610 #undef min
 611 #undef max
 612
 613 #endif
 614
 615 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
 616 void
 617 matmul_r17_avx128_fma4 (gfc_array_r17 * const restrict retarray,
 618         gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
 619         int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
 620 internal_proto(matmul_r17_avx128_fma4);
 621 void
 622 matmul_r17_avx128_fma4 (gfc_array_r17 * const restrict retarray,
 623         gfc_array_r17 * const restrict a, gfc_array_r17 * const restrict b, int try_blas,
 624         int blas_limit, blas_call gemm)
 625 {
 626   const GFC_REAL_17 * restrict abase;
 627   const GFC_REAL_17 * restrict bbase;
 628   GFC_REAL_17 * restrict dest;
 629
 630   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
 631   index_type x, y, n, count, xcount, ycount;
 632
 633   assert (GFC_DESCRIPTOR_RANK (a) == 2
 634           || GFC_DESCRIPTOR_RANK (b) == 2);
 635
 636 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
 637
 638    Either A or B (but not both) can be rank 1:
 639
 640    o One-dimensional argument A is implicitly treated as a row matrix
 641      dimensioned [1,count], so xcount=1.
 642
 643    o One-dimensional argument B is implicitly treated as a column matrix
 644      dimensioned [count, 1], so ycount=1.
 645 */
 646
 647   if (retarray->base_addr == NULL)
 648     {
 649       if (GFC_DESCRIPTOR_RANK (a) == 1)
 650         {
 651           GFC_DIMENSION_SET(retarray->dim[0], 0,
 652                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 653         }
 654       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 655         {
 656           GFC_DIMENSION_SET(retarray->dim[0], 0,
 657                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 658         }
 659       else
 660         {
 661           GFC_DIMENSION_SET(retarray->dim[0], 0,
 662                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 663
 664           GFC_DIMENSION_SET(retarray->dim[1], 0,
 665                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 666                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 667         }
 668
 669       retarray->base_addr
 670         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_17));
 671       retarray->offset = 0;
 672     }
 673   else if (unlikely (compile_options.bounds_check))
 674     {
 675       index_type ret_extent, arg_extent;
 676
 677       if (GFC_DESCRIPTOR_RANK (a) == 1)
 678         {
 679           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 680           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 681           if (arg_extent != ret_extent)
 682             runtime_error ("Array bound mismatch for dimension 1 of "
 683                            "array (%ld/%ld) ",
 684                            (long int) ret_extent, (long int) arg_extent);
 685         }
 686       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 687         {
 688           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 689           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 690           if (arg_extent != ret_extent)
 691             runtime_error ("Array bound mismatch for dimension 1 of "
 692                            "array (%ld/%ld) ",
 693                            (long int) ret_extent, (long int) arg_extent);
 694         }
 695       else
 696         {
 697           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 698           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 699           if (arg_extent != ret_extent)
 700             runtime_error ("Array bound mismatch for dimension 1 of "
 701                            "array (%ld/%ld) ",
 702                            (long int) ret_extent, (long int) arg_extent);
 703
 704           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 705           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 706           if (arg_extent != ret_extent)
 707             runtime_error ("Array bound mismatch for dimension 2 of "
 708                            "array (%ld/%ld) ",
 709                            (long int) ret_extent, (long int) arg_extent);
 710         }
 711     }
 712
 713
 714   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 715     {
 716       /* One-dimensional result may be addressed in the code below
 717          either as a row or a column matrix. We want both cases to
 718          work. */
 719       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 720     }
 721   else
 722     {
 723       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 724       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 725     }
 726
 727
 728   if (GFC_DESCRIPTOR_RANK (a) == 1)
 729     {
 730       /* Treat it as a a row matrix A[1,count]. */
 731       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 732       aystride = 1;
 733
 734       xcount = 1;
 735       count = GFC_DESCRIPTOR_EXTENT(a,0);
 736     }
 737   else
 738     {
 739       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 740       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 741
 742       count = GFC_DESCRIPTOR_EXTENT(a,1);
 743       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 744     }
 745
 746   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 747     {
 748       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 749         runtime_error ("Incorrect extent in argument B in MATMUL intrinsic "
 750                        "in dimension 1: is %ld, should be %ld",
 751                        (long int) GFC_DESCRIPTOR_EXTENT(b,0), (long int) count);
 752     }
 753
 754   if (GFC_DESCRIPTOR_RANK (b) == 1)
 755     {
 756       /* Treat it as a column matrix B[count,1] */
 757       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 758
 759       /* bystride should never be used for 1-dimensional b.
 760          The value is only used for calculation of the
 761          memory by the buffer.  */
 762       bystride = 256;
 763       ycount = 1;
 764     }
 765   else
 766     {
 767       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 768       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 769       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 770     }
 771
 772   abase = a->base_addr;
 773   bbase = b->base_addr;
 774   dest = retarray->base_addr;
 775
 776   /* Now that everything is set up, we perform the multiplication
 777      itself.  */
 778
 779 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 780 #define min(a,b) ((a) <= (b) ? (a) : (b))
 781 #define max(a,b) ((a) >= (b) ? (a) : (b))
 782
 783   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 784       && (bxstride == 1 || bystride == 1)
 785       && (((float) xcount) * ((float) ycount) * ((float) count)
 786           > POW3(blas_limit)))
 787     {
 788       const int m = xcount, n = ycount, k = count, ldc = rystride;
 789       const GFC_REAL_17 one = 1, zero = 0;
 790       const int lda = (axstride == 1) ? aystride : axstride,
 791                 ldb = (bxstride == 1) ? bystride : bxstride;
 792
 793       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 794         {
 795           assert (gemm != NULL);
 796           const char *transa, *transb;
 797           if (try_blas & 2)
 798             transa = "C";
 799           else
 800             transa = axstride == 1 ? "N" : "T";
 801
 802           if (try_blas & 4)
 803             transb = "C";
 804           else
 805             transb = bxstride == 1 ? "N" : "T";
 806
 807           gemm (transa, transb , &m,
 808                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 809                 &ldc, 1, 1);
 810           return;
 811         }
 812     }
 813
 814   if (rxstride == 1 && axstride == 1 && bxstride == 1
 815       && GFC_DESCRIPTOR_RANK (b) != 1)
 816     {
 817       /* This block of code implements a tuned matmul, derived from
 818          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 819
 820                Bo Kagstrom and Per Ling
 821                Department of Computing Science
 822                Umea University
 823                S-901 87 Umea, Sweden
 824
 825          from netlib.org, translated to C, and modified for matmul.m4.  */
 826
 827       const GFC_REAL_17 *a, *b;
 828       GFC_REAL_17 *c;
 829       const index_type m = xcount, n = ycount, k = count;
 830
 831       /* System generated locals */
 832       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 833                  i1, i2, i3, i4, i5, i6;
 834
 835       /* Local variables */
 836       GFC_REAL_17 f11, f12, f21, f22, f31, f32, f41, f42,
 837                  f13, f14, f23, f24, f33, f34, f43, f44;
 838       index_type i, j, l, ii, jj, ll;
 839       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 840       GFC_REAL_17 *t1;
 841
 842       a = abase;
 843       b = bbase;
 844       c = retarray->base_addr;
 845
 846       /* Parameter adjustments */
 847       c_dim1 = rystride;
 848       c_offset = 1 + c_dim1;
 849       c -= c_offset;
 850       a_dim1 = aystride;
 851       a_offset = 1 + a_dim1;
 852       a -= a_offset;
 853       b_dim1 = bystride;
 854       b_offset = 1 + b_dim1;
 855       b -= b_offset;
 856
 857       /* Empty c first.  */
 858       for (j=1; j<=n; j++)
 859         for (i=1; i<=m; i++)
 860           c[i + j * c_dim1] = (GFC_REAL_17)0;
 861
 862       /* Early exit if possible */
 863       if (m == 0 || n == 0 || k == 0)
 864         return;
 865
 866       /* Adjust size of t1 to what is needed.  */
 867       index_type t1_dim, a_sz;
 868       if (aystride == 1)
 869         a_sz = rystride;
 870       else
 871         a_sz = a_dim1;
 872
 873       t1_dim = a_sz * 256 + b_dim1;
 874       if (t1_dim > 65536)
 875         t1_dim = 65536;
 876
 877       t1 = malloc (t1_dim * sizeof(GFC_REAL_17));
 878
 879       /* Start turning the crank. */
 880       i1 = n;
 881       for (jj = 1; jj <= i1; jj += 512)
 882         {
 883           /* Computing MIN */
 884           i2 = 512;
 885           i3 = n - jj + 1;
 886           jsec = min(i2,i3);
 887           ujsec = jsec - jsec % 4;
 888           i2 = k;
 889           for (ll = 1; ll <= i2; ll += 256)
 890             {
 891               /* Computing MIN */
 892               i3 = 256;
 893               i4 = k - ll + 1;
 894               lsec = min(i3,i4);
 895               ulsec = lsec - lsec % 2;
 896
 897               i3 = m;
 898               for (ii = 1; ii <= i3; ii += 256)
 899                 {
 900                   /* Computing MIN */
 901                   i4 = 256;
 902                   i5 = m - ii + 1;
 903                   isec = min(i4,i5);
 904                   uisec = isec - isec % 2;
 905                   i4 = ll + ulsec - 1;
 906                   for (l = ll; l <= i4; l += 2)
 907                     {
 908                       i5 = ii + uisec - 1;
 909                       for (i = ii; i <= i5; i += 2)
 910                         {
 911                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 912                                         a[i + l * a_dim1];
 913                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 914                                         a[i + (l + 1) * a_dim1];
 915                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 916                                         a[i + 1 + l * a_dim1];
 917                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 918                                         a[i + 1 + (l + 1) * a_dim1];
 919                         }
 920                       if (uisec < isec)
 921                         {
 922                           t1[l - ll + 1 + (isec << 8) - 257] =
 923                                     a[ii + isec - 1 + l * a_dim1];
 924                           t1[l - ll + 2 + (isec << 8) - 257] =
 925                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 926                         }
 927                     }
 928                   if (ulsec < lsec)
 929                     {
 930                       i4 = ii + isec - 1;
 931                       for (i = ii; i<= i4; ++i)
 932                         {
 933                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 934                                     a[i + (ll + lsec - 1) * a_dim1];
 935                         }
 936                     }
 937
 938                   uisec = isec - isec % 4;
 939                   i4 = jj + ujsec - 1;
 940                   for (j = jj; j <= i4; j += 4)
 941                     {
 942                       i5 = ii + uisec - 1;
 943                       for (i = ii; i <= i5; i += 4)
 944                         {
 945                           f11 = c[i + j * c_dim1];
 946                           f21 = c[i + 1 + j * c_dim1];
 947                           f12 = c[i + (j + 1) * c_dim1];
 948                           f22 = c[i + 1 + (j + 1) * c_dim1];
 949                           f13 = c[i + (j + 2) * c_dim1];
 950                           f23 = c[i + 1 + (j + 2) * c_dim1];
 951                           f14 = c[i + (j + 3) * c_dim1];
 952                           f24 = c[i + 1 + (j + 3) * c_dim1];
 953                           f31 = c[i + 2 + j * c_dim1];
 954                           f41 = c[i + 3 + j * c_dim1];
 955                           f32 = c[i + 2 + (j + 1) * c_dim1];
 956                           f42 = c[i + 3 + (j + 1) * c_dim1];
 957                           f33 = c[i + 2 + (j + 2) * c_dim1];
 958                           f43 = c[i + 3 + (j + 2) * c_dim1];
 959                           f34 = c[i + 2 + (j + 3) * c_dim1];
 960                           f44 = c[i + 3 + (j + 3) * c_dim1];
 961                           i6 = ll + lsec - 1;
 962                           for (l = ll; l <= i6; ++l)
 963                             {
 964                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 965                                       * b[l + j * b_dim1];
 966                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 967                                       * b[l + j * b_dim1];
 968                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 969                                       * b[l + (j + 1) * b_dim1];
 970                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 971                                       * b[l + (j + 1) * b_dim1];
 972                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 973                                       * b[l + (j + 2) * b_dim1];
 974                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 975                                       * b[l + (j + 2) * b_dim1];
 976                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 977                                       * b[l + (j + 3) * b_dim1];
 978                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 979                                       * b[l + (j + 3) * b_dim1];
 980                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 981                                       * b[l + j * b_dim1];
 982                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 983                                       * b[l + j * b_dim1];
 984                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 985                                       * b[l + (j + 1) * b_dim1];
 986                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 987                                       * b[l + (j + 1) * b_dim1];
 988                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 989                                       * b[l + (j + 2) * b_dim1];
 990                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 991                                       * b[l + (j + 2) * b_dim1];
 992                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 993                                       * b[l + (j + 3) * b_dim1];
 994                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 995                                       * b[l + (j + 3) * b_dim1];
 996                             }
 997                           c[i + j * c_dim1] = f11;
 998                           c[i + 1 + j * c_dim1] = f21;
 999                           c[i + (j + 1) * c_dim1] = f12;
1000                           c[i + 1 + (j + 1) * c_dim1] = f22;
1001                           c[i + (j + 2) * c_dim1] = f13;
1002                           c[i + 1 + (j + 2) * c_dim1] = f23;
1003                           c[i + (j + 3) * c_dim1] = f14;
1004                           c[i + 1 + (j + 3) * c_dim1] = f24;
1005                           c[i + 2 + j * c_dim1] = f31;
1006                           c[i + 3 + j * c_dim1] = f41;
1007                           c[i + 2 + (j + 1) * c_dim1] = f32;
1008                           c[i + 3 + (j + 1) * c_dim1] = f42;
1009                           c[i + 2 + (j + 2) * c_dim1] = f33;
1010                           c[i + 3 + (j + 2) * c_dim1] = f43;
1011                           c[i + 2 + (j + 3) * c_dim1] = f34;
1012                           c[i + 3 + (j + 3) * c_dim1] = f44;
1013                         }
1014                       if (uisec < isec)
1015                         {
1016                           i5 = ii + isec - 1;
1017                           for (i = ii + uisec; i <= i5; ++i)
1018                             {
1019                               f11 = c[i + j * c_dim1];
1020                               f12 = c[i + (j + 1) * c_dim1];
1021                               f13 = c[i + (j + 2) * c_dim1];
1022                               f14 = c[i + (j + 3) * c_dim1];
1023                               i6 = ll + lsec - 1;
1024                               for (l = ll; l <= i6; ++l)
1025                                 {
1026                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1027                                           257] * b[l + j * b_dim1];
1028                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1029                                           257] * b[l + (j + 1) * b_dim1];
1030                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1031                                           257] * b[l + (j + 2) * b_dim1];
1032                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1033                                           257] * b[l + (j + 3) * b_dim1];
1034                                 }
1035                               c[i + j * c_dim1] = f11;
1036                               c[i + (j + 1) * c_dim1] = f12;
1037                               c[i + (j + 2) * c_dim1] = f13;
1038                               c[i + (j + 3) * c_dim1] = f14;
1039                             }
1040                         }
1041                     }
1042                   if (ujsec < jsec)
1043                     {
1044                       i4 = jj + jsec - 1;
1045                       for (j = jj + ujsec; j <= i4; ++j)
1046                         {
1047                           i5 = ii + uisec - 1;
1048                           for (i = ii; i <= i5; i += 4)
1049                             {
1050                               f11 = c[i + j * c_dim1];
1051                               f21 = c[i + 1 + j * c_dim1];
1052                               f31 = c[i + 2 + j * c_dim1];
1053                               f41 = c[i + 3 + j * c_dim1];
1054                               i6 = ll + lsec - 1;
1055                               for (l = ll; l <= i6; ++l)
1056                                 {
1057                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1058                                           257] * b[l + j * b_dim1];
1059                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1060                                           257] * b[l + j * b_dim1];
1061                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1062                                           257] * b[l + j * b_dim1];
1063                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1064                                           257] * b[l + j * b_dim1];
1065                                 }
1066                               c[i + j * c_dim1] = f11;
1067                               c[i + 1 + j * c_dim1] = f21;
1068                               c[i + 2 + j * c_dim1] = f31;
1069                               c[i + 3 + j * c_dim1] = f41;
1070                             }
1071                           i5 = ii + isec - 1;
1072                           for (i = ii + uisec; i <= i5; ++i)
1073                             {
1074                               f11 = c[i + j * c_dim1];
1075                               i6 = ll + lsec - 1;
1076                               for (l = ll; l <= i6; ++l)
1077                                 {
1078                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1079                                           257] * b[l + j * b_dim1];
1080                                 }
1081                               c[i + j * c_dim1] = f11;
1082                             }
1083                         }
1084                     }
1085                 }
1086             }
1087         }
1088       free(t1);
1089       return;
1090     }
1091   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1092     {
1093       if (GFC_DESCRIPTOR_RANK (a) != 1)
1094         {
1095           const GFC_REAL_17 *restrict abase_x;
1096           const GFC_REAL_17 *restrict bbase_y;
1097           GFC_REAL_17 *restrict dest_y;
1098           GFC_REAL_17 s;
1099
1100           for (y = 0; y < ycount; y++)
1101             {
1102               bbase_y = &bbase[y*bystride];
1103               dest_y = &dest[y*rystride];
1104               for (x = 0; x < xcount; x++)
1105                 {
1106                   abase_x = &abase[x*axstride];
1107                   s = (GFC_REAL_17) 0;
1108                   for (n = 0; n < count; n++)
1109                     s += abase_x[n] * bbase_y[n];
1110                   dest_y[x] = s;
1111                 }
1112             }
1113         }
1114       else
1115         {
1116           const GFC_REAL_17 *restrict bbase_y;
1117           GFC_REAL_17 s;
1118
1119           for (y = 0; y < ycount; y++)
1120             {
1121               bbase_y = &bbase[y*bystride];
1122               s = (GFC_REAL_17) 0;
1123               for (n = 0; n < count; n++)
1124                 s += abase[n*axstride] * bbase_y[n];
1125               dest[y*rystride] = s;
1126             }
1127         }
1128     }
1129   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1130     {
1131       const GFC_REAL_17 *restrict bbase_y;
1132       GFC_REAL_17 s;
1133
1134       for (y = 0; y < ycount; y++)
1135         {
1136           bbase_y = &bbase[y*bystride];
1137           s = (GFC_REAL_17) 0;
1138           for (n = 0; n < count; n++)
1139             s += abase[n*axstride] * bbase_y[n*bxstride];
1140           dest[y*rxstride] = s;
1141         }
1142     }
1143   else if (axstride < aystride)
1144     {
1145       for (y = 0; y < ycount; y++)
1146         for (x = 0; x < xcount; x++)
1147           dest[x*rxstride + y*rystride] = (GFC_REAL_17)0;
1148
1149       for (y = 0; y < ycount; y++)
1150         for (n = 0; n < count; n++)
1151           for (x = 0; x < xcount; x++)
1152             /* dest[x,y] += a[x,n] * b[n,y] */
1153             dest[x*rxstride + y*rystride] +=
1154                                         abase[x*axstride + n*aystride] *
1155                                         bbase[n*bxstride + y*bystride];
1156     }
1157   else
1158     {
1159       const GFC_REAL_17 *restrict abase_x;
1160       const GFC_REAL_17 *restrict bbase_y;
1161       GFC_REAL_17 *restrict dest_y;
1162       GFC_REAL_17 s;
1163
1164       for (y = 0; y < ycount; y++)
1165         {
1166           bbase_y = &bbase[y*bystride];
1167           dest_y = &dest[y*rystride];
1168           for (x = 0; x < xcount; x++)
1169             {
1170               abase_x = &abase[x*axstride];
1171               s = (GFC_REAL_17) 0;
1172               for (n = 0; n < count; n++)
1173                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1174               dest_y[x*rxstride] = s;
1175             }
1176         }
1177     }
1178 }
1179 #undef POW3
1180 #undef min
1181 #undef max
1182
1183 #endif
1184
1185 #endif
1186