libgfortran/generated/matmul_r10.c

   1 /* Implementation of the MATMUL intrinsic
   2    Copyright (C) 2002-2016 Free Software Foundation, Inc.
   3    Contributed by Paul Brook <paul@nowt.org>
   4
   5 This file is part of the GNU Fortran runtime library (libgfortran).
   6
   7 Libgfortran is free software; you can redistribute it and/or
   8 modify it under the terms of the GNU General Public
   9 License as published by the Free Software Foundation; either
  10 version 3 of the License, or (at your option) any later version.
  11
  12 Libgfortran is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 Under Section 7 of GPL version 3, you are granted additional
  18 permissions described in the GCC Runtime Library Exception, version
  19 3.1, as published by the Free Software Foundation.
  20
  21 You should have received a copy of the GNU General Public License and
  22 a copy of the GCC Runtime Library Exception along with this program;
  23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24 <http://www.gnu.org/licenses/>.  */
  25
  26 #include "libgfortran.h"
  27 #include <stdlib.h>
  28 #include <string.h>
  29 #include <assert.h>
  30
  31
  32 #if defined (HAVE_GFC_REAL_10)
  33
  34 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
  35    passed to us by the front-end, in which case we call it for large
  36    matrices.  */
  37
  38 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
  39                           const int *, const GFC_REAL_10 *, const GFC_REAL_10 *,
  40                           const int *, const GFC_REAL_10 *, const int *,
  41                           const GFC_REAL_10 *, GFC_REAL_10 *, const int *,
  42                           int, int);
  43
  44 /* The order of loops is different in the case of plain matrix
  45    multiplication C=MATMUL(A,B), and in the frequent special case where
  46    the argument A is the temporary result of a TRANSPOSE intrinsic:
  47    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
  48    looking at their strides.
  49
  50    The equivalent Fortran pseudo-code is:
  51
  52    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
  53    IF (.NOT.IS_TRANSPOSED(A)) THEN
  54      C = 0
  55      DO J=1,N
  56        DO K=1,COUNT
  57          DO I=1,M
  58            C(I,J) = C(I,J)+A(I,K)*B(K,J)
  59    ELSE
  60      DO J=1,N
  61        DO I=1,M
  62          S = 0
  63          DO K=1,COUNT
  64            S = S+A(I,K)*B(K,J)
  65          C(I,J) = S
  66    ENDIF
  67 */
  68
  69 /* If try_blas is set to a nonzero value, then the matmul function will
  70    see if there is a way to perform the matrix multiplication by a call
  71    to the BLAS gemm function.  */
  72
  73 extern void matmul_r10 (gfc_array_r10 * const restrict retarray,
  74         gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
  75         int blas_limit, blas_call gemm);
  76 export_proto(matmul_r10);
  77
  78 void
  79 matmul_r10 (gfc_array_r10 * const restrict retarray,
  80         gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
  81         int blas_limit, blas_call gemm)
  82 {
  83   const GFC_REAL_10 * restrict abase;
  84   const GFC_REAL_10 * restrict bbase;
  85   GFC_REAL_10 * restrict dest;
  86
  87   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  88   index_type x, y, n, count, xcount, ycount;
  89
  90   assert (GFC_DESCRIPTOR_RANK (a) == 2
  91           || GFC_DESCRIPTOR_RANK (b) == 2);
  92
  93 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
  94
  95    Either A or B (but not both) can be rank 1:
  96
  97    o One-dimensional argument A is implicitly treated as a row matrix
  98      dimensioned [1,count], so xcount=1.
  99
 100    o One-dimensional argument B is implicitly treated as a column matrix
 101      dimensioned [count, 1], so ycount=1.
 102 */
 103
 104   if (retarray->base_addr == NULL)
 105     {
 106       if (GFC_DESCRIPTOR_RANK (a) == 1)
 107         {
 108           GFC_DIMENSION_SET(retarray->dim[0], 0,
 109                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 110         }
 111       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 112         {
 113           GFC_DIMENSION_SET(retarray->dim[0], 0,
 114                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 115         }
 116       else
 117         {
 118           GFC_DIMENSION_SET(retarray->dim[0], 0,
 119                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 120
 121           GFC_DIMENSION_SET(retarray->dim[1], 0,
 122                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 123                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 124         }
 125
 126       retarray->base_addr
 127         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10));
 128       retarray->offset = 0;
 129     }
 130   else if (unlikely (compile_options.bounds_check))
 131     {
 132       index_type ret_extent, arg_extent;
 133
 134       if (GFC_DESCRIPTOR_RANK (a) == 1)
 135         {
 136           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 137           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 138           if (arg_extent != ret_extent)
 139             runtime_error ("Incorrect extent in return array in"
 140                            " MATMUL intrinsic: is %ld, should be %ld",
 141                            (long int) ret_extent, (long int) arg_extent);
 142         }
 143       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 144         {
 145           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 146           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 147           if (arg_extent != ret_extent)
 148             runtime_error ("Incorrect extent in return array in"
 149                            " MATMUL intrinsic: is %ld, should be %ld",
 150                            (long int) ret_extent, (long int) arg_extent);
 151         }
 152       else
 153         {
 154           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 155           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 156           if (arg_extent != ret_extent)
 157             runtime_error ("Incorrect extent in return array in"
 158                            " MATMUL intrinsic for dimension 1:"
 159                            " is %ld, should be %ld",
 160                            (long int) ret_extent, (long int) arg_extent);
 161
 162           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 163           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 164           if (arg_extent != ret_extent)
 165             runtime_error ("Incorrect extent in return array in"
 166                            " MATMUL intrinsic for dimension 2:"
 167                            " is %ld, should be %ld",
 168                            (long int) ret_extent, (long int) arg_extent);
 169         }
 170     }
 171
 172
 173   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 174     {
 175       /* One-dimensional result may be addressed in the code below
 176          either as a row or a column matrix. We want both cases to
 177          work. */
 178       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 179     }
 180   else
 181     {
 182       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 183       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 184     }
 185
 186
 187   if (GFC_DESCRIPTOR_RANK (a) == 1)
 188     {
 189       /* Treat it as a a row matrix A[1,count]. */
 190       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 191       aystride = 1;
 192
 193       xcount = 1;
 194       count = GFC_DESCRIPTOR_EXTENT(a,0);
 195     }
 196   else
 197     {
 198       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 199       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 200
 201       count = GFC_DESCRIPTOR_EXTENT(a,1);
 202       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 203     }
 204
 205   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 206     {
 207       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 208         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
 209     }
 210
 211   if (GFC_DESCRIPTOR_RANK (b) == 1)
 212     {
 213       /* Treat it as a column matrix B[count,1] */
 214       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 215
 216       /* bystride should never be used for 1-dimensional b.
 217          in case it is we want it to cause a segfault, rather than
 218          an incorrect result. */
 219       bystride = 0xDEADBEEF;
 220       ycount = 1;
 221     }
 222   else
 223     {
 224       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 225       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 226       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 227     }
 228
 229   abase = a->base_addr;
 230   bbase = b->base_addr;
 231   dest = retarray->base_addr;
 232
 233   /* Now that everything is set up, we perform the multiplication
 234      itself.  */
 235
 236 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 237 #define min(a,b) ((a) <= (b) ? (a) : (b))
 238 #define max(a,b) ((a) >= (b) ? (a) : (b))
 239
 240   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 241       && (bxstride == 1 || bystride == 1)
 242       && (((float) xcount) * ((float) ycount) * ((float) count)
 243           > POW3(blas_limit)))
 244     {
 245       const int m = xcount, n = ycount, k = count, ldc = rystride;
 246       const GFC_REAL_10 one = 1, zero = 0;
 247       const int lda = (axstride == 1) ? aystride : axstride,
 248                 ldb = (bxstride == 1) ? bystride : bxstride;
 249
 250       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 251         {
 252           assert (gemm != NULL);
 253           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
 254                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 255                 &ldc, 1, 1);
 256           return;
 257         }
 258     }
 259
 260   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 261     {
 262       /* This block of code implements a tuned matmul, derived from
 263          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 264
 265                Bo Kagstrom and Per Ling
 266                Department of Computing Science
 267                Umea University
 268                S-901 87 Umea, Sweden
 269
 270          from netlib.org, translated to C, and modified for matmul.m4.  */
 271
 272       const GFC_REAL_10 *a, *b;
 273       GFC_REAL_10 *c;
 274       const index_type m = xcount, n = ycount, k = count;
 275
 276       /* System generated locals */
 277       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 278                  i1, i2, i3, i4, i5, i6;
 279
 280       /* Local variables */
 281       GFC_REAL_10 t1[65536], /* was [256][256] */
 282                  f11, f12, f21, f22, f31, f32, f41, f42,
 283                  f13, f14, f23, f24, f33, f34, f43, f44;
 284       index_type i, j, l, ii, jj, ll;
 285       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 286
 287       a = abase;
 288       b = bbase;
 289       c = retarray->base_addr;
 290
 291       /* Parameter adjustments */
 292       c_dim1 = rystride;
 293       c_offset = 1 + c_dim1;
 294       c -= c_offset;
 295       a_dim1 = aystride;
 296       a_offset = 1 + a_dim1;
 297       a -= a_offset;
 298       b_dim1 = bystride;
 299       b_offset = 1 + b_dim1;
 300       b -= b_offset;
 301
 302       /* Early exit if possible */
 303       if (m == 0 || n == 0 || k == 0)
 304         return;
 305
 306       /* Empty c first.  */
 307       for (j=1; j<=n; j++)
 308         for (i=1; i<=m; i++)
 309           c[i + j * c_dim1] = (GFC_REAL_10)0;
 310
 311       /* Start turning the crank. */
 312       i1 = n;
 313       for (jj = 1; jj <= i1; jj += 512)
 314         {
 315           /* Computing MIN */
 316           i2 = 512;
 317           i3 = n - jj + 1;
 318           jsec = min(i2,i3);
 319           ujsec = jsec - jsec % 4;
 320           i2 = k;
 321           for (ll = 1; ll <= i2; ll += 256)
 322             {
 323               /* Computing MIN */
 324               i3 = 256;
 325               i4 = k - ll + 1;
 326               lsec = min(i3,i4);
 327               ulsec = lsec - lsec % 2;
 328
 329               i3 = m;
 330               for (ii = 1; ii <= i3; ii += 256)
 331                 {
 332                   /* Computing MIN */
 333                   i4 = 256;
 334                   i5 = m - ii + 1;
 335                   isec = min(i4,i5);
 336                   uisec = isec - isec % 2;
 337                   i4 = ll + ulsec - 1;
 338                   for (l = ll; l <= i4; l += 2)
 339                     {
 340                       i5 = ii + uisec - 1;
 341                       for (i = ii; i <= i5; i += 2)
 342                         {
 343                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 344                                         a[i + l * a_dim1];
 345                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 346                                         a[i + (l + 1) * a_dim1];
 347                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 348                                         a[i + 1 + l * a_dim1];
 349                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 350                                         a[i + 1 + (l + 1) * a_dim1];
 351                         }
 352                       if (uisec < isec)
 353                         {
 354                           t1[l - ll + 1 + (isec << 8) - 257] =
 355                                     a[ii + isec - 1 + l * a_dim1];
 356                           t1[l - ll + 2 + (isec << 8) - 257] =
 357                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 358                         }
 359                     }
 360                   if (ulsec < lsec)
 361                     {
 362                       i4 = ii + isec - 1;
 363                       for (i = ii; i<= i4; ++i)
 364                         {
 365                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 366                                     a[i + (ll + lsec - 1) * a_dim1];
 367                         }
 368                     }
 369
 370                   uisec = isec - isec % 4;
 371                   i4 = jj + ujsec - 1;
 372                   for (j = jj; j <= i4; j += 4)
 373                     {
 374                       i5 = ii + uisec - 1;
 375                       for (i = ii; i <= i5; i += 4)
 376                         {
 377                           f11 = c[i + j * c_dim1];
 378                           f21 = c[i + 1 + j * c_dim1];
 379                           f12 = c[i + (j + 1) * c_dim1];
 380                           f22 = c[i + 1 + (j + 1) * c_dim1];
 381                           f13 = c[i + (j + 2) * c_dim1];
 382                           f23 = c[i + 1 + (j + 2) * c_dim1];
 383                           f14 = c[i + (j + 3) * c_dim1];
 384                           f24 = c[i + 1 + (j + 3) * c_dim1];
 385                           f31 = c[i + 2 + j * c_dim1];
 386                           f41 = c[i + 3 + j * c_dim1];
 387                           f32 = c[i + 2 + (j + 1) * c_dim1];
 388                           f42 = c[i + 3 + (j + 1) * c_dim1];
 389                           f33 = c[i + 2 + (j + 2) * c_dim1];
 390                           f43 = c[i + 3 + (j + 2) * c_dim1];
 391                           f34 = c[i + 2 + (j + 3) * c_dim1];
 392                           f44 = c[i + 3 + (j + 3) * c_dim1];
 393                           i6 = ll + lsec - 1;
 394                           for (l = ll; l <= i6; ++l)
 395                             {
 396                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 397                                       * b[l + j * b_dim1];
 398                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 399                                       * b[l + j * b_dim1];
 400                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 401                                       * b[l + (j + 1) * b_dim1];
 402                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 403                                       * b[l + (j + 1) * b_dim1];
 404                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 405                                       * b[l + (j + 2) * b_dim1];
 406                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 407                                       * b[l + (j + 2) * b_dim1];
 408                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 409                                       * b[l + (j + 3) * b_dim1];
 410                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 411                                       * b[l + (j + 3) * b_dim1];
 412                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 413                                       * b[l + j * b_dim1];
 414                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 415                                       * b[l + j * b_dim1];
 416                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 417                                       * b[l + (j + 1) * b_dim1];
 418                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 419                                       * b[l + (j + 1) * b_dim1];
 420                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 421                                       * b[l + (j + 2) * b_dim1];
 422                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 423                                       * b[l + (j + 2) * b_dim1];
 424                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 425                                       * b[l + (j + 3) * b_dim1];
 426                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 427                                       * b[l + (j + 3) * b_dim1];
 428                             }
 429                           c[i + j * c_dim1] = f11;
 430                           c[i + 1 + j * c_dim1] = f21;
 431                           c[i + (j + 1) * c_dim1] = f12;
 432                           c[i + 1 + (j + 1) * c_dim1] = f22;
 433                           c[i + (j + 2) * c_dim1] = f13;
 434                           c[i + 1 + (j + 2) * c_dim1] = f23;
 435                           c[i + (j + 3) * c_dim1] = f14;
 436                           c[i + 1 + (j + 3) * c_dim1] = f24;
 437                           c[i + 2 + j * c_dim1] = f31;
 438                           c[i + 3 + j * c_dim1] = f41;
 439                           c[i + 2 + (j + 1) * c_dim1] = f32;
 440                           c[i + 3 + (j + 1) * c_dim1] = f42;
 441                           c[i + 2 + (j + 2) * c_dim1] = f33;
 442                           c[i + 3 + (j + 2) * c_dim1] = f43;
 443                           c[i + 2 + (j + 3) * c_dim1] = f34;
 444                           c[i + 3 + (j + 3) * c_dim1] = f44;
 445                         }
 446                       if (uisec < isec)
 447                         {
 448                           i5 = ii + isec - 1;
 449                           for (i = ii + uisec; i <= i5; ++i)
 450                             {
 451                               f11 = c[i + j * c_dim1];
 452                               f12 = c[i + (j + 1) * c_dim1];
 453                               f13 = c[i + (j + 2) * c_dim1];
 454                               f14 = c[i + (j + 3) * c_dim1];
 455                               i6 = ll + lsec - 1;
 456                               for (l = ll; l <= i6; ++l)
 457                                 {
 458                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 459                                           257] * b[l + j * b_dim1];
 460                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 461                                           257] * b[l + (j + 1) * b_dim1];
 462                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 463                                           257] * b[l + (j + 2) * b_dim1];
 464                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 465                                           257] * b[l + (j + 3) * b_dim1];
 466                                 }
 467                               c[i + j * c_dim1] = f11;
 468                               c[i + (j + 1) * c_dim1] = f12;
 469                               c[i + (j + 2) * c_dim1] = f13;
 470                               c[i + (j + 3) * c_dim1] = f14;
 471                             }
 472                         }
 473                     }
 474                   if (ujsec < jsec)
 475                     {
 476                       i4 = jj + jsec - 1;
 477                       for (j = jj + ujsec; j <= i4; ++j)
 478                         {
 479                           i5 = ii + uisec - 1;
 480                           for (i = ii; i <= i5; i += 4)
 481                             {
 482                               f11 = c[i + j * c_dim1];
 483                               f21 = c[i + 1 + j * c_dim1];
 484                               f31 = c[i + 2 + j * c_dim1];
 485                               f41 = c[i + 3 + j * c_dim1];
 486                               i6 = ll + lsec - 1;
 487                               for (l = ll; l <= i6; ++l)
 488                                 {
 489                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 490                                           257] * b[l + j * b_dim1];
 491                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
 492                                           257] * b[l + j * b_dim1];
 493                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
 494                                           257] * b[l + j * b_dim1];
 495                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
 496                                           257] * b[l + j * b_dim1];
 497                                 }
 498                               c[i + j * c_dim1] = f11;
 499                               c[i + 1 + j * c_dim1] = f21;
 500                               c[i + 2 + j * c_dim1] = f31;
 501                               c[i + 3 + j * c_dim1] = f41;
 502                             }
 503                           i5 = ii + isec - 1;
 504                           for (i = ii + uisec; i <= i5; ++i)
 505                             {
 506                               f11 = c[i + j * c_dim1];
 507                               i6 = ll + lsec - 1;
 508                               for (l = ll; l <= i6; ++l)
 509                                 {
 510                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 511                                           257] * b[l + j * b_dim1];
 512                                 }
 513                               c[i + j * c_dim1] = f11;
 514                             }
 515                         }
 516                     }
 517                 }
 518             }
 519         }
 520       return;
 521     }
 522   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
 523     {
 524       if (GFC_DESCRIPTOR_RANK (a) != 1)
 525         {
 526           const GFC_REAL_10 *restrict abase_x;
 527           const GFC_REAL_10 *restrict bbase_y;
 528           GFC_REAL_10 *restrict dest_y;
 529           GFC_REAL_10 s;
 530
 531           for (y = 0; y < ycount; y++)
 532             {
 533               bbase_y = &bbase[y*bystride];
 534               dest_y = &dest[y*rystride];
 535               for (x = 0; x < xcount; x++)
 536                 {
 537                   abase_x = &abase[x*axstride];
 538                   s = (GFC_REAL_10) 0;
 539                   for (n = 0; n < count; n++)
 540                     s += abase_x[n] * bbase_y[n];
 541                   dest_y[x] = s;
 542                 }
 543             }
 544         }
 545       else
 546         {
 547           const GFC_REAL_10 *restrict bbase_y;
 548           GFC_REAL_10 s;
 549
 550           for (y = 0; y < ycount; y++)
 551             {
 552               bbase_y = &bbase[y*bystride];
 553               s = (GFC_REAL_10) 0;
 554               for (n = 0; n < count; n++)
 555                 s += abase[n*axstride] * bbase_y[n];
 556               dest[y*rystride] = s;
 557             }
 558         }
 559     }
 560   else if (axstride < aystride)
 561     {
 562       for (y = 0; y < ycount; y++)
 563         for (x = 0; x < xcount; x++)
 564           dest[x*rxstride + y*rystride] = (GFC_REAL_10)0;
 565
 566       for (y = 0; y < ycount; y++)
 567         for (n = 0; n < count; n++)
 568           for (x = 0; x < xcount; x++)
 569             /* dest[x,y] += a[x,n] * b[n,y] */
 570             dest[x*rxstride + y*rystride] +=
 571                                         abase[x*axstride + n*aystride] *
 572                                         bbase[n*bxstride + y*bystride];
 573     }
 574   else if (GFC_DESCRIPTOR_RANK (a) == 1)
 575     {
 576       const GFC_REAL_10 *restrict bbase_y;
 577       GFC_REAL_10 s;
 578
 579       for (y = 0; y < ycount; y++)
 580         {
 581           bbase_y = &bbase[y*bystride];
 582           s = (GFC_REAL_10) 0;
 583           for (n = 0; n < count; n++)
 584             s += abase[n*axstride] * bbase_y[n*bxstride];
 585           dest[y*rxstride] = s;
 586         }
 587     }
 588   else
 589     {
 590       const GFC_REAL_10 *restrict abase_x;
 591       const GFC_REAL_10 *restrict bbase_y;
 592       GFC_REAL_10 *restrict dest_y;
 593       GFC_REAL_10 s;
 594
 595       for (y = 0; y < ycount; y++)
 596         {
 597           bbase_y = &bbase[y*bystride];
 598           dest_y = &dest[y*rystride];
 599           for (x = 0; x < xcount; x++)
 600             {
 601               abase_x = &abase[x*axstride];
 602               s = (GFC_REAL_10) 0;
 603               for (n = 0; n < count; n++)
 604                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
 605               dest_y[x*rxstride] = s;
 606             }
 607         }
 608     }
 609 }
 610 #endif