* testsuite/26_numerics/headers/cmath/hypot.cc: XFAIL on AIX.
[official-gcc.git] / libgfortran / generated / matmul_r10.c
blob8bb1e6297bb35b871417b86da6a85aa743e01a5a
1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2016 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
5 This file is part of the GNU Fortran runtime library (libgfortran).
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
26 #include "libgfortran.h"
27 #include <stdlib.h>
28 #include <string.h>
29 #include <assert.h>
32 #if defined (HAVE_GFC_REAL_10)
34 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
35 passed to us by the front-end, in which case we call it for large
36 matrices. */
38 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
39 const int *, const GFC_REAL_10 *, const GFC_REAL_10 *,
40 const int *, const GFC_REAL_10 *, const int *,
41 const GFC_REAL_10 *, GFC_REAL_10 *, const int *,
42 int, int);
44 /* The order of loops is different in the case of plain matrix
45 multiplication C=MATMUL(A,B), and in the frequent special case where
46 the argument A is the temporary result of a TRANSPOSE intrinsic:
47 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
48 looking at their strides.
50 The equivalent Fortran pseudo-code is:
52 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
53 IF (.NOT.IS_TRANSPOSED(A)) THEN
54 C = 0
55 DO J=1,N
56 DO K=1,COUNT
57 DO I=1,M
58 C(I,J) = C(I,J)+A(I,K)*B(K,J)
59 ELSE
60 DO J=1,N
61 DO I=1,M
62 S = 0
63 DO K=1,COUNT
64 S = S+A(I,K)*B(K,J)
65 C(I,J) = S
66 ENDIF
69 /* If try_blas is set to a nonzero value, then the matmul function will
70 see if there is a way to perform the matrix multiplication by a call
71 to the BLAS gemm function. */
73 extern void matmul_r10 (gfc_array_r10 * const restrict retarray,
74 gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
75 int blas_limit, blas_call gemm);
76 export_proto(matmul_r10);
78 void
79 matmul_r10 (gfc_array_r10 * const restrict retarray,
80 gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
81 int blas_limit, blas_call gemm)
83 const GFC_REAL_10 * restrict abase;
84 const GFC_REAL_10 * restrict bbase;
85 GFC_REAL_10 * restrict dest;
87 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
88 index_type x, y, n, count, xcount, ycount;
90 assert (GFC_DESCRIPTOR_RANK (a) == 2
91 || GFC_DESCRIPTOR_RANK (b) == 2);
93 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
95 Either A or B (but not both) can be rank 1:
97 o One-dimensional argument A is implicitly treated as a row matrix
98 dimensioned [1,count], so xcount=1.
100 o One-dimensional argument B is implicitly treated as a column matrix
101 dimensioned [count, 1], so ycount=1.
104 if (retarray->base_addr == NULL)
106 if (GFC_DESCRIPTOR_RANK (a) == 1)
108 GFC_DIMENSION_SET(retarray->dim[0], 0,
109 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
111 else if (GFC_DESCRIPTOR_RANK (b) == 1)
113 GFC_DIMENSION_SET(retarray->dim[0], 0,
114 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
116 else
118 GFC_DIMENSION_SET(retarray->dim[0], 0,
119 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
121 GFC_DIMENSION_SET(retarray->dim[1], 0,
122 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
123 GFC_DESCRIPTOR_EXTENT(retarray,0));
126 retarray->base_addr
127 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10));
128 retarray->offset = 0;
130 else if (unlikely (compile_options.bounds_check))
132 index_type ret_extent, arg_extent;
134 if (GFC_DESCRIPTOR_RANK (a) == 1)
136 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
137 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
138 if (arg_extent != ret_extent)
139 runtime_error ("Incorrect extent in return array in"
140 " MATMUL intrinsic: is %ld, should be %ld",
141 (long int) ret_extent, (long int) arg_extent);
143 else if (GFC_DESCRIPTOR_RANK (b) == 1)
145 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
146 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
147 if (arg_extent != ret_extent)
148 runtime_error ("Incorrect extent in return array in"
149 " MATMUL intrinsic: is %ld, should be %ld",
150 (long int) ret_extent, (long int) arg_extent);
152 else
154 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
155 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
156 if (arg_extent != ret_extent)
157 runtime_error ("Incorrect extent in return array in"
158 " MATMUL intrinsic for dimension 1:"
159 " is %ld, should be %ld",
160 (long int) ret_extent, (long int) arg_extent);
162 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
164 if (arg_extent != ret_extent)
165 runtime_error ("Incorrect extent in return array in"
166 " MATMUL intrinsic for dimension 2:"
167 " is %ld, should be %ld",
168 (long int) ret_extent, (long int) arg_extent);
173 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
175 /* One-dimensional result may be addressed in the code below
176 either as a row or a column matrix. We want both cases to
177 work. */
178 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
180 else
182 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
183 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
187 if (GFC_DESCRIPTOR_RANK (a) == 1)
189 /* Treat it as a a row matrix A[1,count]. */
190 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
191 aystride = 1;
193 xcount = 1;
194 count = GFC_DESCRIPTOR_EXTENT(a,0);
196 else
198 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
199 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
201 count = GFC_DESCRIPTOR_EXTENT(a,1);
202 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
205 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
207 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
208 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
211 if (GFC_DESCRIPTOR_RANK (b) == 1)
213 /* Treat it as a column matrix B[count,1] */
214 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
216 /* bystride should never be used for 1-dimensional b.
217 in case it is we want it to cause a segfault, rather than
218 an incorrect result. */
219 bystride = 0xDEADBEEF;
220 ycount = 1;
222 else
224 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
225 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
226 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
229 abase = a->base_addr;
230 bbase = b->base_addr;
231 dest = retarray->base_addr;
233 /* Now that everything is set up, we perform the multiplication
234 itself. */
236 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
237 #define min(a,b) ((a) <= (b) ? (a) : (b))
238 #define max(a,b) ((a) >= (b) ? (a) : (b))
240 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
241 && (bxstride == 1 || bystride == 1)
242 && (((float) xcount) * ((float) ycount) * ((float) count)
243 > POW3(blas_limit)))
245 const int m = xcount, n = ycount, k = count, ldc = rystride;
246 const GFC_REAL_10 one = 1, zero = 0;
247 const int lda = (axstride == 1) ? aystride : axstride,
248 ldb = (bxstride == 1) ? bystride : bxstride;
250 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
252 assert (gemm != NULL);
253 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
254 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
255 &ldc, 1, 1);
256 return;
260 if (rxstride == 1 && axstride == 1 && bxstride == 1)
262 /* This block of code implements a tuned matmul, derived from
263 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
265 Bo Kagstrom and Per Ling
266 Department of Computing Science
267 Umea University
268 S-901 87 Umea, Sweden
270 from netlib.org, translated to C, and modified for matmul.m4. */
272 const GFC_REAL_10 *a, *b;
273 GFC_REAL_10 *c;
274 const index_type m = xcount, n = ycount, k = count;
276 /* System generated locals */
277 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
278 i1, i2, i3, i4, i5, i6;
280 /* Local variables */
281 GFC_REAL_10 t1[65536], /* was [256][256] */
282 f11, f12, f21, f22, f31, f32, f41, f42,
283 f13, f14, f23, f24, f33, f34, f43, f44;
284 index_type i, j, l, ii, jj, ll;
285 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
287 a = abase;
288 b = bbase;
289 c = retarray->base_addr;
291 /* Parameter adjustments */
292 c_dim1 = rystride;
293 c_offset = 1 + c_dim1;
294 c -= c_offset;
295 a_dim1 = aystride;
296 a_offset = 1 + a_dim1;
297 a -= a_offset;
298 b_dim1 = bystride;
299 b_offset = 1 + b_dim1;
300 b -= b_offset;
302 /* Early exit if possible */
303 if (m == 0 || n == 0 || k == 0)
304 return;
306 /* Empty c first. */
307 for (j=1; j<=n; j++)
308 for (i=1; i<=m; i++)
309 c[i + j * c_dim1] = (GFC_REAL_10)0;
311 /* Start turning the crank. */
312 i1 = n;
313 for (jj = 1; jj <= i1; jj += 512)
315 /* Computing MIN */
316 i2 = 512;
317 i3 = n - jj + 1;
318 jsec = min(i2,i3);
319 ujsec = jsec - jsec % 4;
320 i2 = k;
321 for (ll = 1; ll <= i2; ll += 256)
323 /* Computing MIN */
324 i3 = 256;
325 i4 = k - ll + 1;
326 lsec = min(i3,i4);
327 ulsec = lsec - lsec % 2;
329 i3 = m;
330 for (ii = 1; ii <= i3; ii += 256)
332 /* Computing MIN */
333 i4 = 256;
334 i5 = m - ii + 1;
335 isec = min(i4,i5);
336 uisec = isec - isec % 2;
337 i4 = ll + ulsec - 1;
338 for (l = ll; l <= i4; l += 2)
340 i5 = ii + uisec - 1;
341 for (i = ii; i <= i5; i += 2)
343 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
344 a[i + l * a_dim1];
345 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
346 a[i + (l + 1) * a_dim1];
347 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
348 a[i + 1 + l * a_dim1];
349 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
350 a[i + 1 + (l + 1) * a_dim1];
352 if (uisec < isec)
354 t1[l - ll + 1 + (isec << 8) - 257] =
355 a[ii + isec - 1 + l * a_dim1];
356 t1[l - ll + 2 + (isec << 8) - 257] =
357 a[ii + isec - 1 + (l + 1) * a_dim1];
360 if (ulsec < lsec)
362 i4 = ii + isec - 1;
363 for (i = ii; i<= i4; ++i)
365 t1[lsec + ((i - ii + 1) << 8) - 257] =
366 a[i + (ll + lsec - 1) * a_dim1];
370 uisec = isec - isec % 4;
371 i4 = jj + ujsec - 1;
372 for (j = jj; j <= i4; j += 4)
374 i5 = ii + uisec - 1;
375 for (i = ii; i <= i5; i += 4)
377 f11 = c[i + j * c_dim1];
378 f21 = c[i + 1 + j * c_dim1];
379 f12 = c[i + (j + 1) * c_dim1];
380 f22 = c[i + 1 + (j + 1) * c_dim1];
381 f13 = c[i + (j + 2) * c_dim1];
382 f23 = c[i + 1 + (j + 2) * c_dim1];
383 f14 = c[i + (j + 3) * c_dim1];
384 f24 = c[i + 1 + (j + 3) * c_dim1];
385 f31 = c[i + 2 + j * c_dim1];
386 f41 = c[i + 3 + j * c_dim1];
387 f32 = c[i + 2 + (j + 1) * c_dim1];
388 f42 = c[i + 3 + (j + 1) * c_dim1];
389 f33 = c[i + 2 + (j + 2) * c_dim1];
390 f43 = c[i + 3 + (j + 2) * c_dim1];
391 f34 = c[i + 2 + (j + 3) * c_dim1];
392 f44 = c[i + 3 + (j + 3) * c_dim1];
393 i6 = ll + lsec - 1;
394 for (l = ll; l <= i6; ++l)
396 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
397 * b[l + j * b_dim1];
398 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
399 * b[l + j * b_dim1];
400 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
401 * b[l + (j + 1) * b_dim1];
402 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
403 * b[l + (j + 1) * b_dim1];
404 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
405 * b[l + (j + 2) * b_dim1];
406 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
407 * b[l + (j + 2) * b_dim1];
408 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
409 * b[l + (j + 3) * b_dim1];
410 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
411 * b[l + (j + 3) * b_dim1];
412 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
413 * b[l + j * b_dim1];
414 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
415 * b[l + j * b_dim1];
416 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
417 * b[l + (j + 1) * b_dim1];
418 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
419 * b[l + (j + 1) * b_dim1];
420 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
421 * b[l + (j + 2) * b_dim1];
422 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
423 * b[l + (j + 2) * b_dim1];
424 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
425 * b[l + (j + 3) * b_dim1];
426 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
427 * b[l + (j + 3) * b_dim1];
429 c[i + j * c_dim1] = f11;
430 c[i + 1 + j * c_dim1] = f21;
431 c[i + (j + 1) * c_dim1] = f12;
432 c[i + 1 + (j + 1) * c_dim1] = f22;
433 c[i + (j + 2) * c_dim1] = f13;
434 c[i + 1 + (j + 2) * c_dim1] = f23;
435 c[i + (j + 3) * c_dim1] = f14;
436 c[i + 1 + (j + 3) * c_dim1] = f24;
437 c[i + 2 + j * c_dim1] = f31;
438 c[i + 3 + j * c_dim1] = f41;
439 c[i + 2 + (j + 1) * c_dim1] = f32;
440 c[i + 3 + (j + 1) * c_dim1] = f42;
441 c[i + 2 + (j + 2) * c_dim1] = f33;
442 c[i + 3 + (j + 2) * c_dim1] = f43;
443 c[i + 2 + (j + 3) * c_dim1] = f34;
444 c[i + 3 + (j + 3) * c_dim1] = f44;
446 if (uisec < isec)
448 i5 = ii + isec - 1;
449 for (i = ii + uisec; i <= i5; ++i)
451 f11 = c[i + j * c_dim1];
452 f12 = c[i + (j + 1) * c_dim1];
453 f13 = c[i + (j + 2) * c_dim1];
454 f14 = c[i + (j + 3) * c_dim1];
455 i6 = ll + lsec - 1;
456 for (l = ll; l <= i6; ++l)
458 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
459 257] * b[l + j * b_dim1];
460 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
461 257] * b[l + (j + 1) * b_dim1];
462 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
463 257] * b[l + (j + 2) * b_dim1];
464 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
465 257] * b[l + (j + 3) * b_dim1];
467 c[i + j * c_dim1] = f11;
468 c[i + (j + 1) * c_dim1] = f12;
469 c[i + (j + 2) * c_dim1] = f13;
470 c[i + (j + 3) * c_dim1] = f14;
474 if (ujsec < jsec)
476 i4 = jj + jsec - 1;
477 for (j = jj + ujsec; j <= i4; ++j)
479 i5 = ii + uisec - 1;
480 for (i = ii; i <= i5; i += 4)
482 f11 = c[i + j * c_dim1];
483 f21 = c[i + 1 + j * c_dim1];
484 f31 = c[i + 2 + j * c_dim1];
485 f41 = c[i + 3 + j * c_dim1];
486 i6 = ll + lsec - 1;
487 for (l = ll; l <= i6; ++l)
489 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
490 257] * b[l + j * b_dim1];
491 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
492 257] * b[l + j * b_dim1];
493 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
494 257] * b[l + j * b_dim1];
495 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
496 257] * b[l + j * b_dim1];
498 c[i + j * c_dim1] = f11;
499 c[i + 1 + j * c_dim1] = f21;
500 c[i + 2 + j * c_dim1] = f31;
501 c[i + 3 + j * c_dim1] = f41;
503 i5 = ii + isec - 1;
504 for (i = ii + uisec; i <= i5; ++i)
506 f11 = c[i + j * c_dim1];
507 i6 = ll + lsec - 1;
508 for (l = ll; l <= i6; ++l)
510 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
511 257] * b[l + j * b_dim1];
513 c[i + j * c_dim1] = f11;
520 return;
522 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
524 if (GFC_DESCRIPTOR_RANK (a) != 1)
526 const GFC_REAL_10 *restrict abase_x;
527 const GFC_REAL_10 *restrict bbase_y;
528 GFC_REAL_10 *restrict dest_y;
529 GFC_REAL_10 s;
531 for (y = 0; y < ycount; y++)
533 bbase_y = &bbase[y*bystride];
534 dest_y = &dest[y*rystride];
535 for (x = 0; x < xcount; x++)
537 abase_x = &abase[x*axstride];
538 s = (GFC_REAL_10) 0;
539 for (n = 0; n < count; n++)
540 s += abase_x[n] * bbase_y[n];
541 dest_y[x] = s;
545 else
547 const GFC_REAL_10 *restrict bbase_y;
548 GFC_REAL_10 s;
550 for (y = 0; y < ycount; y++)
552 bbase_y = &bbase[y*bystride];
553 s = (GFC_REAL_10) 0;
554 for (n = 0; n < count; n++)
555 s += abase[n*axstride] * bbase_y[n];
556 dest[y*rystride] = s;
560 else if (axstride < aystride)
562 for (y = 0; y < ycount; y++)
563 for (x = 0; x < xcount; x++)
564 dest[x*rxstride + y*rystride] = (GFC_REAL_10)0;
566 for (y = 0; y < ycount; y++)
567 for (n = 0; n < count; n++)
568 for (x = 0; x < xcount; x++)
569 /* dest[x,y] += a[x,n] * b[n,y] */
570 dest[x*rxstride + y*rystride] +=
571 abase[x*axstride + n*aystride] *
572 bbase[n*bxstride + y*bystride];
574 else if (GFC_DESCRIPTOR_RANK (a) == 1)
576 const GFC_REAL_10 *restrict bbase_y;
577 GFC_REAL_10 s;
579 for (y = 0; y < ycount; y++)
581 bbase_y = &bbase[y*bystride];
582 s = (GFC_REAL_10) 0;
583 for (n = 0; n < count; n++)
584 s += abase[n*axstride] * bbase_y[n*bxstride];
585 dest[y*rxstride] = s;
588 else
590 const GFC_REAL_10 *restrict abase_x;
591 const GFC_REAL_10 *restrict bbase_y;
592 GFC_REAL_10 *restrict dest_y;
593 GFC_REAL_10 s;
595 for (y = 0; y < ycount; y++)
597 bbase_y = &bbase[y*bystride];
598 dest_y = &dest[y*rystride];
599 for (x = 0; x < xcount; x++)
601 abase_x = &abase[x*axstride];
602 s = (GFC_REAL_10) 0;
603 for (n = 0; n < count; n++)
604 s += abase_x[n*aystride] * bbase_y[n*bxstride];
605 dest_y[x*rxstride] = s;
610 #endif