1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
5 This file is part of the GNU Fortran runtime library (libgfortran).
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
26 #include "libgfortran.h"
31 #if defined (HAVE_GFC_INTEGER_1)
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34 passed to us by the front-end, in which case we call it for large
35 matrices.  */
37 typedef void (*blas_call
)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_INTEGER_1
*, const GFC_INTEGER_1
*,
39 const int *, const GFC_INTEGER_1
*, const int *,
40 const GFC_INTEGER_1
*, GFC_INTEGER_1
*, const int *,
43 /* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
49 The equivalent Fortran pseudo-code is:
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
68 /* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
72 extern void matmul_i1 (gfc_array_i1
* const restrict retarray
,
73 gfc_array_i1
* const restrict a
, gfc_array_i1
* const restrict b
, int try_blas
,
74 int blas_limit
, blas_call gemm
);
75 export_proto(matmul_i1
);
77 /* Put exhaustive list of possible architectures here, ORed together. */
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
83 matmul_i1_avx (gfc_array_i1
* const restrict retarray
,
84 gfc_array_i1
* const restrict a
, gfc_array_i1
* const restrict b
, int try_blas
,
85 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx")));
87 matmul_i1_avx (gfc_array_i1
* const restrict retarray
,
88 gfc_array_i1
* const restrict a
, gfc_array_i1
* const restrict b
, int try_blas
,
89 int blas_limit
, blas_call gemm
)
91 const GFC_INTEGER_1
* restrict abase
;
92 const GFC_INTEGER_1
* restrict bbase
;
93 GFC_INTEGER_1
* restrict dest
;
95 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
96 index_type x
, y
, n
, count
, xcount
, ycount
;
98 assert (GFC_DESCRIPTOR_RANK (a
) == 2
99 || GFC_DESCRIPTOR_RANK (b
) == 2);
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
103 Either A or B (but not both) can be rank 1:
105 o One-dimensional argument A is implicitly treated as a row matrix
106 dimensioned [1,count], so xcount=1.
108 o One-dimensional argument B is implicitly treated as a column matrix
109 dimensioned [count, 1], so ycount=1.
112 if (retarray
->base_addr
== NULL
)
114 if (GFC_DESCRIPTOR_RANK (a
) == 1)
116 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
117 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
119 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
121 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
122 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
126 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
127 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
129 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
130 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
131 GFC_DESCRIPTOR_EXTENT(retarray
,0));
135 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_INTEGER_1
));
136 retarray
->offset
= 0;
138 else if (unlikely (compile_options
.bounds_check
))
140 index_type ret_extent
, arg_extent
;
142 if (GFC_DESCRIPTOR_RANK (a
) == 1)
144 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
145 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
146 if (arg_extent
!= ret_extent
)
147 runtime_error ("Incorrect extent in return array in"
148 " MATMUL intrinsic: is %ld, should be %ld",
149 (long int) ret_extent
, (long int) arg_extent
);
151 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
153 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
154 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
155 if (arg_extent
!= ret_extent
)
156 runtime_error ("Incorrect extent in return array in"
157 " MATMUL intrinsic: is %ld, should be %ld",
158 (long int) ret_extent
, (long int) arg_extent
);
162 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
163 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
164 if (arg_extent
!= ret_extent
)
165 runtime_error ("Incorrect extent in return array in"
166 " MATMUL intrinsic for dimension 1:"
167 " is %ld, should be %ld",
168 (long int) ret_extent
, (long int) arg_extent
);
170 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
171 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
172 if (arg_extent
!= ret_extent
)
173 runtime_error ("Incorrect extent in return array in"
174 " MATMUL intrinsic for dimension 2:"
175 " is %ld, should be %ld",
176 (long int) ret_extent
, (long int) arg_extent
);
181 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
183 /* One-dimensional result may be addressed in the code below
184 either as a row or a column matrix. We want both cases to
186 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
190 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
191 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
195 if (GFC_DESCRIPTOR_RANK (a
) == 1)
197 /* Treat it as a row matrix A[1,count]. */
198 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
202 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
206 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
207 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
209 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
210 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
213 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
215 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
216 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
219 if (GFC_DESCRIPTOR_RANK (b
) == 1)
221 /* Treat it as a column matrix B[count,1] */
222 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
224 /* bystride should never be used for 1-dimensional b.
225 The value is only used for calculation of the
226 memory needed by the buffer. */
232 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
233 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
234 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
237 abase
= a
->base_addr
;
238 bbase
= b
->base_addr
;
239 dest
= retarray
->base_addr
;
241 /* Now that everything is set up, we perform the multiplication
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
248 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
249 && (bxstride
== 1 || bystride
== 1)
250 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
253 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
254 const GFC_INTEGER_1 one
= 1, zero
= 0;
255 const int lda
= (axstride
== 1) ? aystride
: axstride
,
256 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
258 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
260 assert (gemm
!= NULL
);
261 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
262 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
268 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
270 /* This block of code implements a tuned matmul, derived from
271 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
273 Bo Kagstrom and Per Ling
274 Department of Computing Science
276 S-901 87 Umea, Sweden
278 from netlib.org, translated to C, and modified for matmul.m4. */
280 const GFC_INTEGER_1
*a
, *b
;
282 const index_type m
= xcount
, n
= ycount
, k
= count
;
284 /* System generated locals */
285 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
286 i1
, i2
, i3
, i4
, i5
, i6
;
288 /* Local variables */
289 GFC_INTEGER_1 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
290 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
291 index_type i
, j
, l
, ii
, jj
, ll
;
292 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
297 c
= retarray
->base_addr
;
299 /* Parameter adjustments */
301 c_offset
= 1 + c_dim1
;
304 a_offset
= 1 + a_dim1
;
307 b_offset
= 1 + b_dim1
;
313 c
[i
+ j
* c_dim1
] = (GFC_INTEGER_1
)0;
315 /* Early exit if possible */
316 if (m
== 0 || n
== 0 || k
== 0)
319 /* Adjust size of t1 to what is needed. */
320 index_type t1_dim
, a_sz
;
326 t1_dim
= a_sz
* 256 + b_dim1
;
330 t1
= malloc (t1_dim
* sizeof(GFC_INTEGER_1
));
332 /* Start turning the crank. */
334 for (jj
= 1; jj
<= i1
; jj
+= 512)
340 ujsec
= jsec
- jsec
% 4;
342 for (ll
= 1; ll
<= i2
; ll
+= 256)
348 ulsec
= lsec
- lsec
% 2;
351 for (ii
= 1; ii
<= i3
; ii
+= 256)
357 uisec
= isec
- isec
% 2;
359 for (l
= ll
; l
<= i4
; l
+= 2)
362 for (i
= ii
; i
<= i5
; i
+= 2)
364 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
366 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
367 a
[i
+ (l
+ 1) * a_dim1
];
368 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
369 a
[i
+ 1 + l
* a_dim1
];
370 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
371 a
[i
+ 1 + (l
+ 1) * a_dim1
];
375 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
376 a
[ii
+ isec
- 1 + l
* a_dim1
];
377 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
378 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
384 for (i
= ii
; i
<= i4
; ++i
)
386 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
387 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
391 uisec
= isec
- isec
% 4;
393 for (j
= jj
; j
<= i4
; j
+= 4)
396 for (i
= ii
; i
<= i5
; i
+= 4)
398 f11
= c
[i
+ j
* c_dim1
];
399 f21
= c
[i
+ 1 + j
* c_dim1
];
400 f12
= c
[i
+ (j
+ 1) * c_dim1
];
401 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
402 f13
= c
[i
+ (j
+ 2) * c_dim1
];
403 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
404 f14
= c
[i
+ (j
+ 3) * c_dim1
];
405 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
406 f31
= c
[i
+ 2 + j
* c_dim1
];
407 f41
= c
[i
+ 3 + j
* c_dim1
];
408 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
409 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
410 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
411 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
412 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
413 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
415 for (l
= ll
; l
<= i6
; ++l
)
417 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
419 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
421 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
422 * b
[l
+ (j
+ 1) * b_dim1
];
423 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
424 * b
[l
+ (j
+ 1) * b_dim1
];
425 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
426 * b
[l
+ (j
+ 2) * b_dim1
];
427 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
428 * b
[l
+ (j
+ 2) * b_dim1
];
429 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
430 * b
[l
+ (j
+ 3) * b_dim1
];
431 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
432 * b
[l
+ (j
+ 3) * b_dim1
];
433 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
435 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
437 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
438 * b
[l
+ (j
+ 1) * b_dim1
];
439 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
440 * b
[l
+ (j
+ 1) * b_dim1
];
441 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
442 * b
[l
+ (j
+ 2) * b_dim1
];
443 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
444 * b
[l
+ (j
+ 2) * b_dim1
];
445 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
446 * b
[l
+ (j
+ 3) * b_dim1
];
447 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
448 * b
[l
+ (j
+ 3) * b_dim1
];
450 c
[i
+ j
* c_dim1
] = f11
;
451 c
[i
+ 1 + j
* c_dim1
] = f21
;
452 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
453 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
454 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
455 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
456 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
457 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
458 c
[i
+ 2 + j
* c_dim1
] = f31
;
459 c
[i
+ 3 + j
* c_dim1
] = f41
;
460 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
461 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
462 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
463 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
464 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
465 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
470 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
472 f11
= c
[i
+ j
* c_dim1
];
473 f12
= c
[i
+ (j
+ 1) * c_dim1
];
474 f13
= c
[i
+ (j
+ 2) * c_dim1
];
475 f14
= c
[i
+ (j
+ 3) * c_dim1
];
477 for (l
= ll
; l
<= i6
; ++l
)
479 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
480 257] * b
[l
+ j
* b_dim1
];
481 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
482 257] * b
[l
+ (j
+ 1) * b_dim1
];
483 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
484 257] * b
[l
+ (j
+ 2) * b_dim1
];
485 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
486 257] * b
[l
+ (j
+ 3) * b_dim1
];
488 c
[i
+ j
* c_dim1
] = f11
;
489 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
490 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
491 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
498 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
501 for (i
= ii
; i
<= i5
; i
+= 4)
503 f11
= c
[i
+ j
* c_dim1
];
504 f21
= c
[i
+ 1 + j
* c_dim1
];
505 f31
= c
[i
+ 2 + j
* c_dim1
];
506 f41
= c
[i
+ 3 + j
* c_dim1
];
508 for (l
= ll
; l
<= i6
; ++l
)
510 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
511 257] * b
[l
+ j
* b_dim1
];
512 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
513 257] * b
[l
+ j
* b_dim1
];
514 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
515 257] * b
[l
+ j
* b_dim1
];
516 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
517 257] * b
[l
+ j
* b_dim1
];
519 c
[i
+ j
* c_dim1
] = f11
;
520 c
[i
+ 1 + j
* c_dim1
] = f21
;
521 c
[i
+ 2 + j
* c_dim1
] = f31
;
522 c
[i
+ 3 + j
* c_dim1
] = f41
;
525 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
527 f11
= c
[i
+ j
* c_dim1
];
529 for (l
= ll
; l
<= i6
; ++l
)
531 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
532 257] * b
[l
+ j
* b_dim1
];
534 c
[i
+ j
* c_dim1
] = f11
;
544 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
546 if (GFC_DESCRIPTOR_RANK (a
) != 1)
548 const GFC_INTEGER_1
*restrict abase_x
;
549 const GFC_INTEGER_1
*restrict bbase_y
;
550 GFC_INTEGER_1
*restrict dest_y
;
553 for (y
= 0; y
< ycount
; y
++)
555 bbase_y
= &bbase
[y
*bystride
];
556 dest_y
= &dest
[y
*rystride
];
557 for (x
= 0; x
< xcount
; x
++)
559 abase_x
= &abase
[x
*axstride
];
560 s
= (GFC_INTEGER_1
) 0;
561 for (n
= 0; n
< count
; n
++)
562 s
+= abase_x
[n
] * bbase_y
[n
];
569 const GFC_INTEGER_1
*restrict bbase_y
;
572 for (y
= 0; y
< ycount
; y
++)
574 bbase_y
= &bbase
[y
*bystride
];
575 s
= (GFC_INTEGER_1
) 0;
576 for (n
= 0; n
< count
; n
++)
577 s
+= abase
[n
*axstride
] * bbase_y
[n
];
578 dest
[y
*rystride
] = s
;
582 else if (axstride
< aystride
)
584 for (y
= 0; y
< ycount
; y
++)
585 for (x
= 0; x
< xcount
; x
++)
586 dest
[x
*rxstride
+ y
*rystride
] = (GFC_INTEGER_1
)0;
588 for (y
= 0; y
< ycount
; y
++)
589 for (n
= 0; n
< count
; n
++)
590 for (x
= 0; x
< xcount
; x
++)
591 /* dest[x,y] += a[x,n] * b[n,y] */
592 dest
[x
*rxstride
+ y
*rystride
] +=
593 abase
[x
*axstride
+ n
*aystride
] *
594 bbase
[n
*bxstride
+ y
*bystride
];
596 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
598 const GFC_INTEGER_1
*restrict bbase_y
;
601 for (y
= 0; y
< ycount
; y
++)
603 bbase_y
= &bbase
[y
*bystride
];
604 s
= (GFC_INTEGER_1
) 0;
605 for (n
= 0; n
< count
; n
++)
606 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
607 dest
[y
*rxstride
] = s
;
612 const GFC_INTEGER_1
*restrict abase_x
;
613 const GFC_INTEGER_1
*restrict bbase_y
;
614 GFC_INTEGER_1
*restrict dest_y
;
617 for (y
= 0; y
< ycount
; y
++)
619 bbase_y
= &bbase
[y
*bystride
];
620 dest_y
= &dest
[y
*rystride
];
621 for (x
= 0; x
< xcount
; x
++)
623 abase_x
= &abase
[x
*axstride
];
624 s
= (GFC_INTEGER_1
) 0;
625 for (n
= 0; n
< count
; n
++)
626 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
627 dest_y
[x
*rxstride
] = s
;
636 #endif /* HAVE_AVX */
640 matmul_i1_avx2 (gfc_array_i1
* const restrict retarray
,
641 gfc_array_i1
* const restrict a
, gfc_array_i1
* const restrict b
, int try_blas
,
642 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx2,fma")));
644 matmul_i1_avx2 (gfc_array_i1
* const restrict retarray
,
645 gfc_array_i1
* const restrict a
, gfc_array_i1
* const restrict b
, int try_blas
,
646 int blas_limit
, blas_call gemm
)
648 const GFC_INTEGER_1
* restrict abase
;
649 const GFC_INTEGER_1
* restrict bbase
;
650 GFC_INTEGER_1
* restrict dest
;
652 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
653 index_type x
, y
, n
, count
, xcount
, ycount
;
655 assert (GFC_DESCRIPTOR_RANK (a
) == 2
656 || GFC_DESCRIPTOR_RANK (b
) == 2);
658 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
660 Either A or B (but not both) can be rank 1:
662 o One-dimensional argument A is implicitly treated as a row matrix
663 dimensioned [1,count], so xcount=1.
665 o One-dimensional argument B is implicitly treated as a column matrix
666 dimensioned [count, 1], so ycount=1.
669 if (retarray
->base_addr
== NULL
)
671 if (GFC_DESCRIPTOR_RANK (a
) == 1)
673 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
674 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
676 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
678 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
679 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
683 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
684 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
686 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
687 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
688 GFC_DESCRIPTOR_EXTENT(retarray
,0));
692 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_INTEGER_1
));
693 retarray
->offset
= 0;
695 else if (unlikely (compile_options
.bounds_check
))
697 index_type ret_extent
, arg_extent
;
699 if (GFC_DESCRIPTOR_RANK (a
) == 1)
701 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
702 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
703 if (arg_extent
!= ret_extent
)
704 runtime_error ("Incorrect extent in return array in"
705 " MATMUL intrinsic: is %ld, should be %ld",
706 (long int) ret_extent
, (long int) arg_extent
);
708 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
710 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
711 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
712 if (arg_extent
!= ret_extent
)
713 runtime_error ("Incorrect extent in return array in"
714 " MATMUL intrinsic: is %ld, should be %ld",
715 (long int) ret_extent
, (long int) arg_extent
);
719 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
720 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
721 if (arg_extent
!= ret_extent
)
722 runtime_error ("Incorrect extent in return array in"
723 " MATMUL intrinsic for dimension 1:"
724 " is %ld, should be %ld",
725 (long int) ret_extent
, (long int) arg_extent
);
727 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
728 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
729 if (arg_extent
!= ret_extent
)
730 runtime_error ("Incorrect extent in return array in"
731 " MATMUL intrinsic for dimension 2:"
732 " is %ld, should be %ld",
733 (long int) ret_extent
, (long int) arg_extent
);
738 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
740 /* One-dimensional result may be addressed in the code below
741 either as a row or a column matrix. We want both cases to
743 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
747 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
748 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
752 if (GFC_DESCRIPTOR_RANK (a
) == 1)
754 /* Treat it as a row matrix A[1,count]. */
755 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
759 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
763 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
764 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
766 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
767 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
770 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
772 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
773 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
776 if (GFC_DESCRIPTOR_RANK (b
) == 1)
778 /* Treat it as a column matrix B[count,1] */
779 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
781 /* bystride should never be used for 1-dimensional b.
782 The value is only used for calculation of the
783 memory needed by the buffer. */
789 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
790 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
791 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
794 abase
= a
->base_addr
;
795 bbase
= b
->base_addr
;
796 dest
= retarray
->base_addr
;
798 /* Now that everything is set up, we perform the multiplication
801 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
802 #define min(a,b) ((a) <= (b) ? (a) : (b))
803 #define max(a,b) ((a) >= (b) ? (a) : (b))
805 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
806 && (bxstride
== 1 || bystride
== 1)
807 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
810 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
811 const GFC_INTEGER_1 one
= 1, zero
= 0;
812 const int lda
= (axstride
== 1) ? aystride
: axstride
,
813 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
815 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
817 assert (gemm
!= NULL
);
818 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
819 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
825 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
827 /* This block of code implements a tuned matmul, derived from
828 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
830 Bo Kagstrom and Per Ling
831 Department of Computing Science
833 S-901 87 Umea, Sweden
835 from netlib.org, translated to C, and modified for matmul.m4. */
837 const GFC_INTEGER_1
*a
, *b
;
839 const index_type m
= xcount
, n
= ycount
, k
= count
;
841 /* System generated locals */
842 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
843 i1
, i2
, i3
, i4
, i5
, i6
;
845 /* Local variables */
846 GFC_INTEGER_1 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
847 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
848 index_type i
, j
, l
, ii
, jj
, ll
;
849 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
854 c
= retarray
->base_addr
;
856 /* Parameter adjustments */
858 c_offset
= 1 + c_dim1
;
861 a_offset
= 1 + a_dim1
;
864 b_offset
= 1 + b_dim1
;
870 c
[i
+ j
* c_dim1
] = (GFC_INTEGER_1
)0;
872 /* Early exit if possible */
873 if (m
== 0 || n
== 0 || k
== 0)
876 /* Adjust size of t1 to what is needed. */
877 index_type t1_dim
, a_sz
;
883 t1_dim
= a_sz
* 256 + b_dim1
;
887 t1
= malloc (t1_dim
* sizeof(GFC_INTEGER_1
));
889 /* Start turning the crank. */
891 for (jj
= 1; jj
<= i1
; jj
+= 512)
897 ujsec
= jsec
- jsec
% 4;
899 for (ll
= 1; ll
<= i2
; ll
+= 256)
905 ulsec
= lsec
- lsec
% 2;
908 for (ii
= 1; ii
<= i3
; ii
+= 256)
914 uisec
= isec
- isec
% 2;
916 for (l
= ll
; l
<= i4
; l
+= 2)
919 for (i
= ii
; i
<= i5
; i
+= 2)
921 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
923 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
924 a
[i
+ (l
+ 1) * a_dim1
];
925 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
926 a
[i
+ 1 + l
* a_dim1
];
927 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
928 a
[i
+ 1 + (l
+ 1) * a_dim1
];
932 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
933 a
[ii
+ isec
- 1 + l
* a_dim1
];
934 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
935 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
941 for (i
= ii
; i
<= i4
; ++i
)
943 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
944 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
948 uisec
= isec
- isec
% 4;
950 for (j
= jj
; j
<= i4
; j
+= 4)
953 for (i
= ii
; i
<= i5
; i
+= 4)
955 f11
= c
[i
+ j
* c_dim1
];
956 f21
= c
[i
+ 1 + j
* c_dim1
];
957 f12
= c
[i
+ (j
+ 1) * c_dim1
];
958 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
959 f13
= c
[i
+ (j
+ 2) * c_dim1
];
960 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
961 f14
= c
[i
+ (j
+ 3) * c_dim1
];
962 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
963 f31
= c
[i
+ 2 + j
* c_dim1
];
964 f41
= c
[i
+ 3 + j
* c_dim1
];
965 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
966 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
967 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
968 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
969 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
970 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
972 for (l
= ll
; l
<= i6
; ++l
)
974 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
976 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
978 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
979 * b
[l
+ (j
+ 1) * b_dim1
];
980 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
981 * b
[l
+ (j
+ 1) * b_dim1
];
982 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
983 * b
[l
+ (j
+ 2) * b_dim1
];
984 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
985 * b
[l
+ (j
+ 2) * b_dim1
];
986 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
987 * b
[l
+ (j
+ 3) * b_dim1
];
988 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
989 * b
[l
+ (j
+ 3) * b_dim1
];
990 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
992 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
994 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
995 * b
[l
+ (j
+ 1) * b_dim1
];
996 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
997 * b
[l
+ (j
+ 1) * b_dim1
];
998 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
999 * b
[l
+ (j
+ 2) * b_dim1
];
1000 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1001 * b
[l
+ (j
+ 2) * b_dim1
];
1002 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1003 * b
[l
+ (j
+ 3) * b_dim1
];
1004 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1005 * b
[l
+ (j
+ 3) * b_dim1
];
1007 c
[i
+ j
* c_dim1
] = f11
;
1008 c
[i
+ 1 + j
* c_dim1
] = f21
;
1009 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1010 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
1011 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1012 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
1013 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1014 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
1015 c
[i
+ 2 + j
* c_dim1
] = f31
;
1016 c
[i
+ 3 + j
* c_dim1
] = f41
;
1017 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
1018 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
1019 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
1020 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
1021 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
1022 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
1027 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1029 f11
= c
[i
+ j
* c_dim1
];
1030 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1031 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1032 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1034 for (l
= ll
; l
<= i6
; ++l
)
1036 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1037 257] * b
[l
+ j
* b_dim1
];
1038 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1039 257] * b
[l
+ (j
+ 1) * b_dim1
];
1040 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1041 257] * b
[l
+ (j
+ 2) * b_dim1
];
1042 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1043 257] * b
[l
+ (j
+ 3) * b_dim1
];
1045 c
[i
+ j
* c_dim1
] = f11
;
1046 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1047 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1048 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1055 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
1057 i5
= ii
+ uisec
- 1;
1058 for (i
= ii
; i
<= i5
; i
+= 4)
1060 f11
= c
[i
+ j
* c_dim1
];
1061 f21
= c
[i
+ 1 + j
* c_dim1
];
1062 f31
= c
[i
+ 2 + j
* c_dim1
];
1063 f41
= c
[i
+ 3 + j
* c_dim1
];
1065 for (l
= ll
; l
<= i6
; ++l
)
1067 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1068 257] * b
[l
+ j
* b_dim1
];
1069 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
1070 257] * b
[l
+ j
* b_dim1
];
1071 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
1072 257] * b
[l
+ j
* b_dim1
];
1073 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
1074 257] * b
[l
+ j
* b_dim1
];
1076 c
[i
+ j
* c_dim1
] = f11
;
1077 c
[i
+ 1 + j
* c_dim1
] = f21
;
1078 c
[i
+ 2 + j
* c_dim1
] = f31
;
1079 c
[i
+ 3 + j
* c_dim1
] = f41
;
1082 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1084 f11
= c
[i
+ j
* c_dim1
];
1086 for (l
= ll
; l
<= i6
; ++l
)
1088 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1089 257] * b
[l
+ j
* b_dim1
];
1091 c
[i
+ j
* c_dim1
] = f11
;
1101 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
1103 if (GFC_DESCRIPTOR_RANK (a
) != 1)
1105 const GFC_INTEGER_1
*restrict abase_x
;
1106 const GFC_INTEGER_1
*restrict bbase_y
;
1107 GFC_INTEGER_1
*restrict dest_y
;
1110 for (y
= 0; y
< ycount
; y
++)
1112 bbase_y
= &bbase
[y
*bystride
];
1113 dest_y
= &dest
[y
*rystride
];
1114 for (x
= 0; x
< xcount
; x
++)
1116 abase_x
= &abase
[x
*axstride
];
1117 s
= (GFC_INTEGER_1
) 0;
1118 for (n
= 0; n
< count
; n
++)
1119 s
+= abase_x
[n
] * bbase_y
[n
];
1126 const GFC_INTEGER_1
*restrict bbase_y
;
1129 for (y
= 0; y
< ycount
; y
++)
1131 bbase_y
= &bbase
[y
*bystride
];
1132 s
= (GFC_INTEGER_1
) 0;
1133 for (n
= 0; n
< count
; n
++)
1134 s
+= abase
[n
*axstride
] * bbase_y
[n
];
1135 dest
[y
*rystride
] = s
;
1139 else if (axstride
< aystride
)
1141 for (y
= 0; y
< ycount
; y
++)
1142 for (x
= 0; x
< xcount
; x
++)
1143 dest
[x
*rxstride
+ y
*rystride
] = (GFC_INTEGER_1
)0;
1145 for (y
= 0; y
< ycount
; y
++)
1146 for (n
= 0; n
< count
; n
++)
1147 for (x
= 0; x
< xcount
; x
++)
1148 /* dest[x,y] += a[x,n] * b[n,y] */
1149 dest
[x
*rxstride
+ y
*rystride
] +=
1150 abase
[x
*axstride
+ n
*aystride
] *
1151 bbase
[n
*bxstride
+ y
*bystride
];
1153 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
1155 const GFC_INTEGER_1
*restrict bbase_y
;
1158 for (y
= 0; y
< ycount
; y
++)
1160 bbase_y
= &bbase
[y
*bystride
];
1161 s
= (GFC_INTEGER_1
) 0;
1162 for (n
= 0; n
< count
; n
++)
1163 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
1164 dest
[y
*rxstride
] = s
;
1169 const GFC_INTEGER_1
*restrict abase_x
;
1170 const GFC_INTEGER_1
*restrict bbase_y
;
1171 GFC_INTEGER_1
*restrict dest_y
;
1174 for (y
= 0; y
< ycount
; y
++)
1176 bbase_y
= &bbase
[y
*bystride
];
1177 dest_y
= &dest
[y
*rystride
];
1178 for (x
= 0; x
< xcount
; x
++)
1180 abase_x
= &abase
[x
*axstride
];
1181 s
= (GFC_INTEGER_1
) 0;
1182 for (n
= 0; n
< count
; n
++)
1183 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
1184 dest_y
[x
*rxstride
] = s
;
1193 #endif /* HAVE_AVX2 */
1197 matmul_i1_avx512f (gfc_array_i1
* const restrict retarray
,
1198 gfc_array_i1
* const restrict a
, gfc_array_i1
* const restrict b
, int try_blas
,
1199 int blas_limit
, blas_call gemm
) __attribute__((__target__("avx512f")));
1201 matmul_i1_avx512f (gfc_array_i1
* const restrict retarray
,
1202 gfc_array_i1
* const restrict a
, gfc_array_i1
* const restrict b
, int try_blas
,
1203 int blas_limit
, blas_call gemm
)
1205 const GFC_INTEGER_1
* restrict abase
;
1206 const GFC_INTEGER_1
* restrict bbase
;
1207 GFC_INTEGER_1
* restrict dest
;
1209 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
1210 index_type x
, y
, n
, count
, xcount
, ycount
;
1212 assert (GFC_DESCRIPTOR_RANK (a
) == 2
1213 || GFC_DESCRIPTOR_RANK (b
) == 2);
1215 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1217 Either A or B (but not both) can be rank 1:
1219 o One-dimensional argument A is implicitly treated as a row matrix
1220 dimensioned [1,count], so xcount=1.
1222 o One-dimensional argument B is implicitly treated as a column matrix
1223 dimensioned [count, 1], so ycount=1.
1226 if (retarray
->base_addr
== NULL
)
1228 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1230 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1231 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
1233 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1235 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1236 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1240 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1241 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1243 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
1244 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
1245 GFC_DESCRIPTOR_EXTENT(retarray
,0));
1249 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_INTEGER_1
));
1250 retarray
->offset
= 0;
1252 else if (unlikely (compile_options
.bounds_check
))
1254 index_type ret_extent
, arg_extent
;
1256 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1258 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1259 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1260 if (arg_extent
!= ret_extent
)
1261 runtime_error ("Incorrect extent in return array in"
1262 " MATMUL intrinsic: is %ld, should be %ld",
1263 (long int) ret_extent
, (long int) arg_extent
);
1265 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1267 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1268 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1269 if (arg_extent
!= ret_extent
)
1270 runtime_error ("Incorrect extent in return array in"
1271 " MATMUL intrinsic: is %ld, should be %ld",
1272 (long int) ret_extent
, (long int) arg_extent
);
1276 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1277 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1278 if (arg_extent
!= ret_extent
)
1279 runtime_error ("Incorrect extent in return array in"
1280 " MATMUL intrinsic for dimension 1:"
1281 " is %ld, should be %ld",
1282 (long int) ret_extent
, (long int) arg_extent
);
1284 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1285 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
1286 if (arg_extent
!= ret_extent
)
1287 runtime_error ("Incorrect extent in return array in"
1288 " MATMUL intrinsic for dimension 2:"
1289 " is %ld, should be %ld",
1290 (long int) ret_extent
, (long int) arg_extent
);
1295 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
1297 /* One-dimensional result may be addressed in the code below
1298 either as a row or a column matrix. We want both cases to
1300 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1304 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1305 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
1309 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1311 /* Treat it as a row matrix A[1,count]. */
1312 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1316 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
1320 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1321 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
1323 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
1324 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
1327 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
1329 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
1330 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1333 if (GFC_DESCRIPTOR_RANK (b
) == 1)
1335 /* Treat it as a column matrix B[count,1] */
1336 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1338 /* bystride should never be used for 1-dimensional b.
1339 The value is only used for calculation of the
1340 memory by the buffer. */
1346 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1347 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
1348 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
1351 abase
= a
->base_addr
;
1352 bbase
= b
->base_addr
;
1353 dest
= retarray
->base_addr
;
1355 /* Now that everything is set up, we perform the multiplication
1358 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1359 #define min(a,b) ((a) <= (b) ? (a) : (b))
1360 #define max(a,b) ((a) >= (b) ? (a) : (b))
1362 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
1363 && (bxstride
== 1 || bystride
== 1)
1364 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
1365 > POW3(blas_limit
)))
1367 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
1368 const GFC_INTEGER_1 one
= 1, zero
= 0;
1369 const int lda
= (axstride
== 1) ? aystride
: axstride
,
1370 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
1372 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
1374 assert (gemm
!= NULL
);
1375 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
1376 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
1382 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
1384 /* This block of code implements a tuned matmul, derived from
1385 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1387 Bo Kagstrom and Per Ling
1388 Department of Computing Science
1390 S-901 87 Umea, Sweden
1392 from netlib.org, translated to C, and modified for matmul.m4. */
1394 const GFC_INTEGER_1
*a
, *b
;
1396 const index_type m
= xcount
, n
= ycount
, k
= count
;
1398 /* System generated locals */
1399 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
1400 i1
, i2
, i3
, i4
, i5
, i6
;
1402 /* Local variables */
1403 GFC_INTEGER_1 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
1404 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
1405 index_type i
, j
, l
, ii
, jj
, ll
;
1406 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
1411 c
= retarray
->base_addr
;
1413 /* Parameter adjustments */
1415 c_offset
= 1 + c_dim1
;
1418 a_offset
= 1 + a_dim1
;
1421 b_offset
= 1 + b_dim1
;
1424 /* Empty c first. */
1425 for (j
=1; j
<=n
; j
++)
1426 for (i
=1; i
<=m
; i
++)
1427 c
[i
+ j
* c_dim1
] = (GFC_INTEGER_1
)0;
1429 /* Early exit if possible */
1430 if (m
== 0 || n
== 0 || k
== 0)
1433 /* Adjust size of t1 to what is needed. */
1434 index_type t1_dim
, a_sz
;
1440 t1_dim
= a_sz
* 256 + b_dim1
;
1444 t1
= malloc (t1_dim
* sizeof(GFC_INTEGER_1
));
1446 /* Start turning the crank. */
1448 for (jj
= 1; jj
<= i1
; jj
+= 512)
1454 ujsec
= jsec
- jsec
% 4;
1456 for (ll
= 1; ll
<= i2
; ll
+= 256)
1462 ulsec
= lsec
- lsec
% 2;
1465 for (ii
= 1; ii
<= i3
; ii
+= 256)
1471 uisec
= isec
- isec
% 2;
1472 i4
= ll
+ ulsec
- 1;
1473 for (l
= ll
; l
<= i4
; l
+= 2)
1475 i5
= ii
+ uisec
- 1;
1476 for (i
= ii
; i
<= i5
; i
+= 2)
1478 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
1480 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
1481 a
[i
+ (l
+ 1) * a_dim1
];
1482 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
1483 a
[i
+ 1 + l
* a_dim1
];
1484 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
1485 a
[i
+ 1 + (l
+ 1) * a_dim1
];
1489 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
1490 a
[ii
+ isec
- 1 + l
* a_dim1
];
1491 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
1492 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
1498 for (i
= ii
; i
<= i4
; ++i
)
1500 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
1501 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
1505 uisec
= isec
- isec
% 4;
1506 i4
= jj
+ ujsec
- 1;
1507 for (j
= jj
; j
<= i4
; j
+= 4)
1509 i5
= ii
+ uisec
- 1;
1510 for (i
= ii
; i
<= i5
; i
+= 4)
1512 f11
= c
[i
+ j
* c_dim1
];
1513 f21
= c
[i
+ 1 + j
* c_dim1
];
1514 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1515 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
1516 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1517 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
1518 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1519 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
1520 f31
= c
[i
+ 2 + j
* c_dim1
];
1521 f41
= c
[i
+ 3 + j
* c_dim1
];
1522 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
1523 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
1524 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
1525 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
1526 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
1527 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
1529 for (l
= ll
; l
<= i6
; ++l
)
1531 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1532 * b
[l
+ j
* b_dim1
];
1533 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1534 * b
[l
+ j
* b_dim1
];
1535 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1536 * b
[l
+ (j
+ 1) * b_dim1
];
1537 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1538 * b
[l
+ (j
+ 1) * b_dim1
];
1539 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1540 * b
[l
+ (j
+ 2) * b_dim1
];
1541 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1542 * b
[l
+ (j
+ 2) * b_dim1
];
1543 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
1544 * b
[l
+ (j
+ 3) * b_dim1
];
1545 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
1546 * b
[l
+ (j
+ 3) * b_dim1
];
1547 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1548 * b
[l
+ j
* b_dim1
];
1549 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1550 * b
[l
+ j
* b_dim1
];
1551 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1552 * b
[l
+ (j
+ 1) * b_dim1
];
1553 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1554 * b
[l
+ (j
+ 1) * b_dim1
];
1555 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1556 * b
[l
+ (j
+ 2) * b_dim1
];
1557 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1558 * b
[l
+ (j
+ 2) * b_dim1
];
1559 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
1560 * b
[l
+ (j
+ 3) * b_dim1
];
1561 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
1562 * b
[l
+ (j
+ 3) * b_dim1
];
1564 c
[i
+ j
* c_dim1
] = f11
;
1565 c
[i
+ 1 + j
* c_dim1
] = f21
;
1566 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1567 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
1568 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1569 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
1570 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1571 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
1572 c
[i
+ 2 + j
* c_dim1
] = f31
;
1573 c
[i
+ 3 + j
* c_dim1
] = f41
;
1574 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
1575 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
1576 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
1577 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
1578 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
1579 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
1584 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1586 f11
= c
[i
+ j
* c_dim1
];
1587 f12
= c
[i
+ (j
+ 1) * c_dim1
];
1588 f13
= c
[i
+ (j
+ 2) * c_dim1
];
1589 f14
= c
[i
+ (j
+ 3) * c_dim1
];
1591 for (l
= ll
; l
<= i6
; ++l
)
1593 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1594 257] * b
[l
+ j
* b_dim1
];
1595 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1596 257] * b
[l
+ (j
+ 1) * b_dim1
];
1597 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1598 257] * b
[l
+ (j
+ 2) * b_dim1
];
1599 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1600 257] * b
[l
+ (j
+ 3) * b_dim1
];
1602 c
[i
+ j
* c_dim1
] = f11
;
1603 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
1604 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
1605 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
1612 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
1614 i5
= ii
+ uisec
- 1;
1615 for (i
= ii
; i
<= i5
; i
+= 4)
1617 f11
= c
[i
+ j
* c_dim1
];
1618 f21
= c
[i
+ 1 + j
* c_dim1
];
1619 f31
= c
[i
+ 2 + j
* c_dim1
];
1620 f41
= c
[i
+ 3 + j
* c_dim1
];
1622 for (l
= ll
; l
<= i6
; ++l
)
1624 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1625 257] * b
[l
+ j
* b_dim1
];
1626 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
1627 257] * b
[l
+ j
* b_dim1
];
1628 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
1629 257] * b
[l
+ j
* b_dim1
];
1630 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
1631 257] * b
[l
+ j
* b_dim1
];
1633 c
[i
+ j
* c_dim1
] = f11
;
1634 c
[i
+ 1 + j
* c_dim1
] = f21
;
1635 c
[i
+ 2 + j
* c_dim1
] = f31
;
1636 c
[i
+ 3 + j
* c_dim1
] = f41
;
1639 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
1641 f11
= c
[i
+ j
* c_dim1
];
1643 for (l
= ll
; l
<= i6
; ++l
)
1645 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
1646 257] * b
[l
+ j
* b_dim1
];
1648 c
[i
+ j
* c_dim1
] = f11
;
1658 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
1660 if (GFC_DESCRIPTOR_RANK (a
) != 1)
1662 const GFC_INTEGER_1
*restrict abase_x
;
1663 const GFC_INTEGER_1
*restrict bbase_y
;
1664 GFC_INTEGER_1
*restrict dest_y
;
1667 for (y
= 0; y
< ycount
; y
++)
1669 bbase_y
= &bbase
[y
*bystride
];
1670 dest_y
= &dest
[y
*rystride
];
1671 for (x
= 0; x
< xcount
; x
++)
1673 abase_x
= &abase
[x
*axstride
];
1674 s
= (GFC_INTEGER_1
) 0;
1675 for (n
= 0; n
< count
; n
++)
1676 s
+= abase_x
[n
] * bbase_y
[n
];
1683 const GFC_INTEGER_1
*restrict bbase_y
;
1686 for (y
= 0; y
< ycount
; y
++)
1688 bbase_y
= &bbase
[y
*bystride
];
1689 s
= (GFC_INTEGER_1
) 0;
1690 for (n
= 0; n
< count
; n
++)
1691 s
+= abase
[n
*axstride
] * bbase_y
[n
];
1692 dest
[y
*rystride
] = s
;
1696 else if (axstride
< aystride
)
1698 for (y
= 0; y
< ycount
; y
++)
1699 for (x
= 0; x
< xcount
; x
++)
1700 dest
[x
*rxstride
+ y
*rystride
] = (GFC_INTEGER_1
)0;
1702 for (y
= 0; y
< ycount
; y
++)
1703 for (n
= 0; n
< count
; n
++)
1704 for (x
= 0; x
< xcount
; x
++)
1705 /* dest[x,y] += a[x,n] * b[n,y] */
1706 dest
[x
*rxstride
+ y
*rystride
] +=
1707 abase
[x
*axstride
+ n
*aystride
] *
1708 bbase
[n
*bxstride
+ y
*bystride
];
1710 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
1712 const GFC_INTEGER_1
*restrict bbase_y
;
1715 for (y
= 0; y
< ycount
; y
++)
1717 bbase_y
= &bbase
[y
*bystride
];
1718 s
= (GFC_INTEGER_1
) 0;
1719 for (n
= 0; n
< count
; n
++)
1720 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
1721 dest
[y
*rxstride
] = s
;
1726 const GFC_INTEGER_1
*restrict abase_x
;
1727 const GFC_INTEGER_1
*restrict bbase_y
;
1728 GFC_INTEGER_1
*restrict dest_y
;
1731 for (y
= 0; y
< ycount
; y
++)
1733 bbase_y
= &bbase
[y
*bystride
];
1734 dest_y
= &dest
[y
*rystride
];
1735 for (x
= 0; x
< xcount
; x
++)
1737 abase_x
= &abase
[x
*axstride
];
1738 s
= (GFC_INTEGER_1
) 0;
1739 for (n
= 0; n
< count
; n
++)
1740 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
1741 dest_y
[x
*rxstride
] = s
;
1750 #endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */

#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
/* AVX + FMA3 kernel for AMD CPUs; selected at runtime by the matmul_i1
   dispatcher.  */
static void
matmul_i1_avx128_fma3 (gfc_array_i1 * const restrict retarray,
	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b,
	int try_blas, int blas_limit, blas_call gemm)
	__attribute__((__target__("avx,fma")));
internal_proto(matmul_i1_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
/* AVX + FMA4 kernel for older AMD CPUs; selected at runtime by the
   matmul_i1 dispatcher.  */
static void
matmul_i1_avx128_fma4 (gfc_array_i1 * const restrict retarray,
	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b,
	int try_blas, int blas_limit, blas_call gemm)
	__attribute__((__target__("avx,fma4")));
internal_proto(matmul_i1_avx128_fma4);
#endif
1770 /* Function to fall back to if there is no special processor-specific version. */
1772 matmul_i1_vanilla (gfc_array_i1
* const restrict retarray
,
1773 gfc_array_i1
* const restrict a
, gfc_array_i1
* const restrict b
, int try_blas
,
1774 int blas_limit
, blas_call gemm
)
1776 const GFC_INTEGER_1
* restrict abase
;
1777 const GFC_INTEGER_1
* restrict bbase
;
1778 GFC_INTEGER_1
* restrict dest
;
1780 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
1781 index_type x
, y
, n
, count
, xcount
, ycount
;
1783 assert (GFC_DESCRIPTOR_RANK (a
) == 2
1784 || GFC_DESCRIPTOR_RANK (b
) == 2);
1786 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1788 Either A or B (but not both) can be rank 1:
1790 o One-dimensional argument A is implicitly treated as a row matrix
1791 dimensioned [1,count], so xcount=1.
1793 o One-dimensional argument B is implicitly treated as a column matrix
1794 dimensioned [count, 1], so ycount=1.
1797 if (retarray
->base_addr
== NULL
)
1799 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1801 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1802 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
1804 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1806 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1807 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1811 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
1812 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
1814 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
1815 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
1816 GFC_DESCRIPTOR_EXTENT(retarray
,0));
1820 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_INTEGER_1
));
1821 retarray
->offset
= 0;
1823 else if (unlikely (compile_options
.bounds_check
))
1825 index_type ret_extent
, arg_extent
;
1827 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1829 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1830 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1831 if (arg_extent
!= ret_extent
)
1832 runtime_error ("Incorrect extent in return array in"
1833 " MATMUL intrinsic: is %ld, should be %ld",
1834 (long int) ret_extent
, (long int) arg_extent
);
1836 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
1838 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1839 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1840 if (arg_extent
!= ret_extent
)
1841 runtime_error ("Incorrect extent in return array in"
1842 " MATMUL intrinsic: is %ld, should be %ld",
1843 (long int) ret_extent
, (long int) arg_extent
);
1847 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
1848 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
1849 if (arg_extent
!= ret_extent
)
1850 runtime_error ("Incorrect extent in return array in"
1851 " MATMUL intrinsic for dimension 1:"
1852 " is %ld, should be %ld",
1853 (long int) ret_extent
, (long int) arg_extent
);
1855 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
1856 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
1857 if (arg_extent
!= ret_extent
)
1858 runtime_error ("Incorrect extent in return array in"
1859 " MATMUL intrinsic for dimension 2:"
1860 " is %ld, should be %ld",
1861 (long int) ret_extent
, (long int) arg_extent
);
1866 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
1868 /* One-dimensional result may be addressed in the code below
1869 either as a row or a column matrix. We want both cases to
1871 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1875 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
1876 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
1880 if (GFC_DESCRIPTOR_RANK (a
) == 1)
1882 /* Treat it as a row matrix A[1,count]. */
1883 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1887 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
1891 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
1892 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
1894 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
1895 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
1898 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
1900 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
1901 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1904 if (GFC_DESCRIPTOR_RANK (b
) == 1)
1906 /* Treat it as a column matrix B[count,1] */
1907 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1909 /* bystride should never be used for 1-dimensional b.
1910 The value is only used for calculation of the
1911 memory by the buffer. */
1917 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
1918 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
1919 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
1922 abase
= a
->base_addr
;
1923 bbase
= b
->base_addr
;
1924 dest
= retarray
->base_addr
;
1926 /* Now that everything is set up, we perform the multiplication
1929 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1930 #define min(a,b) ((a) <= (b) ? (a) : (b))
1931 #define max(a,b) ((a) >= (b) ? (a) : (b))
1933 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
1934 && (bxstride
== 1 || bystride
== 1)
1935 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
1936 > POW3(blas_limit
)))
1938 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
1939 const GFC_INTEGER_1 one
= 1, zero
= 0;
1940 const int lda
= (axstride
== 1) ? aystride
: axstride
,
1941 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
1943 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
1945 assert (gemm
!= NULL
);
1946 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
1947 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
1953 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
1955 /* This block of code implements a tuned matmul, derived from
1956 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1958 Bo Kagstrom and Per Ling
1959 Department of Computing Science
1961 S-901 87 Umea, Sweden
1963 from netlib.org, translated to C, and modified for matmul.m4. */
1965 const GFC_INTEGER_1
*a
, *b
;
1967 const index_type m
= xcount
, n
= ycount
, k
= count
;
1969 /* System generated locals */
1970 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
1971 i1
, i2
, i3
, i4
, i5
, i6
;
1973 /* Local variables */
1974 GFC_INTEGER_1 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
1975 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
1976 index_type i
, j
, l
, ii
, jj
, ll
;
1977 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
1982 c
= retarray
->base_addr
;
1984 /* Parameter adjustments */
1986 c_offset
= 1 + c_dim1
;
1989 a_offset
= 1 + a_dim1
;
1992 b_offset
= 1 + b_dim1
;
1995 /* Empty c first. */
1996 for (j
=1; j
<=n
; j
++)
1997 for (i
=1; i
<=m
; i
++)
1998 c
[i
+ j
* c_dim1
] = (GFC_INTEGER_1
)0;
2000 /* Early exit if possible */
2001 if (m
== 0 || n
== 0 || k
== 0)
2004 /* Adjust size of t1 to what is needed. */
2005 index_type t1_dim
, a_sz
;
2011 t1_dim
= a_sz
* 256 + b_dim1
;
2015 t1
= malloc (t1_dim
* sizeof(GFC_INTEGER_1
));
2017 /* Start turning the crank. */
2019 for (jj
= 1; jj
<= i1
; jj
+= 512)
2025 ujsec
= jsec
- jsec
% 4;
2027 for (ll
= 1; ll
<= i2
; ll
+= 256)
2033 ulsec
= lsec
- lsec
% 2;
2036 for (ii
= 1; ii
<= i3
; ii
+= 256)
2042 uisec
= isec
- isec
% 2;
2043 i4
= ll
+ ulsec
- 1;
2044 for (l
= ll
; l
<= i4
; l
+= 2)
2046 i5
= ii
+ uisec
- 1;
2047 for (i
= ii
; i
<= i5
; i
+= 2)
2049 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
2051 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
2052 a
[i
+ (l
+ 1) * a_dim1
];
2053 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
2054 a
[i
+ 1 + l
* a_dim1
];
2055 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
2056 a
[i
+ 1 + (l
+ 1) * a_dim1
];
2060 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
2061 a
[ii
+ isec
- 1 + l
* a_dim1
];
2062 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
2063 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
2069 for (i
= ii
; i
<= i4
; ++i
)
2071 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
2072 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
2076 uisec
= isec
- isec
% 4;
2077 i4
= jj
+ ujsec
- 1;
2078 for (j
= jj
; j
<= i4
; j
+= 4)
2080 i5
= ii
+ uisec
- 1;
2081 for (i
= ii
; i
<= i5
; i
+= 4)
2083 f11
= c
[i
+ j
* c_dim1
];
2084 f21
= c
[i
+ 1 + j
* c_dim1
];
2085 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2086 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
2087 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2088 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
2089 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2090 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
2091 f31
= c
[i
+ 2 + j
* c_dim1
];
2092 f41
= c
[i
+ 3 + j
* c_dim1
];
2093 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
2094 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
2095 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
2096 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
2097 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
2098 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
2100 for (l
= ll
; l
<= i6
; ++l
)
2102 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2103 * b
[l
+ j
* b_dim1
];
2104 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2105 * b
[l
+ j
* b_dim1
];
2106 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2107 * b
[l
+ (j
+ 1) * b_dim1
];
2108 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2109 * b
[l
+ (j
+ 1) * b_dim1
];
2110 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2111 * b
[l
+ (j
+ 2) * b_dim1
];
2112 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2113 * b
[l
+ (j
+ 2) * b_dim1
];
2114 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2115 * b
[l
+ (j
+ 3) * b_dim1
];
2116 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2117 * b
[l
+ (j
+ 3) * b_dim1
];
2118 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2119 * b
[l
+ j
* b_dim1
];
2120 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2121 * b
[l
+ j
* b_dim1
];
2122 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2123 * b
[l
+ (j
+ 1) * b_dim1
];
2124 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2125 * b
[l
+ (j
+ 1) * b_dim1
];
2126 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2127 * b
[l
+ (j
+ 2) * b_dim1
];
2128 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2129 * b
[l
+ (j
+ 2) * b_dim1
];
2130 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2131 * b
[l
+ (j
+ 3) * b_dim1
];
2132 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2133 * b
[l
+ (j
+ 3) * b_dim1
];
2135 c
[i
+ j
* c_dim1
] = f11
;
2136 c
[i
+ 1 + j
* c_dim1
] = f21
;
2137 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2138 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
2139 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2140 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
2141 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2142 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
2143 c
[i
+ 2 + j
* c_dim1
] = f31
;
2144 c
[i
+ 3 + j
* c_dim1
] = f41
;
2145 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
2146 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
2147 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
2148 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
2149 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
2150 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
2155 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2157 f11
= c
[i
+ j
* c_dim1
];
2158 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2159 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2160 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2162 for (l
= ll
; l
<= i6
; ++l
)
2164 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2165 257] * b
[l
+ j
* b_dim1
];
2166 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2167 257] * b
[l
+ (j
+ 1) * b_dim1
];
2168 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2169 257] * b
[l
+ (j
+ 2) * b_dim1
];
2170 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2171 257] * b
[l
+ (j
+ 3) * b_dim1
];
2173 c
[i
+ j
* c_dim1
] = f11
;
2174 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2175 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2176 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2183 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
2185 i5
= ii
+ uisec
- 1;
2186 for (i
= ii
; i
<= i5
; i
+= 4)
2188 f11
= c
[i
+ j
* c_dim1
];
2189 f21
= c
[i
+ 1 + j
* c_dim1
];
2190 f31
= c
[i
+ 2 + j
* c_dim1
];
2191 f41
= c
[i
+ 3 + j
* c_dim1
];
2193 for (l
= ll
; l
<= i6
; ++l
)
2195 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2196 257] * b
[l
+ j
* b_dim1
];
2197 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
2198 257] * b
[l
+ j
* b_dim1
];
2199 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
2200 257] * b
[l
+ j
* b_dim1
];
2201 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
2202 257] * b
[l
+ j
* b_dim1
];
2204 c
[i
+ j
* c_dim1
] = f11
;
2205 c
[i
+ 1 + j
* c_dim1
] = f21
;
2206 c
[i
+ 2 + j
* c_dim1
] = f31
;
2207 c
[i
+ 3 + j
* c_dim1
] = f41
;
2210 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2212 f11
= c
[i
+ j
* c_dim1
];
2214 for (l
= ll
; l
<= i6
; ++l
)
2216 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2217 257] * b
[l
+ j
* b_dim1
];
2219 c
[i
+ j
* c_dim1
] = f11
;
2229 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
2231 if (GFC_DESCRIPTOR_RANK (a
) != 1)
2233 const GFC_INTEGER_1
*restrict abase_x
;
2234 const GFC_INTEGER_1
*restrict bbase_y
;
2235 GFC_INTEGER_1
*restrict dest_y
;
2238 for (y
= 0; y
< ycount
; y
++)
2240 bbase_y
= &bbase
[y
*bystride
];
2241 dest_y
= &dest
[y
*rystride
];
2242 for (x
= 0; x
< xcount
; x
++)
2244 abase_x
= &abase
[x
*axstride
];
2245 s
= (GFC_INTEGER_1
) 0;
2246 for (n
= 0; n
< count
; n
++)
2247 s
+= abase_x
[n
] * bbase_y
[n
];
2254 const GFC_INTEGER_1
*restrict bbase_y
;
2257 for (y
= 0; y
< ycount
; y
++)
2259 bbase_y
= &bbase
[y
*bystride
];
2260 s
= (GFC_INTEGER_1
) 0;
2261 for (n
= 0; n
< count
; n
++)
2262 s
+= abase
[n
*axstride
] * bbase_y
[n
];
2263 dest
[y
*rystride
] = s
;
2267 else if (axstride
< aystride
)
2269 for (y
= 0; y
< ycount
; y
++)
2270 for (x
= 0; x
< xcount
; x
++)
2271 dest
[x
*rxstride
+ y
*rystride
] = (GFC_INTEGER_1
)0;
2273 for (y
= 0; y
< ycount
; y
++)
2274 for (n
= 0; n
< count
; n
++)
2275 for (x
= 0; x
< xcount
; x
++)
2276 /* dest[x,y] += a[x,n] * b[n,y] */
2277 dest
[x
*rxstride
+ y
*rystride
] +=
2278 abase
[x
*axstride
+ n
*aystride
] *
2279 bbase
[n
*bxstride
+ y
*bystride
];
2281 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
2283 const GFC_INTEGER_1
*restrict bbase_y
;
2286 for (y
= 0; y
< ycount
; y
++)
2288 bbase_y
= &bbase
[y
*bystride
];
2289 s
= (GFC_INTEGER_1
) 0;
2290 for (n
= 0; n
< count
; n
++)
2291 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
2292 dest
[y
*rxstride
] = s
;
2297 const GFC_INTEGER_1
*restrict abase_x
;
2298 const GFC_INTEGER_1
*restrict bbase_y
;
2299 GFC_INTEGER_1
*restrict dest_y
;
2302 for (y
= 0; y
< ycount
; y
++)
2304 bbase_y
= &bbase
[y
*bystride
];
2305 dest_y
= &dest
[y
*rystride
];
2306 for (x
= 0; x
< xcount
; x
++)
2308 abase_x
= &abase
[x
*axstride
];
2309 s
= (GFC_INTEGER_1
) 0;
2310 for (n
= 0; n
< count
; n
++)
2311 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
2312 dest_y
[x
*rxstride
] = s
;
2322 /* Compiling main function, with selection code for the processor. */
2324 /* Currently, this is i386 only. Adjust for other architectures. */
2326 #include <config/i386/cpuinfo.h>
2327 void matmul_i1 (gfc_array_i1
* const restrict retarray
,
2328 gfc_array_i1
* const restrict a
, gfc_array_i1
* const restrict b
, int try_blas
,
2329 int blas_limit
, blas_call gemm
)
2331 static void (*matmul_p
) (gfc_array_i1
* const restrict retarray
,
2332 gfc_array_i1
* const restrict a
, gfc_array_i1
* const restrict b
, int try_blas
,
2333 int blas_limit
, blas_call gemm
);
2335 void (*matmul_fn
) (gfc_array_i1
* const restrict retarray
,
2336 gfc_array_i1
* const restrict a
, gfc_array_i1
* const restrict b
, int try_blas
,
2337 int blas_limit
, blas_call gemm
);
2339 matmul_fn
= __atomic_load_n (&matmul_p
, __ATOMIC_RELAXED
);
2340 if (matmul_fn
== NULL
)
2342 matmul_fn
= matmul_i1_vanilla
;
2343 if (__cpu_model
.__cpu_vendor
== VENDOR_INTEL
)
2345 /* Run down the available processors in order of preference. */
2347 if (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX512F
))
2349 matmul_fn
= matmul_i1_avx512f
;
2353 #endif /* HAVE_AVX512F */
2356 if ((__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX2
))
2357 && (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_FMA
)))
2359 matmul_fn
= matmul_i1_avx2
;
2366 if (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX
))
2368 matmul_fn
= matmul_i1_avx
;
2371 #endif /* HAVE_AVX */
2373 else if (__cpu_model
.__cpu_vendor
== VENDOR_AMD
)
2375 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
2376 if ((__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX
))
2377 && (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_FMA
)))
2379 matmul_fn
= matmul_i1_avx128_fma3
;
2383 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
2384 if ((__cpu_model
.__cpu_features
[0] & (1 << FEATURE_AVX
))
2385 && (__cpu_model
.__cpu_features
[0] & (1 << FEATURE_FMA4
)))
2387 matmul_fn
= matmul_i1_avx128_fma4
;
2394 __atomic_store_n (&matmul_p
, matmul_fn
, __ATOMIC_RELAXED
);
2397 (*matmul_fn
) (retarray
, a
, b
, try_blas
, blas_limit
, gemm
);
2400 #else /* Just the vanilla function. */
2403 matmul_i1 (gfc_array_i1
* const restrict retarray
,
2404 gfc_array_i1
* const restrict a
, gfc_array_i1
* const restrict b
, int try_blas
,
2405 int blas_limit
, blas_call gemm
)
2407 const GFC_INTEGER_1
* restrict abase
;
2408 const GFC_INTEGER_1
* restrict bbase
;
2409 GFC_INTEGER_1
* restrict dest
;
2411 index_type rxstride
, rystride
, axstride
, aystride
, bxstride
, bystride
;
2412 index_type x
, y
, n
, count
, xcount
, ycount
;
2414 assert (GFC_DESCRIPTOR_RANK (a
) == 2
2415 || GFC_DESCRIPTOR_RANK (b
) == 2);
2417 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2419 Either A or B (but not both) can be rank 1:
2421 o One-dimensional argument A is implicitly treated as a row matrix
2422 dimensioned [1,count], so xcount=1.
2424 o One-dimensional argument B is implicitly treated as a column matrix
2425 dimensioned [count, 1], so ycount=1.
2428 if (retarray
->base_addr
== NULL
)
2430 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2432 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2433 GFC_DESCRIPTOR_EXTENT(b
,1) - 1, 1);
2435 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
2437 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2438 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
2442 GFC_DIMENSION_SET(retarray
->dim
[0], 0,
2443 GFC_DESCRIPTOR_EXTENT(a
,0) - 1, 1);
2445 GFC_DIMENSION_SET(retarray
->dim
[1], 0,
2446 GFC_DESCRIPTOR_EXTENT(b
,1) - 1,
2447 GFC_DESCRIPTOR_EXTENT(retarray
,0));
2451 = xmallocarray (size0 ((array_t
*) retarray
), sizeof (GFC_INTEGER_1
));
2452 retarray
->offset
= 0;
2454 else if (unlikely (compile_options
.bounds_check
))
2456 index_type ret_extent
, arg_extent
;
2458 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2460 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
2461 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2462 if (arg_extent
!= ret_extent
)
2463 runtime_error ("Incorrect extent in return array in"
2464 " MATMUL intrinsic: is %ld, should be %ld",
2465 (long int) ret_extent
, (long int) arg_extent
);
2467 else if (GFC_DESCRIPTOR_RANK (b
) == 1)
2469 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
2470 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2471 if (arg_extent
!= ret_extent
)
2472 runtime_error ("Incorrect extent in return array in"
2473 " MATMUL intrinsic: is %ld, should be %ld",
2474 (long int) ret_extent
, (long int) arg_extent
);
2478 arg_extent
= GFC_DESCRIPTOR_EXTENT(a
,0);
2479 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,0);
2480 if (arg_extent
!= ret_extent
)
2481 runtime_error ("Incorrect extent in return array in"
2482 " MATMUL intrinsic for dimension 1:"
2483 " is %ld, should be %ld",
2484 (long int) ret_extent
, (long int) arg_extent
);
2486 arg_extent
= GFC_DESCRIPTOR_EXTENT(b
,1);
2487 ret_extent
= GFC_DESCRIPTOR_EXTENT(retarray
,1);
2488 if (arg_extent
!= ret_extent
)
2489 runtime_error ("Incorrect extent in return array in"
2490 " MATMUL intrinsic for dimension 2:"
2491 " is %ld, should be %ld",
2492 (long int) ret_extent
, (long int) arg_extent
);
2497 if (GFC_DESCRIPTOR_RANK (retarray
) == 1)
2499 /* One-dimensional result may be addressed in the code below
2500 either as a row or a column matrix. We want both cases to
2502 rxstride
= rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
2506 rxstride
= GFC_DESCRIPTOR_STRIDE(retarray
,0);
2507 rystride
= GFC_DESCRIPTOR_STRIDE(retarray
,1);
2511 if (GFC_DESCRIPTOR_RANK (a
) == 1)
2513 /* Treat it as a a row matrix A[1,count]. */
2514 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
2518 count
= GFC_DESCRIPTOR_EXTENT(a
,0);
2522 axstride
= GFC_DESCRIPTOR_STRIDE(a
,0);
2523 aystride
= GFC_DESCRIPTOR_STRIDE(a
,1);
2525 count
= GFC_DESCRIPTOR_EXTENT(a
,1);
2526 xcount
= GFC_DESCRIPTOR_EXTENT(a
,0);
2529 if (count
!= GFC_DESCRIPTOR_EXTENT(b
,0))
2531 if (count
> 0 || GFC_DESCRIPTOR_EXTENT(b
,0) > 0)
2532 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2535 if (GFC_DESCRIPTOR_RANK (b
) == 1)
2537 /* Treat it as a column matrix B[count,1] */
2538 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
2540 /* bystride should never be used for 1-dimensional b.
2541 The value is only used for calculation of the
2542 memory by the buffer. */
2548 bxstride
= GFC_DESCRIPTOR_STRIDE(b
,0);
2549 bystride
= GFC_DESCRIPTOR_STRIDE(b
,1);
2550 ycount
= GFC_DESCRIPTOR_EXTENT(b
,1);
2553 abase
= a
->base_addr
;
2554 bbase
= b
->base_addr
;
2555 dest
= retarray
->base_addr
;
2557 /* Now that everything is set up, we perform the multiplication
2560 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2561 #define min(a,b) ((a) <= (b) ? (a) : (b))
2562 #define max(a,b) ((a) >= (b) ? (a) : (b))
2564 if (try_blas
&& rxstride
== 1 && (axstride
== 1 || aystride
== 1)
2565 && (bxstride
== 1 || bystride
== 1)
2566 && (((float) xcount
) * ((float) ycount
) * ((float) count
)
2567 > POW3(blas_limit
)))
2569 const int m
= xcount
, n
= ycount
, k
= count
, ldc
= rystride
;
2570 const GFC_INTEGER_1 one
= 1, zero
= 0;
2571 const int lda
= (axstride
== 1) ? aystride
: axstride
,
2572 ldb
= (bxstride
== 1) ? bystride
: bxstride
;
2574 if (lda
> 0 && ldb
> 0 && ldc
> 0 && m
> 1 && n
> 1 && k
> 1)
2576 assert (gemm
!= NULL
);
2577 gemm (axstride
== 1 ? "N" : "T", bxstride
== 1 ? "N" : "T", &m
,
2578 &n
, &k
, &one
, abase
, &lda
, bbase
, &ldb
, &zero
, dest
,
2584 if (rxstride
== 1 && axstride
== 1 && bxstride
== 1)
2586 /* This block of code implements a tuned matmul, derived from
2587 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2589 Bo Kagstrom and Per Ling
2590 Department of Computing Science
2592 S-901 87 Umea, Sweden
2594 from netlib.org, translated to C, and modified for matmul.m4. */
2596 const GFC_INTEGER_1
*a
, *b
;
2598 const index_type m
= xcount
, n
= ycount
, k
= count
;
2600 /* System generated locals */
2601 index_type a_dim1
, a_offset
, b_dim1
, b_offset
, c_dim1
, c_offset
,
2602 i1
, i2
, i3
, i4
, i5
, i6
;
2604 /* Local variables */
2605 GFC_INTEGER_1 f11
, f12
, f21
, f22
, f31
, f32
, f41
, f42
,
2606 f13
, f14
, f23
, f24
, f33
, f34
, f43
, f44
;
2607 index_type i
, j
, l
, ii
, jj
, ll
;
2608 index_type isec
, jsec
, lsec
, uisec
, ujsec
, ulsec
;
2613 c
= retarray
->base_addr
;
2615 /* Parameter adjustments */
2617 c_offset
= 1 + c_dim1
;
2620 a_offset
= 1 + a_dim1
;
2623 b_offset
= 1 + b_dim1
;
2626 /* Empty c first. */
2627 for (j
=1; j
<=n
; j
++)
2628 for (i
=1; i
<=m
; i
++)
2629 c
[i
+ j
* c_dim1
] = (GFC_INTEGER_1
)0;
2631 /* Early exit if possible */
2632 if (m
== 0 || n
== 0 || k
== 0)
2635 /* Adjust size of t1 to what is needed. */
2636 index_type t1_dim
, a_sz
;
2642 t1_dim
= a_sz
* 256 + b_dim1
;
2646 t1
= malloc (t1_dim
* sizeof(GFC_INTEGER_1
));
2648 /* Start turning the crank. */
2650 for (jj
= 1; jj
<= i1
; jj
+= 512)
2656 ujsec
= jsec
- jsec
% 4;
2658 for (ll
= 1; ll
<= i2
; ll
+= 256)
2664 ulsec
= lsec
- lsec
% 2;
2667 for (ii
= 1; ii
<= i3
; ii
+= 256)
2673 uisec
= isec
- isec
% 2;
2674 i4
= ll
+ ulsec
- 1;
2675 for (l
= ll
; l
<= i4
; l
+= 2)
2677 i5
= ii
+ uisec
- 1;
2678 for (i
= ii
; i
<= i5
; i
+= 2)
2680 t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257] =
2682 t1
[l
- ll
+ 2 + ((i
- ii
+ 1) << 8) - 257] =
2683 a
[i
+ (l
+ 1) * a_dim1
];
2684 t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257] =
2685 a
[i
+ 1 + l
* a_dim1
];
2686 t1
[l
- ll
+ 2 + ((i
- ii
+ 2) << 8) - 257] =
2687 a
[i
+ 1 + (l
+ 1) * a_dim1
];
2691 t1
[l
- ll
+ 1 + (isec
<< 8) - 257] =
2692 a
[ii
+ isec
- 1 + l
* a_dim1
];
2693 t1
[l
- ll
+ 2 + (isec
<< 8) - 257] =
2694 a
[ii
+ isec
- 1 + (l
+ 1) * a_dim1
];
2700 for (i
= ii
; i
<= i4
; ++i
)
2702 t1
[lsec
+ ((i
- ii
+ 1) << 8) - 257] =
2703 a
[i
+ (ll
+ lsec
- 1) * a_dim1
];
2707 uisec
= isec
- isec
% 4;
2708 i4
= jj
+ ujsec
- 1;
2709 for (j
= jj
; j
<= i4
; j
+= 4)
2711 i5
= ii
+ uisec
- 1;
2712 for (i
= ii
; i
<= i5
; i
+= 4)
2714 f11
= c
[i
+ j
* c_dim1
];
2715 f21
= c
[i
+ 1 + j
* c_dim1
];
2716 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2717 f22
= c
[i
+ 1 + (j
+ 1) * c_dim1
];
2718 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2719 f23
= c
[i
+ 1 + (j
+ 2) * c_dim1
];
2720 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2721 f24
= c
[i
+ 1 + (j
+ 3) * c_dim1
];
2722 f31
= c
[i
+ 2 + j
* c_dim1
];
2723 f41
= c
[i
+ 3 + j
* c_dim1
];
2724 f32
= c
[i
+ 2 + (j
+ 1) * c_dim1
];
2725 f42
= c
[i
+ 3 + (j
+ 1) * c_dim1
];
2726 f33
= c
[i
+ 2 + (j
+ 2) * c_dim1
];
2727 f43
= c
[i
+ 3 + (j
+ 2) * c_dim1
];
2728 f34
= c
[i
+ 2 + (j
+ 3) * c_dim1
];
2729 f44
= c
[i
+ 3 + (j
+ 3) * c_dim1
];
2731 for (l
= ll
; l
<= i6
; ++l
)
2733 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2734 * b
[l
+ j
* b_dim1
];
2735 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2736 * b
[l
+ j
* b_dim1
];
2737 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2738 * b
[l
+ (j
+ 1) * b_dim1
];
2739 f22
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2740 * b
[l
+ (j
+ 1) * b_dim1
];
2741 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2742 * b
[l
+ (j
+ 2) * b_dim1
];
2743 f23
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2744 * b
[l
+ (j
+ 2) * b_dim1
];
2745 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) - 257]
2746 * b
[l
+ (j
+ 3) * b_dim1
];
2747 f24
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) - 257]
2748 * b
[l
+ (j
+ 3) * b_dim1
];
2749 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2750 * b
[l
+ j
* b_dim1
];
2751 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2752 * b
[l
+ j
* b_dim1
];
2753 f32
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2754 * b
[l
+ (j
+ 1) * b_dim1
];
2755 f42
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2756 * b
[l
+ (j
+ 1) * b_dim1
];
2757 f33
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2758 * b
[l
+ (j
+ 2) * b_dim1
];
2759 f43
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2760 * b
[l
+ (j
+ 2) * b_dim1
];
2761 f34
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) - 257]
2762 * b
[l
+ (j
+ 3) * b_dim1
];
2763 f44
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) - 257]
2764 * b
[l
+ (j
+ 3) * b_dim1
];
2766 c
[i
+ j
* c_dim1
] = f11
;
2767 c
[i
+ 1 + j
* c_dim1
] = f21
;
2768 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2769 c
[i
+ 1 + (j
+ 1) * c_dim1
] = f22
;
2770 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2771 c
[i
+ 1 + (j
+ 2) * c_dim1
] = f23
;
2772 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2773 c
[i
+ 1 + (j
+ 3) * c_dim1
] = f24
;
2774 c
[i
+ 2 + j
* c_dim1
] = f31
;
2775 c
[i
+ 3 + j
* c_dim1
] = f41
;
2776 c
[i
+ 2 + (j
+ 1) * c_dim1
] = f32
;
2777 c
[i
+ 3 + (j
+ 1) * c_dim1
] = f42
;
2778 c
[i
+ 2 + (j
+ 2) * c_dim1
] = f33
;
2779 c
[i
+ 3 + (j
+ 2) * c_dim1
] = f43
;
2780 c
[i
+ 2 + (j
+ 3) * c_dim1
] = f34
;
2781 c
[i
+ 3 + (j
+ 3) * c_dim1
] = f44
;
2786 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2788 f11
= c
[i
+ j
* c_dim1
];
2789 f12
= c
[i
+ (j
+ 1) * c_dim1
];
2790 f13
= c
[i
+ (j
+ 2) * c_dim1
];
2791 f14
= c
[i
+ (j
+ 3) * c_dim1
];
2793 for (l
= ll
; l
<= i6
; ++l
)
2795 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2796 257] * b
[l
+ j
* b_dim1
];
2797 f12
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2798 257] * b
[l
+ (j
+ 1) * b_dim1
];
2799 f13
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2800 257] * b
[l
+ (j
+ 2) * b_dim1
];
2801 f14
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2802 257] * b
[l
+ (j
+ 3) * b_dim1
];
2804 c
[i
+ j
* c_dim1
] = f11
;
2805 c
[i
+ (j
+ 1) * c_dim1
] = f12
;
2806 c
[i
+ (j
+ 2) * c_dim1
] = f13
;
2807 c
[i
+ (j
+ 3) * c_dim1
] = f14
;
2814 for (j
= jj
+ ujsec
; j
<= i4
; ++j
)
2816 i5
= ii
+ uisec
- 1;
2817 for (i
= ii
; i
<= i5
; i
+= 4)
2819 f11
= c
[i
+ j
* c_dim1
];
2820 f21
= c
[i
+ 1 + j
* c_dim1
];
2821 f31
= c
[i
+ 2 + j
* c_dim1
];
2822 f41
= c
[i
+ 3 + j
* c_dim1
];
2824 for (l
= ll
; l
<= i6
; ++l
)
2826 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2827 257] * b
[l
+ j
* b_dim1
];
2828 f21
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 2) << 8) -
2829 257] * b
[l
+ j
* b_dim1
];
2830 f31
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 3) << 8) -
2831 257] * b
[l
+ j
* b_dim1
];
2832 f41
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 4) << 8) -
2833 257] * b
[l
+ j
* b_dim1
];
2835 c
[i
+ j
* c_dim1
] = f11
;
2836 c
[i
+ 1 + j
* c_dim1
] = f21
;
2837 c
[i
+ 2 + j
* c_dim1
] = f31
;
2838 c
[i
+ 3 + j
* c_dim1
] = f41
;
2841 for (i
= ii
+ uisec
; i
<= i5
; ++i
)
2843 f11
= c
[i
+ j
* c_dim1
];
2845 for (l
= ll
; l
<= i6
; ++l
)
2847 f11
+= t1
[l
- ll
+ 1 + ((i
- ii
+ 1) << 8) -
2848 257] * b
[l
+ j
* b_dim1
];
2850 c
[i
+ j
* c_dim1
] = f11
;
2860 else if (rxstride
== 1 && aystride
== 1 && bxstride
== 1)
2862 if (GFC_DESCRIPTOR_RANK (a
) != 1)
2864 const GFC_INTEGER_1
*restrict abase_x
;
2865 const GFC_INTEGER_1
*restrict bbase_y
;
2866 GFC_INTEGER_1
*restrict dest_y
;
2869 for (y
= 0; y
< ycount
; y
++)
2871 bbase_y
= &bbase
[y
*bystride
];
2872 dest_y
= &dest
[y
*rystride
];
2873 for (x
= 0; x
< xcount
; x
++)
2875 abase_x
= &abase
[x
*axstride
];
2876 s
= (GFC_INTEGER_1
) 0;
2877 for (n
= 0; n
< count
; n
++)
2878 s
+= abase_x
[n
] * bbase_y
[n
];
2885 const GFC_INTEGER_1
*restrict bbase_y
;
2888 for (y
= 0; y
< ycount
; y
++)
2890 bbase_y
= &bbase
[y
*bystride
];
2891 s
= (GFC_INTEGER_1
) 0;
2892 for (n
= 0; n
< count
; n
++)
2893 s
+= abase
[n
*axstride
] * bbase_y
[n
];
2894 dest
[y
*rystride
] = s
;
2898 else if (axstride
< aystride
)
2900 for (y
= 0; y
< ycount
; y
++)
2901 for (x
= 0; x
< xcount
; x
++)
2902 dest
[x
*rxstride
+ y
*rystride
] = (GFC_INTEGER_1
)0;
2904 for (y
= 0; y
< ycount
; y
++)
2905 for (n
= 0; n
< count
; n
++)
2906 for (x
= 0; x
< xcount
; x
++)
2907 /* dest[x,y] += a[x,n] * b[n,y] */
2908 dest
[x
*rxstride
+ y
*rystride
] +=
2909 abase
[x
*axstride
+ n
*aystride
] *
2910 bbase
[n
*bxstride
+ y
*bystride
];
2912 else if (GFC_DESCRIPTOR_RANK (a
) == 1)
2914 const GFC_INTEGER_1
*restrict bbase_y
;
2917 for (y
= 0; y
< ycount
; y
++)
2919 bbase_y
= &bbase
[y
*bystride
];
2920 s
= (GFC_INTEGER_1
) 0;
2921 for (n
= 0; n
< count
; n
++)
2922 s
+= abase
[n
*axstride
] * bbase_y
[n
*bxstride
];
2923 dest
[y
*rxstride
] = s
;
2928 const GFC_INTEGER_1
*restrict abase_x
;
2929 const GFC_INTEGER_1
*restrict bbase_y
;
2930 GFC_INTEGER_1
*restrict dest_y
;
2933 for (y
= 0; y
< ycount
; y
++)
2935 bbase_y
= &bbase
[y
*bystride
];
2936 dest_y
= &dest
[y
*rystride
];
2937 for (x
= 0; x
< xcount
; x
++)
2939 abase_x
= &abase
[x
*axstride
];
2940 s
= (GFC_INTEGER_1
) 0;
2941 for (n
= 0; n
< count
; n
++)
2942 s
+= abase_x
[n
*aystride
] * bbase_y
[n
*bxstride
];
2943 dest_y
[x
*rxstride
] = s
;