1 dnl Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
2 dnl result in a second limb vector.
4 dnl Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
21 dnl or both in parallel, as here.
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`..
/config.m4
')
40 C This code runs at 2.25 cycles/limb on EV6.
42 C This code was written in close cooperation with ev6 pipeline expert
43 C Steve Root. Any errors are tege's fault, though.
48 C code for n > 8    code for (n mod 8)
49 C                   code for (n div 8)    feed-in code
53 C Some notes about unrolled loop:
55 C r1-r8 multiplies and workup
56 C r21-r28 multiplies and workup
59 C r20,r29,r13-r15 scramble
61 C We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
62 C put-the-carry-into-hi. The idea is that these branches are very rarely
63 C taken, and since a non-taken branch consumes no resources, that is better
64 C than an addq.
66 C Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
67 C add NEXT cycle #09 which feeds a store in NEXT cycle #02
69 C The code could use some further work:
70 C 1. Speed up really small multiplies. The default alpha/mul_1.asm code is
71 C faster than this for size < 3.
72 C 2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
73 C that is too costly.
74 C 3. Consider using 4-way unrolling, even if that runs slower.
75 C 4. Reduce register usage. In particular, try to avoid using r29.
82 ldq r2,0(r17) C r2 = s1_limb
83 lda r18,-1(r18) C size--
84 mulq r2,r19,r3 C r3 = prod_low
85 bic r31,r31,r4 C clear cy_limb
86 umulh r2,r19,r0 C r0 = prod_high
87 beq r18,$Le1a C jump if size was == 1
88 ldq r2,8(r17) C r2 = s1_limb
89 lda r18,-1(r18) C size--
91 beq r18,$Le2a C jump if size was == 2
93 $Lopa: mulq r2,r19,r3 C r3 = prod_low
94 addq r4,r0,r0 C cy_limb = cy_limb + 'cy
'
95 lda r18,-1(r18) C size--
96 umulh r2,r19,r4 C r4 = cy_limb
97 ldq r2,16(r17) C r2 = s1_limb
98 lda r17,8(r17) C s1_ptr++
99 addq r3,r0,r3 C r3 = cy_limb + prod_low
101 cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
102 lda r16,8(r16) C res_ptr++
105 $Le2a: mulq r2,r19,r3 C r3 = prod_low
106 addq r4,r0,r0 C cy_limb = cy_limb + 'cy
'
107 umulh r2,r19,r4 C r4 = cy_limb
108 addq r3,r0,r3 C r3 = cy_limb + prod_low
109 cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
111 addq r4,r0,r0 C cy_limb = prod_high + cy
128 and r18, 7, r20 C count for the first loop, 0-7
129 srl r18, 3, r18 C count for unrolled loop
131 beq r20, $L_8_or_more C skip first loop
134 ldq r2,0(r17) C r2 = s1_limb
135 lda r17,8(r17) C s1_ptr++
136 lda r20,-1(r20) C size--
137 mulq r2,r19,r3 C r3 = prod_low
138 umulh r2,r19,r21 C r21 = prod_high
139 beq r20,$Le1b C jump if size was == 1
140 bis r31, r31, r0 C FIXME: shouldn't need
this
141 ldq r2
,0(r17
) C r2
= s1_limb
142 lda r17
,8(r17
) C s1_ptr
++
143 lda r20
,-1(r20
) C
size--
145 lda r16
,8(r16
) C res_ptr
++
146 beq r20
,$Le2b C jump if
size was
== 2
148 $
Lopb: mulq r2
,r19
,r3 C r3
= prod_low
149 addq r21
,r0
,r0 C cy_limb
= cy_limb
+ 'cy'
150 lda r20
,-1(r20
) C
size--
151 umulh r2
,r19
,r21 C r21
= prod_high
152 ldq r2
,0(r17
) C r2
= s1_limb
153 lda r17
,8(r17
) C s1_ptr
++
154 addq r3
,r0
,r3 C r3
= cy_limb
+ prod_low
156 cmpult r3
,r0
,r0 C r0
= carry from
(cy_limb
+ prod_low
)
157 lda r16
,8(r16
) C res_ptr
++
160 $
Le2b: mulq r2
,r19
,r3 C r3
= prod_low
161 addq r21
,r0
,r0 C cy_limb
= cy_limb
+ 'cy'
162 umulh r2
,r19
,r21 C r21
= prod_high
163 addq r3
,r0
,r3 C r3
= cy_limb
+ prod_low
164 cmpult r3
,r0
,r0 C r0
= carry from
(cy_limb
+ prod_low
)
166 lda r16
,8(r16
) C res_ptr
++
167 addq r21
,r0
,r21 C cy_limb
= prod_high
+ cy
170 lda r16
,8(r16
) C res_ptr
++
173 lda r0
, -1(r31
) C put
-1 in r0
, for tricky
loop control
174 lda r17
, -32(r17
) C L1 bookkeeping
175 lda r18
, -1(r18
) C decrement count
178 ldq r10
, 40(r17
) C L1
179 mulq r9
, r19
, r22 C U1 #
07
180 ldq r11
, 48(r17
) C L1
181 umulh r9
, r19
, r23 C U1 #
08
182 ldq r12
, 56(r17
) C L1
183 mulq r10
, r19
, r24 C U1 #
09
186 lda r17
, 64(r17
) C L1 bookkeeping
188 umulh r10
, r19
, r25 C U1 #
11
189 mulq r11
, r19
, r26 C U1 #
12
190 umulh r11
, r19
, r27 C U1 #
13
191 mulq r12
, r19
, r28 C U1 #
14
193 umulh r12
, r19
, r1 C U1 #
15
194 ldq r11
, 16(r17
) C L1
195 mulq r9
, r19
, r2 C U1 #
16
196 ldq r12
, 24(r17
) C L1
197 umulh r9
, r19
, r3 C U1 #
17
198 addq r21
, r22
, r13 C L1
mov
199 mulq r10
, r19
, r4 C U1 #
18
200 addq r23
, r24
, r22 C L0 sum
2 mul's
201 cmpult r13, r21, r14 C L1 carry from sum
202 bgt r18, $L_16_or_more
204 cmpult r22, r24, r24 C U0 carry from sum
205 umulh r10, r19, r5 C U1 #02
206 addq r25, r26, r23 C U0 sum 2 mul's
207 mulq r11
, r19
, r6 C U1 #
03
208 cmpult r23
, r26
, r25 C U0 carry from sum
209 umulh r11
, r19
, r7 C U1 #
04
210 addq r27
, r28
, r28 C U0 sum
2 mul's
211 mulq r12, r19, r8 C U1 #05
212 cmpult r28, r27, r15 C L0 carry from sum
213 lda r16, 32(r16) C L1 bookkeeping
214 addq r13, r31, r13 C U0 start carry cascade
215 umulh r12, r19, r21 C U1 #06
219 C ---------------------------------------------------------------
221 cmpult r22, r24, r24 C U0 carry from sum
224 umulh r10, r19, r5 C U1 #02
225 addq r25, r26, r23 C U0 sum 2 mul's
226 mulq r11
, r19
, r6 C U1 #
03
227 cmpult r23
, r26
, r25 C U0 carry from sum
228 umulh r11
, r19
, r7 C U1 #
04
229 addq r27
, r28
, r28 C U0 sum
2 mul's
230 mulq r12, r19, r8 C U1 #05
231 cmpult r28, r27, r15 C L0 carry from sum
232 lda r16, 32(r16) C L1 bookkeeping
233 addq r13, r31, r13 C U0 start carry cascade
235 umulh r12, r19, r21 C U1 #06
236 C beq r13, $fix0w C U0
237 $ret0w: addq r22, r14, r26 C L0
238 ldq r10, 40(r17) C L1
240 mulq r9, r19, r22 C U1 #07
242 $ret1w: addq r23, r24, r27 C L0
243 ldq r11, 48(r17) C L1
245 umulh r9, r19, r23 C U1 #08
247 $ret2w: addq r28, r25, r28 C L0
248 ldq r12, 56(r17) C L1
250 mulq r10, r19, r24 C U1 #09
252 $ret3w: addq r1, r2, r20 C L0 sum 2 mul's
255 addq r3
, r4
, r2 C L0 #
10 2 mul's
256 lda r17, 64(r17) C L1 bookkeeping
257 cmpult r20, r1, r29 C U0 carry from sum
259 umulh r10, r19, r25 C U1 #11
260 cmpult r2, r4, r4 C U0 carry from sum
261 stq r13, -32(r16) C L0
262 stq r26, -24(r16) C L1
264 mulq r11, r19, r26 C U1 #12
265 addq r5, r6, r14 C U0 sum 2 mul's
266 stq r27
, -16(r16
) C L0
267 stq r28
, -8(r16
) C L1
269 umulh r11
, r19
, r27 C U1 #
13
270 cmpult r14
, r6
, r3 C U0 carry from sum
271 C could do cross
-jumping
here:
272 C bra $L_middle_of_unrolled_loop
273 mulq r12
, r19
, r28 C U1 #
14
274 addq r7
, r3
, r5 C L0 eat carry
275 addq r20
, r15
, r20 C U0 carry cascade
278 umulh r12
, r19
, r1 C U1 #
15
280 $
ret4w: addq r2
, r29
, r6 C L0
281 ldq r11
, 16(r17
) C L1
283 mulq r9
, r19
, r2 C U1 #
16
285 $
ret5w: addq r14
, r4
, r7 C L0
286 ldq r12
, 24(r17
) C L1
288 umulh r9
, r19
, r3 C U1 #
17
290 $
ret6w: addq r5
, r8
, r8 C L0 sum
2
291 addq r21
, r22
, r13 C L1 sum
2 mul's
293 mulq r10, r19, r4 C U1 #18
294 addq r23, r24, r22 C L0 sum 2 mul's
295 cmpult r13
, r21
, r14 C L1 carry from sum
297 C
---------------------------------------------------------------
300 umulh r0
, r18
, r18 C U1 #
01 decrement r18
!
301 cmpult r8
, r5
, r29 C L0 carry from last bunch
302 cmpult r22
, r24
, r24 C U0 carry from sum
305 umulh r10
, r19
, r5 C U1 #
02
306 addq r25
, r26
, r23 C U0 sum
2 mul's
310 mulq r11, r19, r6 C U1 #03
311 cmpult r23, r26, r25 C U0 carry from sum
315 umulh r11, r19, r7 C U1 #04
316 bis r31, r31, r31 C L0 st slosh
317 bis r31, r31, r31 C L1 st slosh
318 addq r27, r28, r28 C U0 sum 2 mul's
320 mulq r12
, r19
, r8 C U1 #
05
321 cmpult r28
, r27
, r15 C L0 carry from sum
322 lda r16
, 64(r16
) C L1 bookkeeping
323 addq r13
, r29
, r13 C U0 start carry cascade
325 umulh r12
, r19
, r21 C U1 #
06
327 $
ret0: addq r22
, r14
, r26 C L0
328 ldq r10
, 40(r17
) C L1
330 mulq r9
, r19
, r22 C U1 #
07
332 $
ret1: addq r23
, r24
, r27 C L0
333 ldq r11
, 48(r17
) C L1
335 umulh r9
, r19
, r23 C U1 #
08
337 $
ret2: addq r28
, r25
, r28 C L0
338 ldq r12
, 56(r17
) C L1
340 mulq r10
, r19
, r24 C U1 #
09
342 $
ret3: addq r1
, r2
, r20 C L0 sum
2 mul's
345 addq r3, r4, r2 C L0 #10 2 mul's
346 bis r31
, r31
, r31 C U1
mul hole
347 lda r17
, 64(r17
) C L1 bookkeeping
348 cmpult r20
, r1
, r29 C U0 carry from sum
350 umulh r10
, r19
, r25 C U1 #
11
351 cmpult r2
, r4
, r4 C U0 carry from sum
352 stq r13
, -32(r16
) C L0
353 stq r26
, -24(r16
) C L1
355 mulq r11
, r19
, r26 C U1 #
12
356 addq r5
, r6
, r14 C U0 sum
2 mul's
357 stq r27, -16(r16) C L0
358 stq r28, -8(r16) C L1
360 umulh r11, r19, r27 C U1 #13
361 bis r31, r31, r31 C L0 st slosh
362 bis r31, r31, r31 C L1 st slosh
363 cmpult r14, r6, r3 C U0 carry from sum
364 $L_middle_of_unrolled_loop:
365 mulq r12, r19, r28 C U1 #14
366 addq r7, r3, r5 C L0 eat carry
367 addq r20, r15, r20 C U0 carry cascade
370 umulh r12, r19, r1 C U1 #15
372 $ret4: addq r2, r29, r6 C L0
373 ldq r11, 16(r17) C L1
375 mulq r9, r19, r2 C U1 #16
377 $ret5: addq r14, r4, r7 C L0
378 ldq r12, 24(r17) C L1
380 umulh r9, r19, r3 C U1 #17
382 $ret6: addq r5, r8, r8 C L0 sum 2
383 addq r21, r22, r13 C L1 sum 2 mul's
385 mulq r10
, r19
, r4 C U1 #
18
386 addq r23
, r24
, r22 C L0 sum
2 mul's
387 cmpult r13, r21, r14 C L1 carry from sum
389 C ---------------------------------------------------------------
391 cmpult r8, r5, r29 C L0 carry from last bunch
392 cmpult r22, r24, r24 C U0 carry from sum
394 umulh r10, r19, r5 C U1 #02
395 addq r25, r26, r23 C U0 sum 2 mul's
399 mulq r11
, r19
, r6 C U1 #
03
400 cmpult r23
, r26
, r25 C U0 carry from sum
404 umulh r11
, r19
, r7 C U1 #
04
405 addq r27
, r28
, r28 C U0 sum
2 mul's
407 mulq r12, r19, r8 C U1 #05
408 cmpult r28, r27, r15 C L0 carry from sum
409 lda r16, 64(r16) C L1 bookkeeping
410 addq r13, r29, r13 C U0 start carry cascade
412 umulh r12, r19, r21 C U1 #06
414 $ret0c: addq r22, r14, r26 C L0
416 $ret1c: addq r23, r24, r27 C L0
418 $ret2c: addq r28, r25, r28 C L0
420 $ret3c: addq r1, r2, r20 C L0 sum 2 mul's
421 addq r3
, r4
, r2 C L0 #
10 2 mul's
422 lda r17, 64(r17) C L1 bookkeeping
423 cmpult r20, r1, r29 C U0 carry from sum
424 cmpult r2, r4, r4 C U0 carry from sum
425 stq r13, -32(r16) C L0
426 stq r26, -24(r16) C L1
427 addq r5, r6, r14 C U0 sum 2 mul's
428 stq r27
, -16(r16
) C L0
429 stq r28
, -8(r16
) C L1
430 cmpult r14
, r6
, r3 C U0 carry from sum
431 addq r7
, r3
, r5 C L0 eat carry
432 addq r20
, r15
, r20 C U0 carry cascade
434 $
ret4c: addq r2
, r29
, r6 C L0
436 $
ret5c: addq r14
, r4
, r7 C L0
438 $
ret6c: addq r5
, r8
, r8 C L0 sum
2
439 cmpult r8
, r5
, r29 C L0 carry from last bunch
458 C
$fix
0w
: bis r14
, r29
, r14 C join carries
460 $fix
1w
: bis r24
, r14
, r24 C join carries
462 $fix
2w
: bis r25
, r24
, r25 C join carries
464 $fix
3w
: bis r15
, r25
, r15 C join carries
466 $fix
0: bis r14
, r29
, r14 C join carries
468 $fix
1: bis r24
, r14
, r24 C join carries
470 $fix
2: bis r25
, r24
, r25 C join carries
472 $fix
3: bis r15
, r25
, r15 C join carries
474 $fix
4: bis r29
, r15
, r29 C join carries
476 $fix
5: bis r4
, r29
, r4 C join carries
478 $fix
6: addq r5
, r4
, r5 C can
't carry twice!
480 $fix0c: bis r14, r29, r14 C join carries
482 $fix1c: bis r24, r14, r24 C join carries
484 $fix2c: bis r25, r24, r25 C join carries
486 $fix3c: bis r15, r25, r15 C join carries
488 $fix4c: bis r29, r15, r29 C join carries
490 $fix5c: bis r4, r29, r4 C join carries
492 $fix6c: addq r5, r4, r5 C can't carry twice
!