beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / alpha / ev6 / mul_1.asm
blob8ee19cd429d00a12ebf2e37e17f707dd7339d59e
1 dnl Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
2 dnl result in a second limb vector.
4 dnl Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C INPUT PARAMETERS
35 C res_ptr r16
36 C s1_ptr r17
37 C size r18
38 C s2_limb r19
40 C This code runs at 2.25 cycles/limb on EV6.
42 C This code was written in close cooperation with ev6 pipeline expert
43 C Steve Root. Any errors are tege's fault, though.
45 C Code structure:
47 C   code for n < 8
48 C   code for n >= 8    code for (n mod 8)
49 C                      code for (n div 8)    feed-in code
50 C                                            8-way unrolled loop
51 C                                            wind-down code
53 C Some notes about unrolled loop:
55 C   r1-r8             multiplies and workup
56 C   r21-r28           multiplies and workup
57 C   r9-r12            loads
58 C   r0                -1 (umulh multiplier used to decrement the loop count)
59 C   r20,r29,r13-r15   scramble
61 C We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
62 C put-the-carry-into-hi. The idea is that these branches are very rarely
63 C taken, and since a non-taken branch consumes no resources, that is better
64 C than an addq.
66 C Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
67 C add NEXT cycle #09 which feeds a store in NEXT cycle #02
69 C The code could use some further work:
70 C 1. Speed up really small multiplies. The default alpha/mul_1.asm code is
71 C faster than this for size < 3.
72 C 2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
73 C that is too costly.
74 C 3. Consider using 4-way unrolling, even if that runs slower.
75 C 4. Reduce register usage. In particular, try to avoid using r29.
77 ASM_START()
C mpn_mul_1 -- multiply the limb vector at s1_ptr (r17), of length size (r18),
C by s2_limb (r19), store the resulting low limbs at res_ptr (r16), and
C return the final carry limb in r0.
C
C Sizes below 8 use the one-limb-at-a-time code at $Lsmall.  Everything else
C goes to $Large, which first handles (size mod 8) limbs one at a time and
C then processes the remaining limbs in groups of 8.
78 PROLOGUE(mpn_mul_1)
79 cmpult r18, 8, r1 C r1 = 1 when size < 8
80 beq r1, $Large C size >= 8: use the unrolled code
C Small sizes.  No registers are saved on this path.
81 $Lsmall:
82 ldq r2,0(r17) C r2 = s1_limb
83 lda r18,-1(r18) C size--
84 mulq r2,r19,r3 C r3 = prod_low
85 bic r31,r31,r4 C clear cy_limb
86 umulh r2,r19,r0 C r0 = prod_high
87 beq r18,$Le1a C jump if size was == 1
88 ldq r2,8(r17) C r2 = s1_limb
89 lda r18,-1(r18) C size--
90 stq r3,0(r16)
91 beq r18,$Le2a C jump if size was == 2
92 ALIGN(8)
C Loop over the middle limbs.  At the top of each iteration r2 holds the limb
C to multiply and r4 + r0 is the carry limb still to be added to its low
C product.  The limb for the following iteration is loaded inside the loop.
93 $Lopa: mulq r2,r19,r3 C r3 = prod_low
94 addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
95 lda r18,-1(r18) C size--
96 umulh r2,r19,r4 C r4 = cy_limb
97 ldq r2,16(r17) C r2 = s1_limb
98 lda r17,8(r17) C s1_ptr++
99 addq r3,r0,r3 C r3 = cy_limb + prod_low
100 stq r3,8(r16)
101 cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
102 lda r16,8(r16) C res_ptr++
103 bne r18,$Lopa
C Last limb: nothing further is loaded; the final carry limb ends up in r0.
105 $Le2a: mulq r2,r19,r3 C r3 = prod_low
106 addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
107 umulh r2,r19,r4 C r4 = cy_limb
108 addq r3,r0,r3 C r3 = cy_limb + prod_low
109 cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
110 stq r3,8(r16)
111 addq r4,r0,r0 C cy_limb = prod_high + cy
112 ret r31,(r26),1
C size == 1: store the single low limb; r0 already holds prod_high.
113 $Le1a: stq r3,0(r16)
114 ret r31,(r26),1
C Large sizes.  Allocate a 224-byte frame and save r26, r9-r15 and r29, all
C of which are overwritten by the code below.  Only offsets 0-64 of the frame
C are written.
116 $Large:
117 lda r30, -224(r30)
118 stq r26, 0(r30)
119 stq r9, 8(r30)
120 stq r10, 16(r30)
121 stq r11, 24(r30)
122 stq r12, 32(r30)
123 stq r13, 40(r30)
124 stq r14, 48(r30)
125 stq r15, 56(r30)
126 stq r29, 64(r30)
128 and r18, 7, r20 C count for the first loop, 0-7
129 srl r18, 3, r18 C count for unrolled loop
130 bis r31, r31, r21 C r21 = 0: carry limb into the unrolled code if the first loop is skipped
131 beq r20, $L_8_or_more C skip first loop
C First loop: handle the (size mod 8) limbs one at a time, using the same
C scheme as $Lsmall but with r20 as the counter and r21 as the high product.
C On exit r21 holds the carry limb for the unrolled code and r16/r17 point
C past the limbs handled here.
133 $L_9_or_more:
134 ldq r2,0(r17) C r2 = s1_limb
135 lda r17,8(r17) C s1_ptr++
136 lda r20,-1(r20) C size--
137 mulq r2,r19,r3 C r3 = prod_low
138 umulh r2,r19,r21 C r21 = prod_high
139 beq r20,$Le1b C jump if size was == 1
140 bis r31, r31, r0 C FIXME: shouldn't need this (as written, r0 is the carry-in added at $Lopb/$Le2b, so it must start at 0)
141 ldq r2,0(r17) C r2 = s1_limb
142 lda r17,8(r17) C s1_ptr++
143 lda r20,-1(r20) C size--
144 stq r3,0(r16)
145 lda r16,8(r16) C res_ptr++
146 beq r20,$Le2b C jump if size was == 2
147 ALIGN(8)
148 $Lopb: mulq r2,r19,r3 C r3 = prod_low
149 addq r21,r0,r0 C cy_limb = cy_limb + 'cy'
150 lda r20,-1(r20) C size--
151 umulh r2,r19,r21 C r21 = prod_high
152 ldq r2,0(r17) C r2 = s1_limb
153 lda r17,8(r17) C s1_ptr++
154 addq r3,r0,r3 C r3 = cy_limb + prod_low
155 stq r3,0(r16)
156 cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
157 lda r16,8(r16) C res_ptr++
158 bne r20,$Lopb
160 $Le2b: mulq r2,r19,r3 C r3 = prod_low
161 addq r21,r0,r0 C cy_limb = cy_limb + 'cy'
162 umulh r2,r19,r21 C r21 = prod_high
163 addq r3,r0,r3 C r3 = cy_limb + prod_low
164 cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
165 stq r3,0(r16)
166 lda r16,8(r16) C res_ptr++
167 addq r21,r0,r21 C cy_limb = prod_high + cy
168 br r31, $L_8_or_more
169 $Le1b: stq r3,0(r16)
170 lda r16,8(r16) C res_ptr++
C Feed-in code: load the first 8 limbs of the remaining operand and start
C their multiplies.  After the decrement below, r18 is the number of groups
C of 8 still to be started after this one.
172 $L_8_or_more:
173 lda r0, -1(r31) C put -1 in r0, for tricky loop control
174 lda r17, -32(r17) C L1 bookkeeping
175 lda r18, -1(r18) C decrement count
177 ldq r9, 32(r17) C L1
178 ldq r10, 40(r17) C L1
179 mulq r9, r19, r22 C U1 #07
180 ldq r11, 48(r17) C L1
181 umulh r9, r19, r23 C U1 #08
182 ldq r12, 56(r17) C L1
183 mulq r10, r19, r24 C U1 #09
184 ldq r9, 64(r17) C L1
186 lda r17, 64(r17) C L1 bookkeeping
188 umulh r10, r19, r25 C U1 #11
189 mulq r11, r19, r26 C U1 #12
190 umulh r11, r19, r27 C U1 #13
191 mulq r12, r19, r28 C U1 #14
192 ldq r10, 8(r17) C L1
193 umulh r12, r19, r1 C U1 #15
194 ldq r11, 16(r17) C L1
195 mulq r9, r19, r2 C U1 #16
196 ldq r12, 24(r17) C L1
197 umulh r9, r19, r3 C U1 #17
198 addq r21, r22, r13 C L1 mov
199 mulq r10, r19, r4 C U1 #18
200 addq r23, r24, r22 C L0 sum 2 mul's
201 cmpult r13, r21, r14 C L1 carry from sum
202 bgt r18, $L_16_or_more C taken when at least two groups of 8 are to be done
C Exactly one group of 8: issue its remaining multiplies (no further loads)
C and join the wind-down code at $ret0c.
204 cmpult r22, r24, r24 C U0 carry from sum
205 umulh r10, r19, r5 C U1 #02
206 addq r25, r26, r23 C U0 sum 2 mul's
207 mulq r11, r19, r6 C U1 #03
208 cmpult r23, r26, r25 C U0 carry from sum
209 umulh r11, r19, r7 C U1 #04
210 addq r27, r28, r28 C U0 sum 2 mul's
211 mulq r12, r19, r8 C U1 #05
212 cmpult r28, r27, r15 C L0 carry from sum
213 lda r16, 32(r16) C L1 bookkeeping
214 addq r13, r31, r13 C U0 start carry cascade
215 umulh r12, r19, r21 C U1 #06
216 br r31, $ret0c
C First pass of the pipeline.  The instruction sequence follows $Loop, except
C that there are no results from an earlier group to store and r31 (zero) is
C added in place of a carry from an earlier group.  $fix1w-$fix3w return
C here; carries 4-6 use $fix4-$fix6, which return to $ret4-$ret6 inside
C $Loop, where the remaining instructions match those from $ret4w onward.
218 $L_16_or_more:
219 C ---------------------------------------------------------------
220 subq r18,1,r18
221 cmpult r22, r24, r24 C U0 carry from sum
222 ldq r9, 32(r17) C L1
224 umulh r10, r19, r5 C U1 #02
225 addq r25, r26, r23 C U0 sum 2 mul's
226 mulq r11, r19, r6 C U1 #03
227 cmpult r23, r26, r25 C U0 carry from sum
228 umulh r11, r19, r7 C U1 #04
229 addq r27, r28, r28 C U0 sum 2 mul's
230 mulq r12, r19, r8 C U1 #05
231 cmpult r28, r27, r15 C L0 carry from sum
232 lda r16, 32(r16) C L1 bookkeeping
233 addq r13, r31, r13 C U0 start carry cascade
235 umulh r12, r19, r21 C U1 #06
236 C beq r13, $fix0w C U0
237 $ret0w: addq r22, r14, r26 C L0
238 ldq r10, 40(r17) C L1
240 mulq r9, r19, r22 C U1 #07
241 beq r26, $fix1w C U0
242 $ret1w: addq r23, r24, r27 C L0
243 ldq r11, 48(r17) C L1
245 umulh r9, r19, r23 C U1 #08
246 beq r27, $fix2w C U0
247 $ret2w: addq r28, r25, r28 C L0
248 ldq r12, 56(r17) C L1
250 mulq r10, r19, r24 C U1 #09
251 beq r28, $fix3w C U0
252 $ret3w: addq r1, r2, r20 C L0 sum 2 mul's
253 ldq r9, 64(r17) C L1
255 addq r3, r4, r2 C L0 #10 2 mul's
256 lda r17, 64(r17) C L1 bookkeeping
257 cmpult r20, r1, r29 C U0 carry from sum
259 umulh r10, r19, r25 C U1 #11
260 cmpult r2, r4, r4 C U0 carry from sum
261 stq r13, -32(r16) C L0
262 stq r26, -24(r16) C L1
264 mulq r11, r19, r26 C U1 #12
265 addq r5, r6, r14 C U0 sum 2 mul's
266 stq r27, -16(r16) C L0
267 stq r28, -8(r16) C L1
269 umulh r11, r19, r27 C U1 #13
270 cmpult r14, r6, r3 C U0 carry from sum
271 C could do cross-jumping here:
272 C bra $L_middle_of_unrolled_loop
273 mulq r12, r19, r28 C U1 #14
274 addq r7, r3, r5 C L0 eat carry
275 addq r20, r15, r20 C U0 carry cascade
276 ldq r10, 8(r17) C L1
278 umulh r12, r19, r1 C U1 #15
279 beq r20, $fix4 C U0
280 $ret4w: addq r2, r29, r6 C L0
281 ldq r11, 16(r17) C L1
283 mulq r9, r19, r2 C U1 #16
284 beq r6, $fix5 C U0
285 $ret5w: addq r14, r4, r7 C L0
286 ldq r12, 24(r17) C L1
288 umulh r9, r19, r3 C U1 #17
289 beq r7, $fix6 C U0
290 $ret6w: addq r5, r8, r8 C L0 sum 2
291 addq r21, r22, r13 C L1 sum 2 mul's
293 mulq r10, r19, r4 C U1 #18
294 addq r23, r24, r22 C L0 sum 2 mul's
295 cmpult r13, r21, r14 C L1 carry from sum
296 ble r18, $Lend C U0
297 C ---------------------------------------------------------------
298 ALIGN(16)
C Main 8-way unrolled loop.  Each pass loads limbs for later multiplies,
C issues multiplies on limbs loaded earlier, and stores sums built from
C earlier products (see the software pipeline note in the file header).
C Each beq to a $fixN label tests a sum that had a 0/1 carry added to it.
C r0 holds -1, so the umulh below leaves r18 - 1 in r18 for r18 >= 1.
299 $Loop:
300 umulh r0, r18, r18 C U1 #01 decrement r18!
301 cmpult r8, r5, r29 C L0 carry from last bunch
302 cmpult r22, r24, r24 C U0 carry from sum
303 ldq r9, 32(r17) C L1
305 umulh r10, r19, r5 C U1 #02
306 addq r25, r26, r23 C U0 sum 2 mul's
307 stq r20, 0(r16) C L0
308 stq r6, 8(r16) C L1
310 mulq r11, r19, r6 C U1 #03
311 cmpult r23, r26, r25 C U0 carry from sum
312 stq r7, 16(r16) C L0
313 stq r8, 24(r16) C L1
315 umulh r11, r19, r7 C U1 #04
316 bis r31, r31, r31 C L0 st slosh
317 bis r31, r31, r31 C L1 st slosh
318 addq r27, r28, r28 C U0 sum 2 mul's
320 mulq r12, r19, r8 C U1 #05
321 cmpult r28, r27, r15 C L0 carry from sum
322 lda r16, 64(r16) C L1 bookkeeping
323 addq r13, r29, r13 C U0 start carry cascade
325 umulh r12, r19, r21 C U1 #06
326 beq r13, $fix0 C U0
327 $ret0: addq r22, r14, r26 C L0
328 ldq r10, 40(r17) C L1
330 mulq r9, r19, r22 C U1 #07
331 beq r26, $fix1 C U0
332 $ret1: addq r23, r24, r27 C L0
333 ldq r11, 48(r17) C L1
335 umulh r9, r19, r23 C U1 #08
336 beq r27, $fix2 C U0
337 $ret2: addq r28, r25, r28 C L0
338 ldq r12, 56(r17) C L1
340 mulq r10, r19, r24 C U1 #09
341 beq r28, $fix3 C U0
342 $ret3: addq r1, r2, r20 C L0 sum 2 mul's
343 ldq r9, 64(r17) C L1
345 addq r3, r4, r2 C L0 #10 2 mul's
346 bis r31, r31, r31 C U1 mul hole
347 lda r17, 64(r17) C L1 bookkeeping
348 cmpult r20, r1, r29 C U0 carry from sum
350 umulh r10, r19, r25 C U1 #11
351 cmpult r2, r4, r4 C U0 carry from sum
352 stq r13, -32(r16) C L0
353 stq r26, -24(r16) C L1
355 mulq r11, r19, r26 C U1 #12
356 addq r5, r6, r14 C U0 sum 2 mul's
357 stq r27, -16(r16) C L0
358 stq r28, -8(r16) C L1
360 umulh r11, r19, r27 C U1 #13
361 bis r31, r31, r31 C L0 st slosh
362 bis r31, r31, r31 C L1 st slosh
363 cmpult r14, r6, r3 C U0 carry from sum
364 $L_middle_of_unrolled_loop:
365 mulq r12, r19, r28 C U1 #14
366 addq r7, r3, r5 C L0 eat carry
367 addq r20, r15, r20 C U0 carry cascade
368 ldq r10, 8(r17) C L1
370 umulh r12, r19, r1 C U1 #15
371 beq r20, $fix4 C U0
372 $ret4: addq r2, r29, r6 C L0
373 ldq r11, 16(r17) C L1
375 mulq r9, r19, r2 C U1 #16
376 beq r6, $fix5 C U0
377 $ret5: addq r14, r4, r7 C L0
378 ldq r12, 24(r17) C L1
380 umulh r9, r19, r3 C U1 #17
381 beq r7, $fix6 C U0
382 $ret6: addq r5, r8, r8 C L0 sum 2
383 addq r21, r22, r13 C L1 sum 2 mul's
385 mulq r10, r19, r4 C U1 #18
386 addq r23, r24, r22 C L0 sum 2 mul's
387 cmpult r13, r21, r14 C L1 carry from sum
388 bgt r18, $Loop C U0
389 C ---------------------------------------------------------------
C Wind-down code: store the results still pending from the previous pass and
C finish the last group of 8.  No more limbs are loaded from here on.
390 $Lend:
391 cmpult r8, r5, r29 C L0 carry from last bunch
392 cmpult r22, r24, r24 C U0 carry from sum
394 umulh r10, r19, r5 C U1 #02
395 addq r25, r26, r23 C U0 sum 2 mul's
396 stq r20, 0(r16) C L0
397 stq r6, 8(r16) C L1
399 mulq r11, r19, r6 C U1 #03
400 cmpult r23, r26, r25 C U0 carry from sum
401 stq r7, 16(r16) C L0
402 stq r8, 24(r16) C L1
404 umulh r11, r19, r7 C U1 #04
405 addq r27, r28, r28 C U0 sum 2 mul's
407 mulq r12, r19, r8 C U1 #05
408 cmpult r28, r27, r15 C L0 carry from sum
409 lda r16, 64(r16) C L1 bookkeeping
410 addq r13, r29, r13 C U0 start carry cascade
412 umulh r12, r19, r21 C U1 #06
413 beq r13, $fix0c C U0
414 $ret0c: addq r22, r14, r26 C L0
415 beq r26, $fix1c C U0
416 $ret1c: addq r23, r24, r27 C L0
417 beq r27, $fix2c C U0
418 $ret2c: addq r28, r25, r28 C L0
419 beq r28, $fix3c C U0
420 $ret3c: addq r1, r2, r20 C L0 sum 2 mul's
421 addq r3, r4, r2 C L0 #10 2 mul's
422 lda r17, 64(r17) C L1 bookkeeping
423 cmpult r20, r1, r29 C U0 carry from sum
424 cmpult r2, r4, r4 C U0 carry from sum
425 stq r13, -32(r16) C L0
426 stq r26, -24(r16) C L1
427 addq r5, r6, r14 C U0 sum 2 mul's
428 stq r27, -16(r16) C L0
429 stq r28, -8(r16) C L1
430 cmpult r14, r6, r3 C U0 carry from sum
431 addq r7, r3, r5 C L0 eat carry
432 addq r20, r15, r20 C U0 carry cascade
433 beq r20, $fix4c C U0
434 $ret4c: addq r2, r29, r6 C L0
435 beq r6, $fix5c C U0
436 $ret5c: addq r14, r4, r7 C L0
437 beq r7, $fix6c C U0
438 $ret6c: addq r5, r8, r8 C L0 sum 2
439 cmpult r8, r5, r29 C L0 carry from last bunch
440 stq r20, 0(r16) C L0
441 stq r6, 8(r16) C L1
442 stq r7, 16(r16) C L0
443 stq r8, 24(r16) C L1
444 addq r29, r21, r0 C return value: last high product (r21) plus the carry from the last sum
C Restore the registers saved at $Large and release the 224-byte frame.
446 ldq r26, 0(r30)
447 ldq r9, 8(r30)
448 ldq r10, 16(r30)
449 ldq r11, 24(r30)
450 ldq r12, 32(r30)
451 ldq r13, 40(r30)
452 ldq r14, 48(r30)
453 ldq r15, 56(r30)
454 ldq r29, 64(r30)
455 lda r30, 224(r30)
456 ret r31, (r26), 1
C Carry fixup stubs, kept out of line.  Each is reached when a sum that had a
C 0/1 carry added to it came out as zero.  The carry that was added is then
C OR-ed into the carry register used by the next step (for $fix6/$fix6c it is
C added to r5 instead), and control returns to the matching $ret label.
C The w, plain and c suffixes belong to the first pass, the main loop and the
C wind-down code respectively.
458 C $fix0w: bis r14, r29, r14 C join carries
459 C br r31, $ret0w
460 $fix1w: bis r24, r14, r24 C join carries
461 br r31, $ret1w
462 $fix2w: bis r25, r24, r25 C join carries
463 br r31, $ret2w
464 $fix3w: bis r15, r25, r15 C join carries
465 br r31, $ret3w
466 $fix0: bis r14, r29, r14 C join carries
467 br r31, $ret0
468 $fix1: bis r24, r14, r24 C join carries
469 br r31, $ret1
470 $fix2: bis r25, r24, r25 C join carries
471 br r31, $ret2
472 $fix3: bis r15, r25, r15 C join carries
473 br r31, $ret3
474 $fix4: bis r29, r15, r29 C join carries
475 br r31, $ret4
476 $fix5: bis r4, r29, r4 C join carries
477 br r31, $ret5
478 $fix6: addq r5, r4, r5 C can't carry twice!
479 br r31, $ret6
480 $fix0c: bis r14, r29, r14 C join carries
481 br r31, $ret0c
482 $fix1c: bis r24, r14, r24 C join carries
483 br r31, $ret1c
484 $fix2c: bis r25, r24, r25 C join carries
485 br r31, $ret2c
486 $fix3c: bis r15, r25, r15 C join carries
487 br r31, $ret3c
488 $fix4c: bis r29, r15, r29 C join carries
489 br r31, $ret4c
490 $fix5c: bis r4, r29, r4 C join carries
491 br r31, $ret5c
492 $fix6c: addq r5, r4, r5 C can't carry twice!
493 br r31, $ret6c
495 EPILOGUE(mpn_mul_1)
496 ASM_END()