1 # Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
2 # the result to a second limb vector.
4 # Copyright (C) 2000-2023 Free Software Foundation, Inc.
6 # This file is part of the GNU MP Library.
8 # The GNU MP Library is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU Lesser General Public License as published
10 # by the Free Software Foundation; either version 2.1 of the License, or (at
11 # your option) any later version.
13 # The GNU MP Library is distributed in the hope that it will be useful, but
14 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 # License for more details.
18 # You should have received a copy of the GNU Lesser General Public License
19 # along with the GNU MP Library. If not, see <https://www.gnu.org/licenses/>.
27 # This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
28 # exactly 3.625 cycles/limb on EV6...
30 # This code was written in close cooperation with ev6 pipeline expert
31 # Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though.
33 # Register usages for unrolled loop:
38 # 22,23 save for stores
40 # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
42 # The stores can issue a cycle late so we have paired no-ops to 'catch'
43 # them, so that further disturbance to the schedule is damped.
45 # We couldn't pair the loads, because the entangled schedule of the
46 # carries has to happen on one side {0} of the machine. Note the total
47 # use of U0, and the total use of L0 (after attending to the stores),
48 # which is part of the reason why....
50 # This is a great schedule for the d_cache, a poor schedule for the
51 # b_cache. The lockup on U0 means that any stall can't be recovered
52 # from. Consider a ldq in L1. Say that load gets stalled because it
53 # collides with a fill from the b_Cache. On the next cycle, this load
54 # gets priority. It first looks at L0, and goes there. The instruction
55 # we intended for L0 gets to look at L1, which is NOT where we want
56 # it. It either stalls 1, because it can't go in L0, or goes there, and
57 # causes a further instruction to stall.
59 # So for b_cache, we're likely going to want to put one or more cycles
60 # back into the code! And, of course, put in prefetches. For the
61 # accumulator, lds, intent to modify. For the multiplier, you might
62 # want ldq, evict next, if you're not wanting to use it again soon. Use
63 # 256 ahead of present pointer value. At a place where we have an mt
64 # followed by a bookkeeping, put the bookkeeping in upper, and the
65 # prefetch into lower.
67 # Note, the usage of physical registers per cycle is smoothed off, as
70 # Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
71 # like not to have a ldq or stq precede a conditional branch in a
72 # quadpack. The conditional branch moves the retire pointer one cycle
76 # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
77 # Reserved regs: $29 $30 $31
78 # Free caller-saves regs in unrolled code: $24 $25 $28
79 # We should swap some of the callee-saves regs for some of the free
80 # caller-saves regs, saving some overhead cycles.
81 # Most importantly, we should write fast code for the 0-7 case.
82 # The code we use there is for the 21164, and runs at 7 cycles/limb
83 # on the 21264. Should not be hard, if we write specialized code for
84 # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
85 # need a jump table indexed by the low 3 bits of the count argument.
# Simple multiply-accumulate path that handles one limb per iteration and is
# counted directly by the size register $18.
# Register roles, as used by the visible instructions:
#   $16 = res_ptr (vector being accumulated into)
#   $17 = s1_ptr  (source vector), advanced 8 bytes per limb
#   $18 = limbs still to process
#   $19 = multiplier limb (second operand of every mulq/umulh)
#   $2  = current source limb, $3 = low product, $5 = loaded *res_ptr
#   $0, $4 = high product / carry limb handed from one limb to the next
# NOTE(review): the embedded listing numbers skip (109 -> 113, 125 -> 129,
# 138 -> 142, ...), so part of the instruction stream is absent from this
# excerpt. No visible instruction adds $5 into $3, stores the result, or
# branches back to $Loop0, and the targets $Lend0a/$Lend0b are not shown.
# Presumably those live on the absent lines -- confirm against the full file.
100 ldq $2, 0($17) # $2 = s1_limb
101 addq $17, 8, $17 # s1_ptr++
102 subq $18, 1, $18 # size--
103 mulq $2, $19, $3 # $3 = prod_low
104 ldq $5, 0($16) # $5 = *res_ptr
105 umulh $2, $19, $0 # $0 = prod_high
106 beq $18, $Lend0b # jump if size was == 1
# Second limb is fetched before entering the loop so that the loop body can
# start with its multiply straight away.
107 ldq $2, 0($17) # $2 = s1_limb
108 addq $17, 8, $17 # s1_ptr++
109 subq $18, 1, $18 # size--
113 addq $16, 8, $16 # res_ptr++
114 beq $18, $Lend0a # jump if size was == 2
# Loop body: multiply the limb loaded on the previous pass, fetch the next
# source limb, and fold the carry-in ($0) into the low product.
117 $Loop0: mulq $2, $19, $3 # $3 = prod_low
118 ldq $5, 0($16) # $5 = *res_ptr
119 addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
120 subq $18, 1, $18 # size--
121 umulh $2, $19, $4 # $4 = cy_limb
122 ldq $2, 0($17) # $2 = s1_limb
123 addq $17, 8, $17 # s1_ptr++
124 addq $3, $0, $3 # $3 = cy_limb + prod_low
125 cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
129 addq $16, 8, $16 # res_ptr++
# NOTE(review): $5 is used as a carry here; presumably an absent instruction
# has replaced the loaded limb with a carry flag by this point.
130 addq $5, $0, $0 # combine carries
# Last limb: same multiply/carry sequence as the loop body, but without
# loading another source limb or decrementing the count.
133 mulq $2, $19, $3 # $3 = prod_low
134 ldq $5, 0($16) # $5 = *res_ptr
135 addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
136 umulh $2, $19, $4 # $4 = cy_limb
137 addq $3, $0, $3 # $3 = cy_limb + prod_low
138 cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
142 addq $5, $0, $0 # combine carries
143 addq $4, $0, $0 # cy_limb = prod_high + cy
# Entry to the unrolled path: split the limb count into a remainder handled
# one limb at a time ($20 = size mod 8) and a count of 8-limb groups
# ($18 = size / 8), then run the remainder through $Loop1.
# $Loop1 mirrors $Loop0 instruction for instruction, except that it counts
# with $20 instead of $18.
# NOTE(review): the embedded listing numbers skip (163 -> 166, 175 -> 179,
# 191 -> 195, 211 -> 217, ...), so part of the instruction stream is absent
# from this excerpt: no visible instruction adds $5 into $3, stores a result,
# skips the remainder loop when $20 is 0, or branches back to $Loop1, and the
# targets $Lend1a/$Lend1b are not shown. Confirm against the full file.
162 and $18, 7, $20 # count for the first loop, 0-7
163 srl $18, 3, $18 # count for unrolled loop
166 ldq $2, 0($17) # $2 = s1_limb
167 addq $17, 8, $17 # s1_ptr++
168 subq $20, 1, $20 # count-- ($20 = limbs left for this loop)
169 mulq $2, $19, $3 # $3 = prod_low
170 ldq $5, 0($16) # $5 = *res_ptr
171 umulh $2, $19, $0 # $0 = prod_high
172 beq $20, $Lend1b # jump if count was == 1
173 ldq $2, 0($17) # $2 = s1_limb
174 addq $17, 8, $17 # s1_ptr++
175 subq $20, 1, $20 # count--
179 addq $16, 8, $16 # res_ptr++
180 beq $20, $Lend1a # jump if count was == 2
# Loop body: multiply the limb loaded on the previous pass, fetch the next
# source limb, and fold the carry-in ($0) into the low product.
183 $Loop1: mulq $2, $19, $3 # $3 = prod_low
184 ldq $5, 0($16) # $5 = *res_ptr
185 addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
186 subq $20, 1, $20 # count--
187 umulh $2, $19, $4 # $4 = cy_limb
188 ldq $2, 0($17) # $2 = s1_limb
189 addq $17, 8, $17 # s1_ptr++
190 addq $3, $0, $3 # $3 = cy_limb + prod_low
191 cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
195 addq $16, 8, $16 # res_ptr++
196 addq $5, $0, $0 # combine carries
# Last remainder limb: same sequence as the loop body, without loading another
# source limb or decrementing the count.
200 mulq $2, $19, $3 # $3 = prod_low
201 ldq $5, 0($16) # $5 = *res_ptr
202 addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
203 umulh $2, $19, $4 # $4 = cy_limb
204 addq $3, $0, $3 # $3 = cy_limb + prod_low
205 cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
209 addq $16, 8, $16 # res_ptr++
210 addq $5, $0, $0 # combine carries
211 addq $4, $0, $0 # cy_limb = prod_high + cy
217 addq $16, 8, $16 # res_ptr++
# Bias both pointers back by 16 bytes. The unrolled code advances them by 64
# and then addresses memory with negative offsets such as -16($17).
221 lda $17, -16($17) # L1 bookkeeping
222 lda $16, -16($16) # L1 bookkeeping
225 # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
# Primes the software pipeline for the unrolled loop: starts the first
# multiplies, retires the first two result limbs, and leaves a third result
# pending in $22 for the main loop (or the finish code) to complete.
# Register roles, as used by the visible instructions:
#   $19            multiplier limb
#   $2, $3, $0, $1 source limbs fed to mulq/umulh
#   ($13,$14) ($15,$8) ($9,$10) ($11,$12)
#                  (low, high) product pairs for $2, $3, $0, $1 respectively
#   $6, $7, $4, $5 accumulator limbs that the low products are added to
#   $20            carry out of "acc + low product"
#   $21            carry out of adding the previous limb's high part
#   $22, $23       finished result limbs waiting to be stored
#   $18            count of 8-limb groups still to do
# Per-limb pattern:
#   t = acc + lo;  $20 = (t < lo);  r = t + prev_hi;  $21 = (r < prev_hi);
#   hi += $20;  hi += $21;  (hi then serves as prev_hi for the next limb)
# The U0/U1/L0/L1 tags in the trailing comments appear to name the issue slot
# each instruction was scheduled for (see the header's discussion of U0/L0).
# NOTE(review): the embedded listing numbers skip (225 -> 229, 229 -> 233,
# ...); the initial loads of the source/accumulator registers and the setup of
# the incoming carry limb in $12 are not visible here. Confirm against the
# full file.
229 lda $18, -1($18) # L1 bookkeeping: group count--
233 mulq $19, $2, $13 # U1
235 umulh $19, $2, $14 # U1
236 mulq $19, $3, $15 # U1
237 lda $17, 64($17) # L1 bookkeeping: source pointer += 64 bytes
240 umulh $19, $3, $8 # U1
241 ldq $2, -16($17) # L1
242 mulq $19, $0, $9 # U1
244 umulh $19, $0, $10 # U1
245 addq $6, $13, $6 # L0 lo + acc
246 mulq $19, $1, $11 # U1
247 cmpult $6, $13, $20 # L0 lo add => carry
248 lda $16, 64($16) # L1 bookkeeping: result pointer += 64 bytes
# $12 is read here as the carry-in limb before umulh overwrites it below.
249 addq $6, $12, $22 # U0 hi add => answer
250 cmpult $22, $12, $21 # L0 hi add => carry
251 addq $14, $20, $14 # U0 hi mul + carry
252 ldq $6, -16($16) # L1
253 addq $7, $15, $23 # L0 lo + acc
254 addq $14, $21, $14 # U0 hi mul + carry
256 umulh $19, $1, $12 # U1
257 cmpult $23, $15, $20 # L0 lo add => carry
258 addq $23, $14, $23 # U0 hi add => answer
260 mulq $19, $2, $13 # U1
261 cmpult $23, $14, $21 # L0 hi add => carry
262 addq $8, $20, $8 # U0 hi mul + carry
264 umulh $19, $2, $14 # U1
265 addq $4, $9, $4 # L0 lo + acc
# Store the first two finished limbs; offsets are relative to the already
# advanced result pointer.
266 stq $22, -48($16) # L0
267 stq $23, -40($16) # L1
268 mulq $19, $3, $15 # U1
269 addq $8, $21, $8 # U0 hi mul + carry
270 cmpult $4, $9, $20 # L0 lo add => carry
271 addq $4, $8, $22 # U0 hi add => answer
# If no group remains after the decrement above, skip the main loop.
272 ble $18, $Lend # U1 bookkeeping
274 # ____ MAIN UNROLLED LOOP ____
# Steady state of the software pipeline. Each pass issues eight mulq/umulh
# pairs against the multiplier $19 (source limbs cycle through $3, $0, $1,
# $2), decrements the group count $18 once, and advances both $17 and $16 by
# 64 bytes.
# Per-limb pattern (identical to the startup code):
#   t = acc + lo;  $20 = (t < lo);  r = t + prev_hi;  $21 = (r < prev_hi);
#   hi += $20;  hi += $21;  (hi then serves as prev_hi for the next limb)
# Results alternate between $22 and $23 and are stored in pairs.
# The `bis $31, $31, $31` lines are the scheduling no-ops the header
# describes; "mt" presumably marks an otherwise empty issue slot and
# "st slosh" the slack left for a store that issues a cycle late.
# NOTE(review): the embedded listing numbers skip throughout this section, so
# part of the instruction stream is absent: the $Loop label targeted by the
# closing bgt, most of the loads (only those into $2 and $6 are visible), and
# one pair of stores (six stq are visible for eight limbs). Confirm against
# the full file.
277 bis $31, $31, $31 # U1 mt
278 cmpult $22, $8, $21 # L0 hi add => carry
279 addq $10, $20, $10 # U0 hi mul + carry
282 bis $31, $31, $31 # U1 mt
283 addq $5, $11, $23 # L0 lo + acc
284 addq $10, $21, $10 # L0 hi mul + carry
287 umulh $19, $3, $8 # U1
288 cmpult $23, $11, $20 # L0 lo add => carry
289 addq $23, $10, $23 # U0 hi add => answer
292 mulq $19, $0, $9 # U1
293 cmpult $23, $10, $21 # L0 hi add => carry
294 addq $12, $20, $12 # U0 hi mul + carry
297 umulh $19, $0, $10 # U1
298 addq $6, $13, $6 # L0 lo + acc
299 stq $22, -32($16) # L0
300 stq $23, -24($16) # L1
302 bis $31, $31, $31 # L0 st slosh
303 mulq $19, $1, $11 # U1
304 bis $31, $31, $31 # L1 st slosh
305 addq $12, $21, $12 # U0 hi mul + carry
307 cmpult $6, $13, $20 # L0 lo add => carry
308 bis $31, $31, $31 # U1 mt
309 lda $18, -1($18) # L1 bookkeeping: group count--
310 addq $6, $12, $22 # U0 hi add => answer
312 bis $31, $31, $31 # U1 mt
313 cmpult $22, $12, $21 # L0 hi add => carry
314 addq $14, $20, $14 # U0 hi mul + carry
317 bis $31, $31, $31 # U1 mt
318 addq $7, $15, $23 # L0 lo + acc
319 addq $14, $21, $14 # U0 hi mul + carry
322 umulh $19, $1, $12 # U1
323 cmpult $23, $15, $20 # L0 lo add => carry
324 addq $23, $14, $23 # U0 hi add => answer
327 mulq $19, $2, $13 # U1
328 cmpult $23, $14, $21 # L0 hi add => carry
329 addq $8, $20, $8 # U0 hi mul + carry
332 umulh $19, $2, $14 # U1
333 addq $4, $9, $4 # U0 lo + acc
334 stq $22, -16($16) # L0
335 stq $23, -8($16) # L1
337 bis $31, $31, $31 # L0 st slosh
338 mulq $19, $3, $15 # U1
339 bis $31, $31, $31 # L1 st slosh
340 addq $8, $21, $8 # L0 hi mul + carry
342 cmpult $4, $9, $20 # L0 lo add => carry
343 bis $31, $31, $31 # U1 mt
344 lda $17, 64($17) # L1 bookkeeping: source pointer += 64 bytes
345 addq $4, $8, $22 # U0 hi add => answer
347 bis $31, $31, $31 # U1 mt
348 cmpult $22, $8, $21 # L0 hi add => carry
349 addq $10, $20, $10 # U0 hi mul + carry
352 bis $31, $31, $31 # U1 mt
353 addq $5, $11, $23 # L0 lo + acc
354 addq $10, $21, $10 # L0 hi mul + carry
357 umulh $19, $3, $8 # U1
358 cmpult $23, $11, $20 # L0 lo add => carry
359 addq $23, $10, $23 # U0 hi add => answer
360 ldq $2, -16($17) # L1
362 mulq $19, $0, $9 # U1
363 cmpult $23, $10, $21 # L0 hi add => carry
364 addq $12, $20, $12 # U0 hi mul + carry
367 umulh $19, $0, $10 # U1
368 addq $6, $13, $6 # L0 lo + acc
372 bis $31, $31, $31 # L0 st slosh
373 mulq $19, $1, $11 # U1
374 bis $31, $31, $31 # L1 st slosh
375 addq $12, $21, $12 # U0 hi mul + carry
377 cmpult $6, $13, $20 # L0 lo add => carry
378 bis $31, $31, $31 # U1 mt
379 lda $16, 64($16) # L1 bookkeeping: result pointer += 64 bytes
380 addq $6, $12, $22 # U0 hi add => answer
382 bis $31, $31, $31 # U1 mt
383 cmpult $22, $12, $21 # L0 hi add => carry
384 addq $14, $20, $14 # U0 hi mul + carry
385 ldq $6, -16($16) # L1
387 bis $31, $31, $31 # U1 mt
388 addq $7, $15, $23 # L0 lo + acc
389 addq $14, $21, $14 # U0 hi mul + carry
392 umulh $19, $1, $12 # U1
393 cmpult $23, $15, $20 # L0 lo add => carry
394 addq $23, $14, $23 # U0 hi add => answer
397 mulq $19, $2, $13 # U1
398 cmpult $23, $14, $21 # L0 hi add => carry
399 addq $8, $20, $8 # U0 hi mul + carry
402 umulh $19, $2, $14 # U1
403 addq $4, $9, $4 # L0 lo + acc
404 stq $22, -48($16) # L0
405 stq $23, -40($16) # L1
407 bis $31, $31, $31 # L0 st slosh
408 mulq $19, $3, $15 # U1
409 bis $31, $31, $31 # L1 st slosh
410 addq $8, $21, $8 # U0 hi mul + carry
412 cmpult $4, $9, $20 # L0 lo add => carry
413 addq $4, $8, $22 # U0 hi add => answer
414 bis $31, $31, $31 # L1 mt
# Repeat while 8-limb groups remain; otherwise fall through to the finish.
415 bgt $18, $Loop # U1 bookkeeping
417 # ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
# Drains the software pipeline: completes the limbs whose products are
# already in flight, issues the last few multiplies, and propagates the carry
# chain to the end without the no-op padding or pointer bookkeeping of the
# main loop.
# Per-limb pattern is unchanged:
#   t = acc + lo;  $20 = (t < lo);  r = t + prev_hi;  $21 = (r < prev_hi);
#   hi += $20;  hi += $21;
# The last instruction leaves the final high limb plus carry in $0.
# NOTE(review): $0 is presumably the carry limb handed back to the caller;
# the return sequence is not visible. The $Lend label targeted by the startup
# code's ble, the stores of the last two results ($22/$23 computed near the
# end), and any register restore are also absent from this excerpt (the
# embedded listing numbers skip 462 -> 465). Confirm against the full file.
419 cmpult $22, $8, $21 # L0 hi add => carry
420 addq $10, $20, $10 # U0 hi mul + carry
422 addq $5, $11, $23 # L0 lo + acc
423 addq $10, $21, $10 # L0 hi mul + carry
425 umulh $19, $3, $8 # U1
426 cmpult $23, $11, $20 # L0 lo add => carry
427 addq $23, $10, $23 # U0 hi add => answer
428 mulq $19, $0, $9 # U1
429 cmpult $23, $10, $21 # L0 hi add => carry
430 addq $12, $20, $12 # U0 hi mul + carry
431 umulh $19, $0, $10 # U1
432 addq $6, $13, $6 # L0 lo + acc
433 stq $22, -32($16) # L0
434 stq $23, -24($16) # L1
435 mulq $19, $1, $11 # U1
436 addq $12, $21, $12 # U0 hi mul + carry
437 cmpult $6, $13, $20 # L0 lo add => carry
438 addq $6, $12, $22 # U0 hi add => answer
439 cmpult $22, $12, $21 # L0 hi add => carry
440 addq $14, $20, $14 # U0 hi mul + carry
441 addq $7, $15, $23 # L0 lo + acc
442 addq $14, $21, $14 # U0 hi mul + carry
# Last multiply of the operation; everything after it is carry propagation.
443 umulh $19, $1, $12 # U1
444 cmpult $23, $15, $20 # L0 lo add => carry
445 addq $23, $14, $23 # U0 hi add => answer
446 cmpult $23, $14, $21 # L0 hi add => carry
447 addq $8, $20, $8 # U0 hi mul + carry
448 addq $4, $9, $4 # U0 lo + acc
449 stq $22, -16($16) # L0
450 stq $23, -8($16) # L1
451 bis $31, $31, $31 # L0 st slosh
452 addq $8, $21, $8 # L0 hi mul + carry
453 cmpult $4, $9, $20 # L0 lo add => carry
454 addq $4, $8, $22 # U0 hi add => answer
455 cmpult $22, $8, $21 # L0 hi add => carry
456 addq $10, $20, $10 # U0 hi mul + carry
457 addq $5, $11, $23 # L0 lo + acc
458 addq $10, $21, $10 # L0 hi mul + carry
459 cmpult $23, $11, $20 # L0 lo add => carry
460 addq $23, $10, $23 # U0 hi add => answer
461 cmpult $23, $10, $21 # L0 hi add => carry
462 addq $12, $20, $12 # U0 hi mul + carry
# Final carry limb: last high product plus the two outstanding carries.
465 addq $12, $21, $0 # U0 hi mul + carry