beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k7 / mul_basecase.asm
blob4dfb5008852d142a06fa43c059249ec5847d97eb
1 dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
3 dnl Copyright 1999-2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
C K7: approx 4.42 cycles per cross product at around 20x20 limbs (32
C     limbs/loop unrolling).
39 dnl K7 UNROLL_COUNT cycles/product (at around 20x20)
40 dnl 8 4.67
41 dnl 16 4.59
42 dnl 32 4.42
43 dnl Maximum possible with the current code is 32.
44 dnl
45 dnl At 32 the typical 13-26 limb sizes from the karatsuba code will get
46 dnl done with a straight run through a block of code, no inner loop. Using
47 dnl 32 gives 1k of code, but the k7 has a 64k L1 code cache.
dnl  Limbs processed per pass of the unrolled addmul loop.  See the timing
dnl  table above; 32 is the maximum the current code supports.
deflit(UNROLL_COUNT, 32)


C void mpn_mul_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xsize,
C                        mp_srcptr yp, mp_size_t ysize);
C
C Calculate xp,xsize multiplied by yp,ysize, storing the result in
C wp,xsize+ysize.
C
C This routine is essentially the same as mpn/generic/mul_basecase.c, but
C it's faster because it does most of the mpn_addmul_1() startup
C calculations only once.  The saving is 15-25% on typical sizes coming from
C the Karatsuba multiply code.

dnl  xsize at or above which the unrolled addmul code is used instead of the
dnl  simple loop.  PIC and non-PIC builds are given separate settings so they
dnl  can be tuned independently, though both are currently the same.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 5)
',`
deflit(UNROLL_THRESHOLD, 5)
')

dnl  Stack offsets of the five arguments (presumably relative to the stack
dnl  pointer at entry, with the return address at offset 0 -- see defframe
dnl  in x86-defs.m4).
defframe(PARAM_YSIZE,20)
defframe(PARAM_YP,   16)
defframe(PARAM_XSIZE,12)
defframe(PARAM_XP,   8)
defframe(PARAM_WP,   4)
C -----------------------------------------------------------------------------
C Entry point.  Dispatches on xsize: more than two limbs goes to the general
C code, exactly two limbs to L(two_by_something), and the fall-through case
C is handled as a single limb by single limb product.

	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_basecase)
deflit(`FRAME',0)

	movl	PARAM_XSIZE, %ecx
	movl	PARAM_YP, %eax

	movl	PARAM_XP, %edx
	movl	(%eax), %eax	C yp low limb

	cmpl	$2, %ecx
	ja	L(xsize_more_than_two)
	je	L(two_by_something)


	C one limb by one limb

	mull	(%edx)		C edx:eax = xp[0] * yp[0]

	movl	PARAM_WP, %ecx
	movl	%eax, (%ecx)	C wp[0] = low half
	movl	%edx, 4(%ecx)	C wp[1] = high half
	ret


C -----------------------------------------------------------------------------
L(two_by_something):
	C eax	yp low limb
	C ecx	xsize (2)
	C edx	xp
	C
	C ysize is decremented in place; the resulting zero flag survives the
	C pushes and moves below (none of which alter flags) and is tested by
	C the jnz: non-zero means ysize was not 1, so go to L(two_by_two).

deflit(`FRAME',0)
	decl	PARAM_YSIZE
	pushl	%ebx		defframe_pushl(`SAVE_EBX')
	movl	%eax, %ecx		C yp low limb

	movl	PARAM_WP, %ebx
	pushl	%esi		defframe_pushl(`SAVE_ESI')
	movl	%edx, %esi		C xp

	movl	(%edx), %eax		C xp low limb
	jnz	L(two_by_two)


	C two limbs by one limb

	mull	%ecx			C xp[0] * yp[0]

	movl	%eax, (%ebx)		C wp[0]
	movl	4(%esi), %eax		C xp[1]
	movl	%edx, %esi		C carry

	mull	%ecx			C xp[1] * yp[0]

	addl	%eax, %esi

	movl	%esi, 4(%ebx)		C wp[1]
	movl	SAVE_ESI, %esi

	adcl	$0, %edx		C carry from the addl (movl keeps flags)

	movl	%edx, 8(%ebx)		C wp[2]
	movl	SAVE_EBX, %ebx
	addl	$FRAME, %esp		C drop the two save slots

	ret				C must not fall into L(two_by_two)
C -----------------------------------------------------------------------------
C Two limbs by two limbs, done as four straight-line multiplies.
C
C Could load yp earlier into another register.

	ALIGN(16)
L(two_by_two):
	C eax	xp low limb
	C ebx	wp
	C ecx	yp low limb
	C edx
	C esi	xp
	C edi
	C ebp

dnl  FRAME carries on from previous

	mull	%ecx		C xp[0] * yp[0]

	pushl	%edi		defframe_pushl(`SAVE_EDI')
	movl	%edx, %edi	C carry, for wp[1]

	movl	%eax, (%ebx)	C wp[0]
	movl	4(%esi), %eax

	mull	%ecx		C xp[1] * yp[0]

	addl	%eax, %edi
	movl	PARAM_YP, %ecx	C movl leaves the carry from addl intact

	adcl	$0, %edx
	movl	4(%ecx), %ecx	C yp[1]
	movl	%edi, 4(%ebx)	C wp[1], first contribution

	movl	4(%esi), %eax	C xp[1]
	movl	%edx, %edi	C carry, for wp[2]

	mull	%ecx		C xp[1] * yp[1]

	addl	%eax, %edi

	adcl	$0, %edx
	movl	(%esi), %eax	C xp[0]

	movl	%edx, %esi	C carry, for wp[3]

	mull	%ecx		C xp[0] * yp[1]

	addl	%eax, 4(%ebx)	C wp[1] += low half
	adcl	%edx, %edi	C wp[2] accumulates high half plus carry
	movl	%edi, 8(%ebx)

	adcl	$0, %esi	C propagate carry into wp[3]
	movl	SAVE_EDI, %edi
	movl	%esi, 12(%ebx)

	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx
	addl	$FRAME, %esp	C drop the three save slots

	ret			C must not fall into L(xsize_more_than_two)
C -----------------------------------------------------------------------------
	ALIGN(16)
L(xsize_more_than_two):

C The first limb of yp is processed with a simple mpn_mul_1 style loop
C inline.  Unrolling this doesn't seem worthwhile since it's only run once
C (whereas the addmul below is run ysize-1 many times).  A call to the
C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
C popping, and doesn't seem likely to be worthwhile on the typical 13-26
C limb operations the Karatsuba code calls here with.

	C eax	yp[0]
	C ebx
	C ecx	xsize
	C edx	xp
	C esi
	C edi
	C ebp

dnl  FRAME doesn't carry on from previous, no pushes yet here
defframe(`SAVE_EBX',-4)
defframe(`SAVE_ESI',-8)
defframe(`SAVE_EDI',-12)
defframe(`SAVE_EBP',-16)
deflit(`FRAME',0)

	subl	$16, %esp	C room for the four saved registers
deflit(`FRAME',16)

	movl	%edi, SAVE_EDI
	movl	PARAM_WP, %edi

	movl	%ebx, SAVE_EBX
	movl	%ebp, SAVE_EBP
	movl	%eax, %ebp	C multiplier yp[0]

	movl	%esi, SAVE_ESI
	xorl	%ebx, %ebx	C initial carry
	leal	(%edx,%ecx,4), %esi	C xp end

	leal	(%edi,%ecx,4), %edi	C wp end of mul1
	negl	%ecx		C index runs from -xsize up to 0


L(mul1):
	C eax	scratch
	C ebx	carry
	C ecx	counter, negative
	C edx	scratch
	C esi	xp end
	C edi	wp end of mul1
	C ebp	multiplier

	movl	(%esi,%ecx,4), %eax

	mull	%ebp

	addl	%ebx, %eax
	movl	%eax, (%edi,%ecx,4)
	movl	$0, %ebx	C movl rather than xorl, to keep the carry flag

	adcl	%edx, %ebx	C new carry = high limb + carry out of addl
	incl	%ecx
	jnz	L(mul1)


	movl	PARAM_YSIZE, %edx
	movl	PARAM_XSIZE, %ecx

	movl	%ebx, (%edi)	C final carry
	decl	%edx

	jnz	L(ysize_more_than_one)


	C ysize was 1, the product is complete

	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx

	movl	SAVE_EBP, %ebp
	movl	SAVE_ESI, %esi
	addl	$FRAME, %esp

	ret			C must not fall into L(ysize_more_than_one)
L(ysize_more_than_one):
	C ecx	xsize
	C edx	ysize-1
	C
	C Choose between the simple and the unrolled addmul code.  The movl
	C does not disturb the flags set by the cmpl.

	cmpl	$UNROLL_THRESHOLD, %ecx
	movl	PARAM_YP, %eax

	jae	L(unroll)


C -----------------------------------------------------------------------------
	C simple addmul looping
	C
	C eax	yp
	C ebx
	C ecx	xsize
	C edx	ysize-1
	C esi	xp end
	C edi	wp end of mul1
	C ebp

	leal	4(%eax,%edx,4), %ebp	C yp end
	negl	%ecx
	negl	%edx

	movl	(%esi,%ecx,4), %eax	C xp low limb
	movl	%edx, PARAM_YSIZE	C -(ysize-1)
	incl	%ecx

	xorl	%ebx, %ebx		C initial carry
	movl	%ecx, PARAM_XSIZE	C -(xsize-1)
	movl	%ebp, PARAM_YP

	movl	(%ebp,%edx,4), %ebp	C yp second lowest limb - multiplier
	jmp	L(simple_outer_entry)


	C this is offset 0x121 so close enough to aligned
L(simple_outer_top):
	C ebp	ysize counter, negative

	movl	PARAM_YP, %edx
	movl	PARAM_XSIZE, %ecx	C -(xsize-1)
	xorl	%ebx, %ebx		C carry

	movl	%ebp, PARAM_YSIZE
	addl	$4, %edi		C next position in wp

	movl	(%edx,%ebp,4), %ebp	C yp limb - multiplier
	movl	-4(%esi,%ecx,4), %eax	C xp low limb


L(simple_outer_entry):

L(simple_inner):
	C eax	xp limb
	C ebx	carry limb
	C ecx	loop counter (negative)
	C edx	scratch
	C esi	xp end
	C edi	wp end
	C ebp	multiplier

	mull	%ebp

	addl	%eax, %ebx
	adcl	$0, %edx

	addl	%ebx, (%edi,%ecx,4)
	movl	(%esi,%ecx,4), %eax	C next xp limb, flags untouched
	adcl	$0, %edx

	incl	%ecx
	movl	%edx, %ebx		C high limb becomes the next carry
	jnz	L(simple_inner)


	C last xp limb, already loaded into eax by the loop

	mull	%ebp

	movl	PARAM_YSIZE, %ebp
	addl	%eax, %ebx

	adcl	$0, %edx
	addl	%ebx, (%edi)

	adcl	$0, %edx
	incl	%ebp			C step the outer counter towards zero

	movl	%edx, 4(%edi)		C new top limb of wp (flags untouched)
	jnz	L(simple_outer_top)


	movl	SAVE_EBX, %ebx
	movl	SAVE_ESI, %esi

	movl	SAVE_EDI, %edi
	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp

	ret				C must not fall into L(unroll)
388 C -----------------------------------------------------------------------------
390 C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
391 C comments.
C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
C increment xp and wp.  This is used to adjust back xp and wp, and is
C right-shifted to give an initial VAR_COUNTER at the top of the outer loop.
397 C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
398 C up to -1, inclusive.
400 C VAR_JMP is the computed jump into the unrolled loop.
402 C VAR_XP_LOW is the least significant limb of xp, which is needed at the
403 C start of the unrolled loop.
405 C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
406 C inclusive.
408 C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
409 C added to give the location of the next limb of yp, which is the multiplier
410 C in the unrolled loop.
412 C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
413 C outer loop to take care of xp, wp and the inner loop counter.
dnl  Extra stack slots used only by the unrolled code.  They sit below the
dnl  four SAVE_ slots; VAR_EXTRA_SPACE is the additional stack they need.
defframe(VAR_COUNTER,      -20)
defframe(VAR_ADJUST,       -24)
defframe(VAR_JMP,          -28)
defframe(VAR_XP_LOW,       -32)
deflit(VAR_EXTRA_SPACE, 16)


L(unroll):
	C eax	yp
	C ebx
	C ecx	xsize
	C edx	ysize-1
	C esi	xp end
	C edi	wp end of mul1
	C ebp

	movl	PARAM_XP, %esi
	movl	4(%eax), %ebp		C multiplier (yp second limb)
	leal	4(%eax,%edx,4), %eax	C yp adjust for ysize indexing

	movl	PARAM_WP, %edi
	movl	%eax, PARAM_YP
	negl	%edx

	movl	%edx, PARAM_YSIZE	C -(ysize-1), the outer loop counter
	leal	UNROLL_COUNT-2(%ecx), %ebx	C (xsize-1)+UNROLL_COUNT-1
	decl	%ecx				C xsize-1

	movl	(%esi), %eax		C xp low limb
	andl	$-UNROLL_MASK-1, %ebx	C round down to a multiple of UNROLL_COUNT
	negl	%ecx

	subl	$VAR_EXTRA_SPACE, %esp
deflit(`FRAME',16+VAR_EXTRA_SPACE)
	negl	%ebx
	andl	$UNROLL_MASK, %ecx

	movl	%ebx, VAR_ADJUST
	movl	%ecx, %edx
	shll	$4, %ecx

	sarl	$UNROLL_LOG2, %ebx

	C 17 code bytes per limb, so the entry offset is ecx*16 + edx with
	C edx holding the unshifted copy of ecx
ifdef(`PIC',`
	call	L(pic_calc)
L(unroll_here):
',`
	leal	L(unroll_entry) (%ecx,%edx,1), %ecx
')
	negl	%edx

	movl	%eax, VAR_XP_LOW
	movl	%ecx, VAR_JMP
	leal	4(%edi,%edx,4), %edi	C wp and xp, adjust for unrolling,
	leal	4(%esi,%edx,4), %esi	C  and start at second limb
	jmp	L(unroll_outer_entry)


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	C
	C Adds the return address on the stack, which is L(unroll_here), to
	C the entry offset to give the absolute jump target in ecx.
	leal	(%ecx,%edx,1), %ecx
	addl	$L(unroll_entry)-L(unroll_here), %ecx
	addl	(%esp), %ecx
	ret_internal
')
C --------------------------------------------------------------------------
C Outer loop of the unrolled addmul, one pass per remaining limb of yp.

	ALIGN(32)
L(unroll_outer_top):
	C ebp	ysize counter, negative

	movl	VAR_ADJUST, %ebx
	movl	PARAM_YP, %edx

	movl	VAR_XP_LOW, %eax
	movl	%ebp, PARAM_YSIZE	C store incremented ysize counter

	leal	4(%edi,%ebx,4), %edi	C wp back to the start, then on one limb
	leal	(%esi,%ebx,4), %esi	C xp back to the start
	sarl	$UNROLL_LOG2, %ebx	C initial VAR_COUNTER

	movl	(%edx,%ebp,4), %ebp	C yp next multiplier
	movl	VAR_JMP, %ecx

L(unroll_outer_entry):
	C eax	xp low limb
	C ebx	initial VAR_COUNTER
	C ecx	VAR_JMP, the computed entry address
	C ebp	multiplier
	C
	C The low bit of the entry address decides which of ebx and ecx
	C receives the low half of the first product.  Only movl, cmov and jmp
	C follow the testb, so the carry flag is still clear at the jump target.

	mull	%ebp

	testb	$1, %cl		C and clear carry bit
	movl	%ebx, VAR_COUNTER
	movl	$0, %ebx

	movl	$0, %ecx
	cmovz(	%eax, %ecx)	C eax into low carry, zero into high carry limb
	cmovnz(	%eax, %ebx)

	C Extra fetch of VAR_JMP is bad, but registers are tight
	jmp	*VAR_JMP


C -----------------------------------------------------------------------------
	ALIGN(32)
L(unroll_top):
	C eax	xp limb
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	xp+8
	C edi	wp
	C ebp	yp multiplier limb
	C
	C VAR_COUNTER  loop counter, negative
	C
	C 17 bytes each limb

L(unroll_entry):

deflit(CHUNK_COUNT,2)
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 4))

Zdisp(	movl,	disp0,(%esi), %eax)
	adcl	%edx, %ebx

	mull	%ebp

Zdisp(	addl,	%ecx, disp0,(%edi))
	movl	$0, %ecx

	adcl	%eax, %ebx


	movl	disp1(%esi), %eax
	adcl	%edx, %ecx

	mull	%ebp

	addl	%ebx, disp1(%edi)
	movl	$0, %ebx

	adcl	%eax, %ecx
')


	C incl and leal both leave the carry flag alone, and leal leaves the
	C zero flag from the incl alone for the jnz

	incl	VAR_COUNTER
	leal	UNROLL_BYTES(%esi), %esi
	leal	UNROLL_BYTES(%edi), %edi

	jnz	L(unroll_top)


	C eax
	C ebx	zero
	C ecx	low
	C edx	high
	C esi
	C edi	wp, pointing at second last limb
	C ebp
	C
	C carry flag to be added to high

deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
deflit(`disp1', eval(disp0-0 + 4))

	movl	PARAM_YSIZE, %ebp
	adcl	$0, %edx
	addl	%ecx, disp0(%edi)

	adcl	$0, %edx
	incl	%ebp			C step the outer counter towards zero

	movl	%edx, disp1(%edi)	C new top limb of wp (flags untouched)
	jnz	L(unroll_outer_top)


	movl	SAVE_ESI, %esi
	movl	SAVE_EBP, %ebp

	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx
	addl	$FRAME, %esp		C drops the SAVE_ and VAR_ slots together

	ret

EPILOGUE()