beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / p6 / mul_basecase.asm
blobd87bc12b6062610056cafc8f56e9bb63edf7f84c
1 dnl Intel P6 mpn_mul_basecase -- multiply two mpn numbers.
3 dnl Copyright 1999-2003 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C P6: approx 6.5 cycles per cross product (16 limbs/loop unrolling).
37 dnl P6 UNROLL_COUNT cycles/product (approx)
38 dnl 8 7
39 dnl 16 6.5
40 dnl 32 6.4
41 dnl Maximum possible with the current code is 32.
43 deflit(UNROLL_COUNT, 16)
46 C void mpn_mul_basecase (mp_ptr wp,
47 C mp_srcptr xp, mp_size_t xsize,
48 C mp_srcptr yp, mp_size_t ysize);
50 C This routine is essentially the same as mpn/generic/mul_basecase.c, but
51 C it's faster because it does most of the mpn_addmul_1() startup
52 C calculations only once.
C UNROLL_THRESHOLD is the smallest xsize that is sent to the unrolled addmul
C code (compared with jae below); smaller sizes use the simple addmul loop.
C Both the PIC and non-PIC branches currently select the same value.
54 ifdef(`PIC',`
55 deflit(UNROLL_THRESHOLD, 5)
56 ',`
57 deflit(UNROLL_THRESHOLD, 5)
58 ')
C Stack offsets of the five arguments in the prototype above.
C NOTE(review): defframe is defined outside this file; presumably the offset
C is relative to the return address and tracks the current FRAME value --
C confirm in the shared x86 m4 definitions.
60 defframe(PARAM_YSIZE,20)
61 defframe(PARAM_YP, 16)
62 defframe(PARAM_XSIZE,12)
63 defframe(PARAM_XP, 8)
64 defframe(PARAM_WP, 4)
66 TEXT
67 ALIGN(16)
C Entry point.  Loads xsize, xp and yp[0], then dispatches on xsize: more
C than two limbs, exactly two limbs, or (fall through) a single limb.
69 PROLOGUE(mpn_mul_basecase)
70 deflit(`FRAME',0)
72 movl PARAM_XSIZE, %ecx
74 movl PARAM_YP, %eax
76 movl PARAM_XP, %edx
78 movl (%eax), %eax C yp[0]
79 cmpl $2, %ecx
80 ja L(xsize_more_than_two) C unsigned compare: xsize > 2
81 je L(two_by_something) C xsize == 2
84 C one limb by one limb
C Reached when xsize < 2.  NOTE(review): ysize is never examined on this
C path, so it presumably relies on the caller guaranteeing
C xsize >= ysize >= 1 -- confirm against the mpn_mul_basecase contract.
86 mull (%edx) C edx:eax = yp[0] * xp[0]
88 movl PARAM_WP, %ecx
89 movl %eax, (%ecx) C wp[0] = low limb of the product
90 movl %edx, 4(%ecx) C wp[1] = high limb of the product
91 ret
94 C -----------------------------------------------------------------------------
95 L(two_by_something):
C xsize == 2.  On entry eax = yp[0] and edx = xp (loaded at the function
C entry).  Handles ysize == 1 here and branches to L(two_by_two) otherwise.
C No stack frame is allocated: ebx and esi are saved in the xsize and ysize
C argument slots, which are not needed again once they have been tested.
96 deflit(`FRAME',0)
98 dnl re-use parameter space
99 define(SAVE_EBX, `PARAM_XSIZE')
100 define(SAVE_ESI, `PARAM_YSIZE')
102 movl %ebx, SAVE_EBX
103 cmpl $1, PARAM_YSIZE C must read ysize before SAVE_ESI overwrites it
104 movl %eax, %ecx C yp[0]
106 movl %esi, SAVE_ESI C save esi
107 movl PARAM_WP, %ebx
108 movl %edx, %esi C xp
110 movl (%edx), %eax C xp[0]
111 jne L(two_by_two) C flags still from the cmpl; movl leaves them alone
114 C two limbs by one limb
116 C eax xp[0]
117 C ebx wp
118 C ecx yp[0]
119 C edx
120 C esi xp
122 mull %ecx C edx:eax = xp[0] * yp[0]
124 movl %eax, (%ebx) C wp[0]
125 movl 4(%esi), %eax C xp[1]
126 movl %edx, %esi C carry
128 mull %ecx C edx:eax = xp[1] * yp[0]
130 addl %eax, %esi
132 movl %esi, 4(%ebx) C wp[1]
133 movl SAVE_ESI, %esi
135 adcl $0, %edx C carry from the addl; the movl's do not touch flags
137 movl %edx, 8(%ebx) C wp[2]
138 movl SAVE_EBX, %ebx
140 ret C two-by-one is complete; must not run on into L(two_by_two)
144 C -----------------------------------------------------------------------------
146 ALIGN(16)
147 L(two_by_two):
C xsize == 2 and ysize != 1: computes the four-limb product of xp[0..1] and
C yp[0..1] into wp[0..3] with four mull instructions.
148 C eax xp[0]
149 C ebx wp
150 C ecx yp[0]
151 C edx
152 C esi xp
153 C edi
154 C ebp
156 dnl more parameter space re-use
C edi is saved in the wp argument slot; wp itself is already held in ebx.
157 define(SAVE_EDI, `PARAM_WP')
159 mull %ecx C xp[0] * yp[0]
161 movl %edi, SAVE_EDI
162 movl %edx, %edi C carry, for wp[1]
164 movl %eax, (%ebx) C wp[0]
165 movl 4(%esi), %eax
167 mull %ecx C xp[1] * yp[0]
169 addl %eax, %edi
170 movl PARAM_YP, %ecx
172 adcl $0, %edx
173 movl 4(%ecx), %ecx C yp[1]
175 movl %edi, 4(%ebx) C wp[1], partial; xp[0]*yp[1] is added below
176 movl 4(%esi), %eax C xp[1]
177 movl %edx, %edi C carry, for wp[2]
179 mull %ecx C xp[1] * yp[1]
181 addl %eax, %edi
182 movl (%esi), %eax C xp[0]
184 adcl $0, %edx
185 movl %edx, %esi C carry, for wp[3]
187 mull %ecx C xp[0] * yp[1]
189 addl %eax, 4(%ebx) C wp[1] final
190 movl %esi, %eax
192 adcl %edx, %edi
193 movl SAVE_ESI, %esi
195 movl %edi, 8(%ebx) C wp[2]
197 adcl $0, %eax C carry out of wp[2]; flags survive the movl's
198 movl SAVE_EDI, %edi
200 movl %eax, 12(%ebx) C wp[3]
201 movl SAVE_EBX, %ebx
203 ret C two-by-two is complete; must not run on into L(xsize_more_than_two)
206 C -----------------------------------------------------------------------------
207 ALIGN(16)
208 L(xsize_more_than_two):
210 C The first limb of yp is processed with a simple mpn_mul_1 loop running at
211 C about 6.2 c/l. Unrolling this doesn't seem worthwhile since it's only run
212 C once (whereas the addmul_1 below is run ysize-1 many times). A call to
213 C mpn_mul_1 would be slowed down by the parameter pushing and popping etc,
214 C and doesn't seem likely to be worthwhile on the typical sizes reaching
215 C here from the Karatsuba code.
217 C eax yp[0]
218 C ebx
219 C ecx xsize
220 C edx xp
221 C esi
222 C edi
223 C ebp
C Local frame: four register save slots plus five variables used by the
C unrolled code, 36 bytes in total below the return address.
225 defframe(`SAVE_EBX', -4)
226 defframe(`SAVE_ESI', -8)
227 defframe(`SAVE_EDI', -12)
228 defframe(`SAVE_EBP', -16)
229 defframe(VAR_COUNTER, -20) dnl for use in the unroll case
230 defframe(VAR_ADJUST, -24)
231 defframe(VAR_JMP, -28)
232 defframe(VAR_SWAP, -32)
233 defframe(VAR_XP_LOW, -36)
234 deflit(STACK_SPACE, 36)
236 subl $STACK_SPACE, %esp
237 deflit(`FRAME',STACK_SPACE)
239 movl %edi, SAVE_EDI
240 movl PARAM_WP, %edi
242 movl %ebx, SAVE_EBX
244 movl %ebp, SAVE_EBP
245 movl %eax, %ebp C multiplier = yp[0]
247 movl %esi, SAVE_ESI
248 xorl %ebx, %ebx C initial carry = 0
249 leal (%edx,%ecx,4), %esi C xp end
251 leal (%edi,%ecx,4), %edi C wp end of mul1
252 negl %ecx C index runs from -xsize up to 0
255 L(mul1):
256 C eax scratch
257 C ebx carry
258 C ecx counter, negative
259 C edx scratch
260 C esi xp end
261 C edi wp end of mul1
262 C ebp multiplier
264 movl (%esi,%ecx,4), %eax
266 mull %ebp
268 addl %ebx, %eax
269 movl %eax, (%edi,%ecx,4)
270 movl $0, %ebx C movl rather than xorl: the carry flag from the addl must survive
272 adcl %edx, %ebx C new carry = high limb + carry out
273 incl %ecx
274 jnz L(mul1)
277 movl PARAM_YSIZE, %edx
279 movl %ebx, (%edi) C final carry
280 movl PARAM_XSIZE, %ecx
281 decl %edx C ysize-1 = number of addmul passes still to do
283 jz L(done) C if ysize==1
285 cmpl $UNROLL_THRESHOLD, %ecx
286 movl PARAM_YP, %eax
287 jae L(unroll) C xsize >= UNROLL_THRESHOLD; otherwise fall into the simple loop
290 C -----------------------------------------------------------------------------
291 C simple addmul looping
C Used when xsize < UNROLL_THRESHOLD.  For each remaining limb of yp the
C whole of xp is multiplied by that limb and added into wp, one limb further
C up each pass.  The xsize and ysize argument slots are re-used to hold the
C negated loop counters and the yp slot to hold the end of yp.
293 C eax yp
294 C ebx
295 C ecx xsize
296 C edx ysize-1
297 C esi xp end
298 C edi wp end of mul1
299 C ebp
301 leal 4(%eax,%edx,4), %ebp C yp end
302 negl %ecx
303 negl %edx
305 movl %edx, PARAM_YSIZE C -(ysize-1)
306 movl (%esi,%ecx,4), %eax C xp low limb
307 incl %ecx
309 movl %ecx, PARAM_XSIZE C -(xsize-1)
310 xorl %ebx, %ebx C initial carry
312 movl %ebp, PARAM_YP
313 movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier
314 jmp L(simple_outer_entry)
317 L(simple_outer_top):
318 C ebp ysize counter, negative
320 movl PARAM_YP, %edx
322 movl PARAM_XSIZE, %ecx C -(xsize-1)
323 xorl %ebx, %ebx C carry
325 movl %ebp, PARAM_YSIZE
326 addl $4, %edi C next position in wp
328 movl (%edx,%ebp,4), %ebp C yp limb - multiplier
330 movl -4(%esi,%ecx,4), %eax C xp low limb
333 L(simple_outer_entry):
335 L(simple_inner_top):
336 C eax xp limb
337 C ebx carry limb
338 C ecx loop counter (negative)
339 C edx scratch
340 C esi xp end
341 C edi wp end
342 C ebp multiplier
344 mull %ebp
346 addl %eax, %ebx
347 adcl $0, %edx
349 addl %ebx, (%edi,%ecx,4)
350 movl (%esi,%ecx,4), %eax C fetch the next xp limb
351 adcl $0, %edx C carry from the addl into wp; movl leaves flags alone
353 incl %ecx
354 movl %edx, %ebx
355 jnz L(simple_inner_top) C zero flag from the incl
358 C separate code for last limb so outer loop counter handling can be
359 C interleaved
361 mull %ebp
363 movl PARAM_YSIZE, %ebp
364 addl %eax, %ebx
366 adcl $0, %edx
368 addl %ebx, (%edi)
370 adcl $0, %edx
371 incl %ebp
373 movl %edx, 4(%edi) C top limb of this pass is stored, not added
374 jnz L(simple_outer_top) C zero flag from the incl of the ysize counter
C Common exit for the mul1-only case (ysize == 1) and the simple loop.
377 L(done):
378 movl SAVE_EBX, %ebx
380 movl SAVE_ESI, %esi
382 movl SAVE_EDI, %edi
384 movl SAVE_EBP, %ebp
385 addl $FRAME, %esp C FRAME is STACK_SPACE here
387 ret C must not run on into L(unroll)
391 C -----------------------------------------------------------------------------
393 C The unrolled loop is the same as in mpn_addmul_1, see that code for some
394 C comments.
396 C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
397 C increment xp and wp. This is used to adjust xp and wp, and is right-shifted
398 C to give an initial VAR_COUNTER at the top of the outer loop.
400 C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
401 C up to -1, inclusive.
403 C VAR_JMP is the computed jump into the unrolled loop.
405 C VAR_SWAP is 0 if xsize odd or 0xFFFFFFFF if xsize even, used to swap the
406 C initial ebx and ecx on entry to the unrolling.
408 C VAR_XP_LOW is the least significant limb of xp, which is needed at the
409 C start of the unrolled loop.
411 C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
412 C inclusive.
414 C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
415 C added to give the location of the next limb of yp, which is the multiplier
416 C in the unrolled loop.
418 C The trick with the VAR_ADJUST value means it's only necessary to do one
419 C fetch in the outer loop to take care of xp, wp and the inner loop counter.
422 L(unroll):
C Set-up for the unrolled addmul code, entered when ysize > 1 and
C xsize >= UNROLL_THRESHOLD.  Computes the pointer adjustment, the initial
C loop counter, the computed-jump target and the carry-swap mask once, then
C joins the outer loop at L(unroll_outer_entry).
C NOTE(review): UNROLL_MASK and UNROLL_LOG2 are not defined in this file; the
C comments below assume UNROLL_COUNT-1 and log2(UNROLL_COUNT) -- confirm in
C the shared x86 m4 definitions.
423 C eax yp
424 C ebx
425 C ecx xsize
426 C edx ysize-1
427 C esi xp end
428 C edi wp end of mul1
429 C ebp
431 movl PARAM_XP, %esi C back to the start of xp
433 movl 4(%eax), %ebp C multiplier (yp second limb)
434 leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing
436 movl %eax, PARAM_YP
437 movl PARAM_WP, %edi C back to the start of wp
438 negl %edx
440 movl %edx, PARAM_YSIZE C -(ysize-1), outer loop counter
441 leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1
442 decl %ecx C xsize-1
444 movl (%esi), %eax C xp low limb
445 andl $-UNROLL_MASK-1, %ebx C xsize-1 rounded up to a multiple of UNROLL_COUNT
446 negl %ecx C -(xsize-1)
448 negl %ebx
449 andl $UNROLL_MASK, %ecx C n = number of limbs skipped on entry to the unrolled code
451 movl %ebx, VAR_ADJUST
452 movl %ecx, %edx
453 shll $4, %ecx C 16*n
455 movl %eax, VAR_XP_LOW
456 sarl $UNROLL_LOG2, %ebx C initial VAR_COUNTER value, negative
457 negl %edx C -n, so that ecx+edx = 15*n
459 C 15 code bytes per limb
C ecx = address of L(unroll_inner_entry) plus 15*n.  PIC builds obtain the
C address from the return address pushed by the call; non-PIC builds use
C the absolute label.
460 ifdef(`PIC',`
461 call L(pic_calc)
462 L(unroll_here):
463 ',`
464 leal L(unroll_inner_entry) (%ecx,%edx,1), %ecx
465 ')
467 movl %ecx, VAR_JMP
468 movl %edx, %ecx C -n
469 shll $31, %edx
471 sarl $31, %edx C 0 or -1 as xsize odd or even
472 leal 4(%edi,%ecx,4), %edi C wp and xp, adjust for unrolling,
473 leal 4(%esi,%ecx,4), %esi C and start at second limb
475 movl %edx, VAR_SWAP
476 jmp L(unroll_outer_entry)
479 ifdef(`PIC',`
480 L(pic_calc):
481 C See mpn/x86/README about old gas bugs
482 leal (%ecx,%edx,1), %ecx C 15*n
483 addl $L(unroll_inner_entry)-L(unroll_here), %ecx
484 addl (%esp), %ecx C return address, which is L(unroll_here)
485 ret_internal
486 ')
489 C --------------------------------------------------------------------------
490 ALIGN(16)
491 L(unroll_outer_top):
C Start of one pass of the unrolled code: rewinds xp and wp by VAR_ADJUST
C limbs (wp ends up one limb further on than in the previous pass), fetches
C the next multiplier from yp, then multiplies the saved low limb of xp and
C jumps into the unrolled loop at the precomputed VAR_JMP address.
C NOTE(review): UNROLL_BYTES and UNROLL_LOG2 are defined outside this file.
492 C eax
493 C ebx
494 C ecx
495 C edx
496 C esi xp + offset
497 C edi wp + offset
498 C ebp ysize counter, negative
500 movl VAR_ADJUST, %ebx
501 movl PARAM_YP, %edx
503 movl VAR_XP_LOW, %eax
504 movl %ebp, PARAM_YSIZE C store incremented ysize counter
506 leal eval(UNROLL_BYTES + 4) (%edi,%ebx,4), %edi
507 leal (%esi,%ebx,4), %esi
508 sarl $UNROLL_LOG2, %ebx C unrolled loop counter for this pass, negative
510 movl (%edx,%ebp,4), %ebp C yp next multiplier
C First-pass entry point, jumped to from the set-up code with eax = xp low
C limb, ebx = initial loop counter and ebp = multiplier.
512 L(unroll_outer_entry):
513 mull %ebp C edx:eax = xp low limb * multiplier
515 movl %ebx, VAR_COUNTER
516 movl %edx, %ebx C carry high
517 movl %eax, %ecx C carry low
C Conditional swap of ebx and ecx without a branch: eax = (low XOR high)
C masked by VAR_SWAP, then XORed into both registers.
519 xorl %edx, %eax
520 movl VAR_JMP, %edx
522 andl VAR_SWAP, %eax
524 xorl %eax, %ebx C carries other way for odd index
525 xorl %eax, %ecx
527 jmp *%edx C enter the unrolled code at the computed offset
530 C -----------------------------------------------------------------------------
532 L(unroll_inner_top):
C Unrolled addmul loop.  Each forloop iteration below emits code for two
C limbs; the roles of ebx and ecx as carry low and carry high alternate
C from one limb to the next, as the per-line comments show.
C NOTE(review): Zdisp and UNROLL_BYTES are defined outside this file.  Zdisp
C presumably keeps a zero displacement at the same code size as the other
C limbs, which the computed jump needs -- confirm in the shared x86 m4
C definitions.
533 C eax xp limb
534 C ebx carry high
535 C ecx carry low
536 C edx scratch
537 C esi xp+8
538 C edi wp
539 C ebp yp multiplier limb
541 C VAR_COUNTER loop counter, negative
543 C 15 bytes each limb
545 addl $UNROLL_BYTES, %edi
547 L(unroll_inner_entry):
549 deflit(CHUNK_COUNT,2)
550 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
551 deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
552 deflit(`disp1', eval(disp0 + 4))
554 Zdisp( movl, disp0,(%esi), %eax)
555 mull %ebp
556 Zdisp( addl, %ecx, disp0,(%edi))
557 adcl %eax, %ebx C new carry low
558 movl %edx, %ecx
559 adcl $0, %ecx C new carry high
561 movl disp1(%esi), %eax
562 mull %ebp
563 addl %ebx, disp1(%edi)
564 adcl %eax, %ecx C new carry low
565 movl %edx, %ebx
566 adcl $0, %ebx C new carry high
567 ')
570 incl VAR_COUNTER
571 leal UNROLL_BYTES(%esi), %esi C leal rather than addl so the zero flag from the incl survives
572 jnz L(unroll_inner_top)
575 C eax
576 C ebx carry high
577 C ecx carry low
578 C edx
579 C esi
580 C edi wp (pointing at second last limb)
581 C ebp
583 deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
584 deflit(`disp1', eval(disp0 + 4))
586 movl PARAM_YSIZE, %ebp
587 addl %ecx, disp0(%edi) C carry low
589 adcl $0, %ebx
590 incl %ebp C next ysize counter value
592 movl %ebx, disp1(%edi) C carry high
593 jnz L(unroll_outer_top) C zero flag from the incl; movl leaves flags alone
C All limbs of yp processed: restore the saved registers and release the frame.
596 movl SAVE_ESI, %esi
598 movl SAVE_EBP, %ebp
600 movl SAVE_EDI, %edi
602 movl SAVE_EBX, %ebx
603 addl $FRAME, %esp
605 ret
607 EPILOGUE()