dnl  Intel P6 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
C product (measured on the speed difference between 20 and 40 limbs,
C which is the Karatsuba recursing range).


dnl  These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
dnl  a description.  The only difference here is that UNROLL_COUNT can go up
dnl  to 64 (not 63) making SQR_TOOM2_THRESHOLD_MAX 67.

deflit(SQR_TOOM2_THRESHOLD_MAX, 67)

ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')

m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
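dnl  For example, at the maximum threshold of 67 this gives the maximum
dnl  UNROLL_COUNT of 67-3 = 64.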


C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);

C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
C lot of function call overheads are avoided, especially when the given size
C is small.

C The code size might look a bit excessive, but not all of it is executed so
C it won't all get into the code cache.  The 1x1, 2x2 and 3x3 special cases
C clearly apply only to those sizes; mid sizes like 10x10 only need part of
C the unrolled addmul; and big sizes like 40x40 that do use the full
C unrolling will at least be making good use of it, because 40x40 will take
C something like 7000 cycles.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %edx

	movl	PARAM_SRC, %eax

	cmpl	$2, %edx
	movl	PARAM_DST, %ecx
	je	L(two_limbs)

	movl	(%eax), %eax
	ja	L(three_or_more)


C -----------------------------------------------------------------------------
C one limb only
	C eax	src limb
	C ebx
	C ecx	dst
	C edx

	mull	%eax

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)

	ret


C -----------------------------------------------------------------------------
L(two_limbs):
	C eax	src
	C ebx
	C ecx	dst
	C edx

defframe(SAVE_ESI, -4)
defframe(SAVE_EBX, -8)
defframe(SAVE_EDI, -12)
defframe(SAVE_EBP, -16)
deflit(`STACK_SPACE',16)

	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	%eax, %esi
	movl	(%eax), %eax

	mull	%eax		C src[0]^2

	movl	%eax, (%ecx)	C dst[0]
	movl	4(%esi), %eax

	movl	%ebx, SAVE_EBX
	movl	%edx, %ebx	C dst[1]

	mull	%eax		C src[1]^2

	movl	%edi, SAVE_EDI
	movl	%eax, %edi	C dst[2]
	movl	(%esi), %eax

	movl	%ebp, SAVE_EBP
	movl	%edx, %ebp	C dst[3]

	mull	4(%esi)		C src[0]*src[1]

	addl	%eax, %ebx
	movl	SAVE_ESI, %esi

	adcl	%edx, %edi

	adcl	$0, %ebp
	addl	%ebx, %eax
	movl	SAVE_EBX, %ebx

	adcl	%edi, %edx
	movl	SAVE_EDI, %edi

	adcl	$0, %ebp

	movl	%eax, 4(%ecx)

	movl	%ebp, 12(%ecx)
	movl	SAVE_EBP, %ebp

	movl	%edx, 8(%ecx)
	addl	$FRAME, %esp

	ret


C -----------------------------------------------------------------------------
L(three_or_more):
	C eax	src low limb
	C ebx
	C ecx	dst
	C edx	size
deflit(`FRAME',0)

	pushl	%esi	defframe_pushl(`SAVE_ESI')
	cmpl	$4, %edx

	movl	PARAM_SRC, %esi
	jae	L(four_or_more)


C -----------------------------------------------------------------------------
C three limbs

	C eax	src low limb
	C ebx
	C ecx	dst
	C edx
	C esi	src
	C edi
	C ebp

	pushl	%ebp	defframe_pushl(`SAVE_EBP')
	pushl	%edi	defframe_pushl(`SAVE_EDI')

	mull	%eax		C src[0] ^ 2

	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)

	movl	4(%esi), %eax
	xorl	%ebp, %ebp

	mull	%eax		C src[1] ^ 2

	movl	%eax, 8(%ecx)
	movl	%edx, 12(%ecx)
	movl	8(%esi), %eax

	pushl	%ebx	defframe_pushl(`SAVE_EBX')

	mull	%eax		C src[2] ^ 2

	movl	%eax, 16(%ecx)
	movl	%edx, 20(%ecx)

	movl	(%esi), %eax

	mull	4(%esi)		C src[0] * src[1]

	movl	%eax, %ebx
	movl	%edx, %edi

	movl	(%esi), %eax

	mull	8(%esi)		C src[0] * src[2]

	addl	%eax, %edi
	movl	%edx, %ebp

	adcl	$0, %ebp
	movl	4(%esi), %eax

	mull	8(%esi)		C src[1] * src[2]

	xorl	%esi, %esi
	addl	%eax, %ebp

	C eax
	C ebx	dst[1]
	C ecx	dst
	C edx	dst[4]
	C esi	zero, will be dst[5]
	C edi	dst[2]
	C ebp	dst[3]

	adcl	$0, %edx
	addl	%ebx, %ebx

	adcl	%edi, %edi

	adcl	%ebp, %ebp

	adcl	%edx, %edx
	movl	4(%ecx), %eax

	adcl	$0, %esi
	addl	%ebx, %eax

	movl	%eax, 4(%ecx)
	movl	8(%ecx), %eax

	adcl	%edi, %eax
	movl	12(%ecx), %ebx

	adcl	%ebp, %ebx
	movl	16(%ecx), %edi

	movl	%eax, 8(%ecx)
	movl	SAVE_EBP, %ebp

	movl	%ebx, 12(%ecx)
	movl	SAVE_EBX, %ebx

	adcl	%edx, %edi
	movl	20(%ecx), %eax

	movl	%edi, 16(%ecx)
	movl	SAVE_EDI, %edi

	adcl	%esi, %eax	C no carry out of this
	movl	SAVE_ESI, %esi

	movl	%eax, 20(%ecx)
	addl	$FRAME, %esp

	ret


C -----------------------------------------------------------------------------
defframe(VAR_COUNTER,-20)
defframe(VAR_JMP, -24)
deflit(`STACK_SPACE',24)

L(four_or_more):
	C eax	src low limb
	C ebx
	C ecx
	C edx	size
	C esi	src
	C edi
	C ebp
deflit(`FRAME',4)	dnl  %esi already pushed

C First multiply src[0]*src[1..size-1] and store at dst[1..size].
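C
C In C terms this first pass is roughly
C
C	dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
C
C with the counter run from -(size-1) up to -1 so that the loop can index
C off the fixed pointers &src[size] and &dst[size-1].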

	subl	$STACK_SPACE-FRAME, %esp
deflit(`FRAME',STACK_SPACE)
	movl	$1, %ecx

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX
	subl	%edx, %ecx		C -(size-1)

	movl	%ebp, SAVE_EBP
	movl	$0, %ebx		C initial carry

	leal	(%esi,%edx,4), %esi	C &src[size]
	movl	%eax, %ebp		C multiplier

	leal	-4(%edi,%edx,4), %edi	C &dst[size-1]


C This loop runs at just over 6 c/l.

L(mul_1):
	C eax	scratch
	C ebx	carry
	C ecx	counter, limbs, negative, -(size-1) to -1
	C edx	scratch
	C esi	&src[size]
	C edi	&dst[size-1]
	C ebp	multiplier

	movl	%ebp, %eax

	mull	(%esi,%ecx,4)

	addl	%ebx, %eax
	movl	$0, %ebx

	adcl	%edx, %ebx
	movl	%eax, 4(%edi,%ecx,4)

	incl	%ecx
	jnz	L(mul_1)


	movl	%ebx, 4(%edi)


C Addmul src[n]*src[n+1..size-1] at dst[2*n+1...], for each n=1..size-2.

C The last two addmuls, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.

C The unrolled code is the same as mpn_addmul_1(), see that routine for some
C comments.

C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.

C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.

dnl  This is also hard-coded in the address calculation below.
deflit(CODE_BYTES_PER_LIMB, 15)

dnl  With &src[size] and &dst[size-1] pointers, the displacements in the
dnl  unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
dnl  that an offset must be added to them.
deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>32),1,
eval((UNROLL_COUNT-32)*4),
0))
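
dnl  For example UNROLL_COUNT of 64 gives OFFSET of (64-32)*4 = 128, and the
dnl  code below biases %esi and %edi down by that amount so the single byte
dnl  displacements still reach the furthest limbs.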

	C eax
	C ebx	carry
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[size-1]
	C ebp

	movl	PARAM_SIZE, %ecx

	subl	$4, %ecx
	jz	L(corner)

	movl	%ecx, %edx
	negl	%ecx

	shll	$4, %ecx
ifelse(OFFSET,0,,`subl	$OFFSET, %esi')

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
')
	negl	%edx

ifelse(OFFSET,0,,`subl	$OFFSET, %edi')

	C The calculated jump mustn't be before the start of the available
	C code.  This is the limit that UNROLL_COUNT puts on the src operand
	C size, but checked here using the jump address directly.

	ASSERT(ae,
	`movl_text_address( L(unroll_inner_start), %eax)
	cmpl	%eax, %ecx')
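
	C With CODE_BYTES_PER_LIMB of 15, the jump taken on the first outer
	C pass (after the further CODE_BYTES_PER_LIMB added at the top of the
	C loop) works out to
	C
	C	L(unroll_inner_end) - (size-3)*CODE_BYTES_PER_LIMB
	C
	C i.e. size-3 chunks are executed, and each later pass starts one
	C chunk further in, matching the shrinking addmul lengths.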


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll_outer_top):
	C eax
	C ebx	high limb to store
	C ecx	VAR_JMP
	C edx	VAR_COUNTER, limbs, negative
	C esi	&src[size], constant
	C edi	dst ptr, second highest limb of last addmul
	C ebp

	movl	-12+OFFSET(%esi,%edx,4), %ebp	C multiplier
	movl	%edx, VAR_COUNTER

	movl	-8+OFFSET(%esi,%edx,4), %eax	C first limb of multiplicand

	mull	%ebp

define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')

	testb	$1, %cl

	movl	%edx, %ebx	C high carry
	leal	4(%edi), %edi

	movl	%ecx, %edx	C jump

	movl	%eax, %ecx	C low carry
	leal	CODE_BYTES_PER_LIMB(%edx), %edx

	cmovX(	%ebx, %ecx)	C high carry reverse
	cmovX(	%eax, %ebx)	C low carry reverse
	movl	%edx, VAR_JMP
	jmp	*%edx


	C Must be on an even address here so the low bit of the jump address
	C will indicate which way around ecx/ebx should start.
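
	C Since CODE_BYTES_PER_LIMB is odd, the low bit of the entry address
	C changes with each extra chunk, and the cmovX moves above use that
	C bit (tested from %cl before the CODE_BYTES_PER_LIMB step is added)
	C to choose whether the low/high carries start in ecx/ebx or the
	C reverse.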

	ALIGN(2)

L(unroll_inner_start):
	C eax	scratch
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	src pointer
	C edi	dst pointer
	C ebp	multiplier

	C 15 code bytes each limb
	C ecx/ebx reversed on each chunk

forloop(`i', UNROLL_COUNT, 1, `
deflit(`disp_src', eval(-i*4 + OFFSET))
deflit(`disp_dst', eval(disp_src))

m4_assert(`disp_src>=-128 && disp_src<128')
m4_assert(`disp_dst>=-128 && disp_dst<128')

ifelse(eval(i%2),0,`
	Zdisp(	movl,	disp_src,(%esi), %eax)
	mull	%ebp
	Zdisp(	addl,	%ebx, disp_dst,(%edi))
	adcl	%eax, %ecx
	movl	%edx, %ebx
	adcl	$0, %ebx
',`
	dnl  this one comes out last
	Zdisp(	movl,	disp_src,(%esi), %eax)
	mull	%ebp
	Zdisp(	addl,	%ecx, disp_dst,(%edi))
	adcl	%eax, %ebx
	movl	%edx, %ecx
	adcl	$0, %ecx
')
')
L(unroll_inner_end):

	addl	%ebx, m4_empty_if_zero(OFFSET)(%edi)

	movl	VAR_COUNTER, %edx
	adcl	$0, %ecx

	movl	%ecx, m4_empty_if_zero(OFFSET+4)(%edi)
	movl	VAR_JMP, %ecx

	incl	%edx
	jnz	L(unroll_outer_top)


ifelse(OFFSET,0,,`
	addl	$OFFSET, %esi
	addl	$OFFSET, %edi
')


C -----------------------------------------------------------------------------
	ALIGN(16)
L(corner):
	C eax
	C ebx
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[2*size-5]
	C ebp

	movl	-12(%esi), %eax

	mull	-8(%esi)

	addl	%eax, (%edi)
	movl	-12(%esi), %eax
	movl	$0, %ebx

	adcl	%edx, %ebx

	mull	-4(%esi)

	addl	%eax, %ebx
	movl	-8(%esi), %eax

	adcl	$0, %edx

	addl	%ebx, 4(%edi)
	movl	$0, %ebx

	adcl	%edx, %ebx

	mull	-4(%esi)

	movl	PARAM_SIZE, %ecx
	addl	%ebx, %eax

	adcl	$0, %edx

	movl	%eax, 8(%edi)

	movl	%edx, 12(%edi)
	movl	PARAM_DST, %edi


C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
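
C In C terms this step is roughly
C
C	dst[2*size-1] = mpn_lshift (dst+1, dst+1, 2*size-2, 1);
C
C done here with rcl so the bit shifted out of each limb pair is carried
C into the next through the carry flag.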

	subl	$1, %ecx		C size-1
	xorl	%eax, %eax		C ready for final adcl, and clear carry

	movl	%ecx, %edx
	movl	PARAM_SRC, %esi


L(lshift):
	C eax
	C ebx
	C ecx	counter, size-1 to 1
	C edx	size-1 (for later use)
	C esi	src (for later use)
	C edi	dst, incrementing
	C ebp

	rcll	4(%edi)
	rcll	8(%edi)

	leal	8(%edi), %edi
	decl	%ecx
	jnz	L(lshift)


	adcl	%eax, %eax

	movl	%eax, 4(%edi)		C dst most significant limb
	movl	(%esi), %eax		C src[0]

	leal	4(%esi,%edx,4), %esi	C &src[size]
	subl	%edx, %ecx		C -(size-1)


C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
C low limb of src[0]^2.
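
C Expressed as a recurrence, the store below and each L(diag) iteration do,
C roughly, with all additions propagating the hardware carry,
C
C	dst[0]     = low(src[0]^2)      carry = high(src[0]^2)
C	dst[2*i-1] += carry
C	dst[2*i]   += low(src[i]^2)     carry = high(src[i]^2)     i=1..size-1
C
C and the final carry is added into dst[2*size-1] after the loop.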

	mull	%eax

	movl	%eax, (%edi,%ecx,8)	C dst[0]


L(diag):
	C eax	scratch
	C ebx	scratch
	C ecx	counter, negative
	C edx	carry
	C esi	&src[size]
	C edi	dst[2*size-2]
	C ebp

	movl	(%esi,%ecx,4), %eax
	movl	%edx, %ebx

	mull	%eax

	addl	%ebx, 4(%edi,%ecx,8)
	adcl	%eax, 8(%edi,%ecx,8)
	adcl	$0, %edx

	incl	%ecx
	jnz	L(diag)


	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	addl	%edx, 4(%edi)		C dst most significant limb

	movl	SAVE_EDI, %edi
	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp

	ret


C -----------------------------------------------------------------------------
ifdef(`PIC',`
L(pic_calc):
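	C The return address pushed by the call is the address of L(here), so
	C adding it, the link-time constant below and %edx gives the same
	C entry point as the non-PIC leal above computes directly.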
	addl	(%esp), %ecx
	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
	addl	%edx, %ecx
	ret_internal
')

EPILOGUE()