beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k6 / sqr_basecase.asm
blob b7ecb5cc8a1750e336ae2aa875145d26e5a28ecb
1 dnl AMD K6 mpn_sqr_basecase -- square an mpn number.
3 dnl Copyright 1999-2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
dnl NOTE(review): this file is a web extract of the upstream GMP source
dnl mpn/x86/k6/sqr_basecase.asm; each line carries a fused copy of its
dnl original line number, and gaps in that numbering indicate lines lost
dnl in extraction.  Restore the missing lines from upstream before
dnl attempting to assemble.
34 C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
35 C product (measured on the speed difference between 17 and 33 limbs,
36 C which is roughly the Karatsuba recursing range).
39 dnl SQR_TOOM2_THRESHOLD_MAX is the maximum SQR_TOOM2_THRESHOLD this
40 dnl code supports. This value is used only by the tune program to know
41 dnl what it can go up to. (An attempt to compile with a bigger value will
42 dnl trigger some m4_assert()s in the code, making the build fail.)
43 dnl
44 dnl The value is determined by requiring the displacements in the unrolled
45 dnl addmul to fit in single bytes. This means a maximum UNROLL_COUNT of
46 dnl 63, giving a maximum SQR_TOOM2_THRESHOLD of 66.
48 deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
51 dnl Allow a value from the tune program to override config.m4.
53 ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
54 `define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
57 dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul. The
58 dnl number required is determined by SQR_TOOM2_THRESHOLD, since
59 dnl mpn_sqr_basecase only needs to handle sizes < SQR_TOOM2_THRESHOLD.
60 dnl
61 dnl The first addmul is the biggest, and this takes the second least
62 dnl significant limb and multiplies it by the third least significant and
63 dnl up. Hence for a maximum operand size of SQR_TOOM2_THRESHOLD-1
64 dnl limbs, UNROLL_COUNT needs to be SQR_TOOM2_THRESHOLD-3.
66 m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
67 deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
70 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
72 C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
73 C lot of function call overheads are avoided, especially when the given size
74 C is small.
76 C The code size might look a bit excessive, but not all of it is executed
77 C and so won't fill up the code cache. The 1x1, 2x2 and 3x3 special cases
78 C clearly apply only to those sizes; mid sizes like 10x10 only need part of
79 C the unrolled addmul; and big sizes like 35x35 that do need all of it will
80 C at least be getting value for money, because 35x35 spends something like
81 C 5780 cycles here.
83 C Different values of UNROLL_COUNT give slightly different speeds, between
dnl NOTE(review): the next line lost its leading `C' comment marker in
dnl extraction; it is the continuation of the comment above.
84 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
85 C This isn't a big difference, but it's presumably some alignment effect
86 C which if understood could give a simple speedup.
dnl Stack offsets of the cdecl parameters relative to the entry %esp
dnl (32-bit x86: dst at 4(%esp), src at 8(%esp), size at 12(%esp)).
88 defframe(PARAM_SIZE,12)
89 defframe(PARAM_SRC, 8)
90 defframe(PARAM_DST, 4)
92 TEXT
93 ALIGN(32)
94 PROLOGUE(mpn_sqr_basecase)
95 deflit(`FRAME',0)
C Dispatch on size: 1, 2, 3, or 4-and-up each get their own code path.
97 movl PARAM_SIZE, %ecx
98 movl PARAM_SRC, %eax
100 cmpl $2, %ecx
101 je L(two_limbs)
103 movl PARAM_DST, %edx
104 ja L(three_or_more)
107 C -----------------------------------------------------------------------------
108 C one limb only
109 C eax src
110 C ebx
111 C ecx size
112 C edx dst
114 movl (%eax), %eax
115 movl %edx, %ecx
117 mull %eax
C 32x32->64 square of the single limb: low half in eax, high in edx
119 movl %eax, (%ecx)
120 movl %edx, 4(%ecx)
C NOTE(review): the original "ret" ending the one-limb case is missing
C from this extract (fused numbering jumps 120 -> 124); as shown the
C code would fall through into L(two_limbs).
124 C -----------------------------------------------------------------------------
125 ALIGN(16)
126 L(two_limbs):
127 C eax src
128 C ebx
129 C ecx size
130 C edx dst
132 pushl %ebx
133 movl %eax, %ebx C src
134 deflit(`FRAME',4)
136 movl (%ebx), %eax
137 movl PARAM_DST, %ecx
139 mull %eax C src[0]^2
141 movl %eax, (%ecx)
142 movl 4(%ebx), %eax
144 movl %edx, 4(%ecx)
146 mull %eax C src[1]^2
148 movl %eax, 8(%ecx)
149 movl (%ebx), %eax
151 movl %edx, 12(%ecx)
152 movl 4(%ebx), %edx
154 mull %edx C src[0]*src[1]
C The cross product is added in twice, giving the 2*src[0]*src[1]
C middle term of the square.
156 addl %eax, 4(%ecx)
158 adcl %edx, 8(%ecx)
159 adcl $0, 12(%ecx)
161 popl %ebx
162 addl %eax, 4(%ecx)
164 adcl %edx, 8(%ecx)
165 adcl $0, 12(%ecx)
C NOTE(review): the original ASSERT(nc) and "ret" ending the two-limb
C case are missing from this extract (numbering jumps 165 -> 170).
170 C -----------------------------------------------------------------------------
171 L(three_or_more):
172 deflit(`FRAME',0)
173 cmpl $4, %ecx
174 jae L(four_or_more)
177 C -----------------------------------------------------------------------------
178 C three limbs
179 C eax src
180 C ecx size
181 C edx dst
C Squares go straight to dst; the three cross products are collected in
C registers, doubled, then added in.
183 pushl %ebx
184 movl %eax, %ebx C src
186 movl (%ebx), %eax
187 movl %edx, %ecx C dst
189 mull %eax C src[0] ^ 2
191 movl %eax, (%ecx)
192 movl 4(%ebx), %eax
194 movl %edx, 4(%ecx)
195 pushl %esi
197 mull %eax C src[1] ^ 2
199 movl %eax, 8(%ecx)
200 movl 8(%ebx), %eax
202 movl %edx, 12(%ecx)
203 pushl %edi
205 mull %eax C src[2] ^ 2
207 movl %eax, 16(%ecx)
208 movl (%ebx), %eax
210 movl %edx, 20(%ecx)
211 movl 4(%ebx), %edx
213 mull %edx C src[0] * src[1]
215 movl %eax, %esi
216 movl (%ebx), %eax
218 movl %edx, %edi
219 movl 8(%ebx), %edx
221 pushl %ebp
222 xorl %ebp, %ebp
224 mull %edx C src[0] * src[2]
226 addl %eax, %edi
227 movl 4(%ebx), %eax
229 adcl %edx, %ebp
231 movl 8(%ebx), %edx
233 mull %edx C src[1] * src[2]
235 addl %eax, %ebp
237 adcl $0, %edx
240 C eax will be dst[5]
241 C ebx
242 C ecx dst
243 C edx dst[4]
244 C esi dst[1]
245 C edi dst[2]
246 C ebp dst[3]
C Double the accumulated cross products (the 2*src[i]*src[j] terms of
C the square), catching the bit shifted out of the top in eax.
248 xorl %eax, %eax
249 addl %esi, %esi
250 adcl %edi, %edi
251 adcl %ebp, %ebp
252 adcl %edx, %edx
253 adcl $0, %eax
C Add the doubled cross products onto the squares already stored at dst,
C interleaving the callee-saved register pops.
255 addl %esi, 4(%ecx)
256 adcl %edi, 8(%ecx)
257 adcl %ebp, 12(%ecx)
259 popl %ebp
260 popl %edi
262 adcl %edx, 16(%ecx)
264 popl %esi
265 popl %ebx
267 adcl %eax, 20(%ecx)
268 ASSERT(nc)
C NOTE(review): the original "ret" ending the three-limb case is missing
C from this extract (numbering jumps 268 -> 273).
273 C -----------------------------------------------------------------------------
C Locals for the general case, below the saved registers on the stack.
275 defframe(SAVE_EBX, -4)
276 defframe(SAVE_ESI, -8)
277 defframe(SAVE_EDI, -12)
278 defframe(SAVE_EBP, -16)
279 defframe(VAR_COUNTER,-20)
280 defframe(VAR_JMP, -24)
281 deflit(STACK_SPACE, 24)
283 ALIGN(16)
284 L(four_or_more):
286 C eax src
287 C ebx
288 C ecx size
289 C edx dst
290 C esi
291 C edi
292 C ebp
294 C First multiply src[0]*src[1..size-1] and store at dst[1..size].
296 C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
297 C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
298 C a 5780 cycle operation, which is not surprising since the loop here is 8
299 C c/l and mpn_mul_1 is 6.25 c/l.
301 subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
303 movl %edi, SAVE_EDI
304 leal 4(%edx), %edi
306 movl %ebx, SAVE_EBX
307 leal 4(%eax), %ebx
309 movl %esi, SAVE_ESI
310 xorl %esi, %esi
312 movl %ebp, SAVE_EBP
314 C eax
315 C ebx src+4
316 C ecx size
317 C edx
318 C esi
319 C edi dst+4
320 C ebp
322 movl (%eax), %ebp C multiplier
323 leal -1(%ecx), %ecx C size-1, and pad to a 16 byte boundary
326 ALIGN(16)
327 L(mul_1):
328 C eax scratch
329 C ebx src ptr
330 C ecx counter
331 C edx scratch
332 C esi carry
333 C edi dst ptr
334 C ebp multiplier
336 movl (%ebx), %eax
337 addl $4, %ebx
339 mull %ebp
341 addl %esi, %eax
342 movl $0, %esi
344 adcl %edx, %esi
346 movl %eax, (%edi)
347 addl $4, %edi
C "loop" is a deliberate choice for the K6 this file targets; it is
C cheap there, unlike on most Intel cores.
349 loop L(mul_1)
352 C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
354 C The last two addmuls, which are the bottom right corner of the product
355 C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
356 C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
357 C cases that need to be done.
359 C The unrolled code is the same as mpn_addmul_1(), see that routine for some
360 C comments.
362 C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
364 C VAR_JMP is the computed jump into the unrolled code, stepped by one code
365 C chunk each outer loop.
367 C K6 doesn't do any branch prediction on indirect jumps, which is good
368 C actually because it's a different target each time. The unrolled addmul
369 C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
370 C the indirect jump is quickly recovered.
373 dnl This value is also implicitly encoded in a shift and add.
375 deflit(CODE_BYTES_PER_LIMB, 15)
377 dnl With the unmodified &src[size] and &dst[size] pointers, the
378 dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
379 dnl values up to 31. Above that an offset must be added to them.
dnl NOTE(review): the deflit(OFFSET,...) below is truncated in this
dnl extract -- the closing arguments supplying the OFFSET=0 default
dnl (original line 384) are missing, leaving the call unbalanced.
381 deflit(OFFSET,
382 ifelse(eval(UNROLL_COUNT>31),1,
383 eval((UNROLL_COUNT-31)*4),
386 C eax
387 C ebx &src[size]
388 C ecx
389 C edx
390 C esi carry
391 C edi &dst[size]
392 C ebp
394 movl PARAM_SIZE, %ecx
395 movl %esi, (%edi)
397 subl $4, %ecx
398 jz L(corner)
400 movl %ecx, %edx
401 ifelse(OFFSET,0,,
402 ` subl $OFFSET, %ebx')
404 shll $4, %ecx
405 ifelse(OFFSET,0,,
406 ` subl $OFFSET, %edi')
408 negl %ecx
C NOTE(review): the ifdef(`PIC') construct below and the following
C ASSERT are truncated in this extract -- the branch separator, the
C non-PIC/PIC alternative lines and the closing quotes (original lines
C 413, 415, 426-427) are missing, leaving the m4 quoting unbalanced.
410 ifdef(`PIC',`
411 call L(pic_calc)
412 L(here):
414 leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
416 negl %edx
419 C The calculated jump mustn't be before the start of the available
420 C code. This is the limitation UNROLL_COUNT puts on the src operand
421 C size, but checked here using the jump address directly.
423 ASSERT(ae,`
424 movl_text_address( L(unroll_inner_start), %eax)
425 cmpl %eax, %ecx
429 C -----------------------------------------------------------------------------
430 ALIGN(16)
431 L(unroll_outer_top):
432 C eax
433 C ebx &src[size], constant
434 C ecx VAR_JMP
435 C edx VAR_COUNTER, limbs, negative
436 C esi high limb to store
437 C edi dst ptr, high of last addmul
438 C ebp
440 movl -12+OFFSET(%ebx,%edx,4), %ebp C multiplier
441 movl %edx, VAR_COUNTER
443 movl -8+OFFSET(%ebx,%edx,4), %eax C first limb of multiplicand
445 mull %ebp
C The low bit of the computed jump address tells which of ecx/esi must
C start as the low carry, since the unrolled chunks alternate the roles.
447 testb $1, %cl
449 movl %edx, %esi C high carry
450 movl %ecx, %edx C jump
452 movl %eax, %ecx C low carry
453 leal CODE_BYTES_PER_LIMB(%edx), %edx
455 movl %edx, VAR_JMP
456 leal 4(%edi), %edi
458 C A branch-free version of this using some xors was found to be a
459 C touch slower than just a conditional jump, despite the jump
460 C switching between taken and not taken on every loop.
462 ifelse(eval(UNROLL_COUNT%2),0,
463 jz,jnz) L(unroll_noswap)
464 movl %esi, %eax C high,low carry other way around
466 movl %ecx, %esi
467 movl %eax, %ecx
468 L(unroll_noswap):
470 jmp *%edx
473 C Must be on an even address here so the low bit of the jump address
474 C will indicate which way around ecx/esi should start.
476 C An attempt was made at padding here to get the end of the unrolled
477 C code to come out on a good alignment, to save padding before
478 C L(corner). This worked, but turned out to run slower than just an
479 C ALIGN(2). The reason for this is not clear, it might be related
480 C to the different speeds on different UNROLL_COUNTs noted above.
482 ALIGN(2)
484 L(unroll_inner_start):
485 C eax scratch
486 C ebx src
487 C ecx carry low
488 C edx scratch
489 C esi carry high
490 C edi dst
491 C ebp multiplier
493 C 15 code bytes each limb
494 C ecx/esi swapped on each chunk
C NOTE(review): the forloop body below is truncated in this extract --
C the ifelse branch separator and the closing quotes (original lines
C 510 and 518-519) are missing, leaving the m4 quoting unbalanced.
496 forloop(`i', UNROLL_COUNT, 1, `
497 deflit(`disp_src', eval(-i*4 + OFFSET))
498 deflit(`disp_dst', eval(disp_src - 4))
500 m4_assert(`disp_src>=-128 && disp_src<128')
501 m4_assert(`disp_dst>=-128 && disp_dst<128')
503 ifelse(eval(i%2),0,`
504 Zdisp( movl, disp_src,(%ebx), %eax)
505 mull %ebp
506 Zdisp( addl, %esi, disp_dst,(%edi))
507 adcl %eax, %ecx
508 movl %edx, %esi
509 jadcl0( %esi)
511 dnl this one comes out last
512 Zdisp( movl, disp_src,(%ebx), %eax)
513 mull %ebp
514 Zdisp( addl, %ecx, disp_dst,(%edi))
515 adcl %eax, %esi
516 movl %edx, %ecx
517 jadcl0( %ecx)
520 L(unroll_inner_end):
522 addl %esi, -4+OFFSET(%edi)
524 movl VAR_COUNTER, %edx
525 jadcl0( %ecx)
527 movl %ecx, m4_empty_if_zero(OFFSET)(%edi)
528 movl VAR_JMP, %ecx
530 incl %edx
531 jnz L(unroll_outer_top)
C NOTE(review): the closing quote of the ifelse below (original line
C 537) is missing from this extract.
534 ifelse(OFFSET,0,,`
535 addl $OFFSET, %ebx
536 addl $OFFSET, %edi
540 C -----------------------------------------------------------------------------
541 ALIGN(16)
542 L(corner):
543 C ebx &src[size]
544 C edi &dst[2*size-5]
C The two corner addmuls deferred from the main triangle:
C src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
546 movl -12(%ebx), %ebp
548 movl -8(%ebx), %eax
549 movl %eax, %ecx
551 mull %ebp
553 addl %eax, -4(%edi)
554 adcl $0, %edx
556 movl -4(%ebx), %eax
557 movl %edx, %esi
558 movl %eax, %ebx
560 mull %ebp
562 addl %esi, %eax
563 adcl $0, %edx
565 addl %eax, (%edi)
566 adcl $0, %edx
568 movl %edx, %esi
569 movl %ebx, %eax
571 mull %ecx
573 addl %esi, %eax
574 movl %eax, 4(%edi)
576 adcl $0, %edx
578 movl %edx, 8(%edi)
581 C -----------------------------------------------------------------------------
582 C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
583 C The loop measures about 6 cycles/iteration, though it looks like it should
584 C decode in 5.
586 L(lshift_start):
587 movl PARAM_SIZE, %ecx
589 movl PARAM_DST, %edi
590 subl $1, %ecx C size-1 and clear carry
592 movl PARAM_SRC, %ebx
593 movl %ecx, %edx
595 xorl %eax, %eax C ready for adcl
598 ALIGN(16)
599 L(lshift):
600 C eax
601 C ebx src (for later use)
602 C ecx counter, decrementing
603 C edx size-1 (for later use)
604 C esi
605 C edi dst, incrementing
606 C ebp
C Rotate-through-carry doubles the cross-product sum in place, two
C limbs per iteration (2*(size-1) limbs total).
608 rcll 4(%edi)
609 rcll 8(%edi)
610 leal 8(%edi), %edi
611 loop L(lshift)
614 adcl %eax, %eax
616 movl %eax, 4(%edi) C dst most significant limb
617 movl (%ebx), %eax C src[0]
619 leal 4(%ebx,%edx,4), %ebx C &src[size]
620 subl %edx, %ecx C -(size-1)
623 C -----------------------------------------------------------------------------
624 C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
625 C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
626 C low limb of src[0]^2.
629 mull %eax
631 movl %eax, (%edi,%ecx,8) C dst[0]
634 ALIGN(16)
635 L(diag):
636 C eax scratch
637 C ebx &src[size]
638 C ecx counter, negative
639 C edx carry
640 C esi scratch
641 C edi dst[2*size-2]
642 C ebp
C edx carries the high limb of the previous square into the next pair.
644 movl (%ebx,%ecx,4), %eax
645 movl %edx, %esi
647 mull %eax
649 addl %esi, 4(%edi,%ecx,8)
650 adcl %eax, 8(%edi,%ecx,8)
651 adcl $0, %edx
653 incl %ecx
654 jnz L(diag)
C Restore callee-saved registers and the stack frame.
657 movl SAVE_EBX, %ebx
658 movl SAVE_ESI, %esi
660 addl %edx, 4(%edi) C dst most significant limb
662 movl SAVE_EDI, %edi
663 movl SAVE_EBP, %ebp
664 addl $FRAME, %esp
C NOTE(review): the original "ret" here is missing from this extract
C (numbering jumps 664 -> 669).
669 C -----------------------------------------------------------------------------
C PIC helper: computes the jump address into the unrolled code from the
C return address on the stack.  NOTE(review): the closing `') of this
C ifdef (original line 677) is missing from this extract.
670 ifdef(`PIC',`
671 L(pic_calc):
672 C See mpn/x86/README about old gas bugs
673 addl (%esp), %ecx
674 addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
675 addl %edx, %ecx
676 ret_internal
680 EPILOGUE()