[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k7 / sqr_basecase.asm

dnl  AMD K7 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
C     (measured on the speed difference between 25 and 50 limbs, which is
C     roughly the Karatsuba recursing range).

dnl  These are the same as mpn/x86/k6/sqr_basecase.asm; see that code for
dnl  some comments.

deflit(SQR_TOOM2_THRESHOLD_MAX, 66)

ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')

m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))

C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes, which
C is quite a bit, but is considered good value since squares big enough to
C use most of the code will spend quite a few cycles in it.
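C
C The overall scheme, as a rough C sketch (illustrative only, written in
C terms of the generic mpn entrypoints; the code below inlines and unrolls
C all of these steps, and dst[0] is left untouched until the diagonal):
C
C	/* cross products src[i]*src[j], i<j, summed at dst[1..2*size-2] */
C	dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
C	for (i = 1; i < size-1; i++)
C	  dst[size+i] = mpn_addmul_1 (dst+2*i+1, src+i+1, size-1-i, src[i]);
C
C	/* double them, then add the squares src[i]^2 on the diagonal */
C	dst[2*size-1] = mpn_lshift (dst+1, dst+1, 2*size-2, 1);
C	/* dst[2*i..2*i+1] += src[i]^2, carrying upwards, i = 0..size-1 */
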
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %eax
	cmpl	$2, %ecx

	movl	PARAM_DST, %edx
	je	L(two_limbs)
	ja	L(three_or_more)

C------------------------------------------------------------------------------
C one limb only
	C eax	src
	C ecx	size
	C edx	dst

	movl	(%eax), %eax
	movl	%edx, %ecx

	mull	%eax

	movl	%edx, 4(%ecx)
	movl	%eax, (%ecx)
	ret

C------------------------------------------------------------------------------

C Using the read/modify/write "add"s seems to be faster than saving and
C restoring registers.  Perhaps the loads for the first set hide under the
C mul latency and the second set gets store-to-load forwarding.

	ALIGN(16)
L(two_limbs):
	C eax	src
	C ebx
	C ecx	size
	C edx	dst
deflit(`FRAME',0)

	pushl	%ebx		FRAME_pushl()
	movl	%eax, %ebx	C src
	movl	(%eax), %eax

	movl	%edx, %ecx	C dst

	mull	%eax		C src[0]^2

	movl	%eax, (%ecx)	C dst[0]
	movl	4(%ebx), %eax

	movl	%edx, 4(%ecx)	C dst[1]

	mull	%eax		C src[1]^2

	movl	%eax, 8(%ecx)	C dst[2]
	movl	(%ebx), %eax

	movl	%edx, 12(%ecx)	C dst[3]

	mull	4(%ebx)		C src[0]*src[1]

	popl	%ebx
	addl	%eax, 4(%ecx)

	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)
	ASSERT(nc)
	addl	%eax, 4(%ecx)

	adcl	%edx, 8(%ecx)
	adcl	$0, 12(%ecx)
	ASSERT(nc)

	ret

C------------------------------------------------------------------------------
defframe(SAVE_EBX, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EDI, -12)
defframe(SAVE_EBP, -16)
deflit(STACK_SPACE, 16)

L(three_or_more):
	subl	$STACK_SPACE, %esp
	cmpl	$4, %ecx
	jae	L(four_or_more)
deflit(`FRAME',STACK_SPACE)

C------------------------------------------------------------------------------
C Three limbs

C Writing out the loads and stores separately at the end of this code comes
C out about 10 cycles faster than using adcls to memory.

	C eax	src
	C ecx	size
	C edx	dst

	movl	%ebx, SAVE_EBX
	movl	%eax, %ebx	C src
	movl	(%eax), %eax

	movl	%edx, %ecx	C dst
	movl	%esi, SAVE_ESI
	movl	%edi, SAVE_EDI

	mull	%eax		C src[0] ^ 2

	movl	%eax, (%ecx)
	movl	4(%ebx), %eax
	movl	%edx, 4(%ecx)

	mull	%eax		C src[1] ^ 2

	movl	%eax, 8(%ecx)
	movl	8(%ebx), %eax
	movl	%edx, 12(%ecx)

	mull	%eax		C src[2] ^ 2

	movl	%eax, 16(%ecx)
	movl	(%ebx), %eax
	movl	%edx, 20(%ecx)

	mull	4(%ebx)		C src[0] * src[1]

	movl	%eax, %esi
	movl	(%ebx), %eax
	movl	%edx, %edi

	mull	8(%ebx)		C src[0] * src[2]

	addl	%eax, %edi
	movl	%ebp, SAVE_EBP
	movl	$0, %ebp

	movl	4(%ebx), %eax
	adcl	%edx, %ebp

	mull	8(%ebx)		C src[1] * src[2]

	xorl	%ebx, %ebx
	addl	%eax, %ebp

	adcl	$0, %edx

	C eax
	C ebx	zero, will be dst[5]
	C ecx	dst
	C edx	dst[4]
	C esi	dst[1]
	C edi	dst[2]
	C ebp	dst[3]

	adcl	$0, %edx
	addl	%esi, %esi

	adcl	%edi, %edi
	movl	4(%ecx), %eax

	adcl	%ebp, %ebp

	adcl	%edx, %edx

	adcl	$0, %ebx
	addl	%eax, %esi
	movl	8(%ecx), %eax

	adcl	%eax, %edi
	movl	12(%ecx), %eax
	movl	%esi, 4(%ecx)

	adcl	%eax, %ebp
	movl	16(%ecx), %eax
	movl	%edi, 8(%ecx)

	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi

	adcl	%eax, %edx
	movl	20(%ecx), %eax
	movl	%ebp, 12(%ecx)

	adcl	%ebx, %eax
	ASSERT(nc)
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp

	movl	%edx, 16(%ecx)
	movl	%eax, 20(%ecx)
	addl	$FRAME, %esp

	ret

C------------------------------------------------------------------------------
L(four_or_more):

C First multiply src[0]*src[1..size-1] and store at dst[1..size].
C Further products are added in rather than stored.

	C eax	src
	C ebx
	C ecx	size
	C edx	dst
	C esi
	C edi
	C ebp

defframe(`VAR_COUNTER',-20)
defframe(`VAR_JMP',    -24)
deflit(EXTRA_STACK_SPACE, 8)

	movl	%ebx, SAVE_EBX
	movl	%edi, SAVE_EDI
	leal	(%edx,%ecx,4), %edi	C &dst[size]

	movl	%esi, SAVE_ESI
	movl	%ebp, SAVE_EBP
	leal	(%eax,%ecx,4), %esi	C &src[size]

	movl	(%eax), %ebp	C multiplier
	movl	$0, %ebx
	decl	%ecx

	negl	%ecx
	subl	$EXTRA_STACK_SPACE, %esp
FRAME_subl_esp(EXTRA_STACK_SPACE)

L(mul_1):
	C eax	scratch
	C ebx	carry
	C ecx	counter
	C edx	scratch
	C esi	&src[size]
	C edi	&dst[size]
	C ebp	multiplier

	movl	(%esi,%ecx,4), %eax

	mull	%ebp

	addl	%ebx, %eax
	movl	%eax, (%edi,%ecx,4)
	movl	$0, %ebx

	adcl	%edx, %ebx
	incl	%ecx
	jnz	L(mul_1)

C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for each n=1..size-2.

C The last two products, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
C The unrolled code is the same as in mpn_addmul_1; see that routine for
C some comments.

C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.

C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
C K7 does branch prediction on indirect jumps, which is bad since it's a
C different target each time.  There seems to be no way to avoid this.

dnl  This value is also hard-coded in some shifts and adds.
deflit(CODE_BYTES_PER_LIMB, 17)
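
dnl  Note the multiply by CODE_BYTES_PER_LIMB=17 below is done without a
dnl  mul instruction: one copy of the count is shifted left by 4 (x16) and
dnl  the other copy comes back in as the index register of the leal
dnl  (16x + 1x = 17x).
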
dnl  With the unmodified &src[size] and &dst[size] pointers, the
dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
dnl  values up to 31, but above that an offset must be added to them.
deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>31),1,
eval((UNROLL_COUNT-31)*4),
0))

dnl  Because the last chunk of code is generated differently, a label placed
dnl  at the end doesn't work.  Instead calculate the implied end using the
dnl  start and how many chunks of code there are.

deflit(UNROLL_INNER_END,
`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')

	C eax
	C ebx	carry
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[size]
	C ebp

	movl	PARAM_SIZE, %ecx
	movl	%ebx, (%edi)

	subl	$4, %ecx
	jz	L(corner)

	negl	%ecx
ifelse(OFFSET,0,,`subl	$OFFSET, %edi')
ifelse(OFFSET,0,,`subl	$OFFSET, %esi')

	movl	%ecx, %edx
	shll	$4, %ecx

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
')

	C The calculated jump mustn't land before the start of the available
	C code.  This is the limit UNROLL_COUNT puts on the src operand
	C size, but checked here directly using the jump address.
	ASSERT(ae,
	`movl_text_address(L(unroll_inner_start), %eax)
	cmpl	%eax, %ecx')

C------------------------------------------------------------------------------
	ALIGN(16)
L(unroll_outer_top):
	C eax
	C ebx	high limb to store
	C ecx	VAR_JMP
	C edx	VAR_COUNTER, limbs, negative
	C esi	&src[size], constant
	C edi	dst ptr, high of last addmul
	C ebp

	movl	-12+OFFSET(%esi,%edx,4), %ebp	C next multiplier
	movl	-8+OFFSET(%esi,%edx,4), %eax	C first of multiplicand

	movl	%edx, VAR_COUNTER

	mull	%ebp

define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')

	testb	$1, %cl
	movl	%edx, %ebx	C high carry
	movl	%ecx, %edx	C jump

	movl	%eax, %ecx	C low carry
	cmovX(	%ebx, %ecx)	C high carry reverse
	cmovX(	%eax, %ebx)	C low carry reverse
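
	C The unrolled chunks alternate which of ecx/ebx holds the low
	C carry; the testb above reads the entry parity from the low bit of
	C the jump address, and these cmovs arrange the just-computed carry
	C pair in whichever order that entry point expects.
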
	leal	CODE_BYTES_PER_LIMB(%edx), %eax
	xorl	%edx, %edx
	leal	4(%edi), %edi

	movl	%eax, VAR_JMP

	jmp	*%eax

ifdef(`PIC',`
L(pic_calc):
	addl	(%esp), %ecx
	addl	$UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
	addl	%edx, %ecx
	ret_internal
')

	C Must be an even address to preserve the significance of the low
	C bit of the jump address indicating which way around ecx/ebx should
	C start.
	ALIGN(2)

L(unroll_inner_start):
	C eax	next limb
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier

forloop(`i', UNROLL_COUNT, 1, `
	deflit(`disp_src', eval(-i*4 + OFFSET))
	deflit(`disp_dst', eval(disp_src - 4))

	m4_assert(`disp_src>=-128 && disp_src<128')
	m4_assert(`disp_dst>=-128 && disp_dst<128')

ifelse(eval(i%2),0,`
Zdisp(	movl,	disp_src,(%esi), %eax)
	adcl	%edx, %ebx

	mull	%ebp

Zdisp(	addl,	%ecx, disp_dst,(%edi))
	movl	$0, %ecx

	adcl	%eax, %ebx

',`
	dnl  this bit comes out last
Zdisp(	movl,	disp_src,(%esi), %eax)
	adcl	%edx, %ecx

	mull	%ebp

Zdisp(	addl,	%ebx, disp_dst,(%edi))

ifelse(forloop_last,0,
`	movl	$0, %ebx')

	adcl	%eax, %ecx
')
')

	C eax	next limb
	C ebx	carry high
	C ecx	carry low
	C edx	scratch
	C esi	src
	C edi	dst
	C ebp	multiplier

	adcl	$0, %edx
	addl	%ecx, -4+OFFSET(%edi)
	movl	VAR_JMP, %ecx

	adcl	$0, %edx

	movl	%edx, m4_empty_if_zero(OFFSET) (%edi)
	movl	VAR_COUNTER, %edx

	incl	%edx
	jnz	L(unroll_outer_top)

ifelse(OFFSET,0,,`
	addl	$OFFSET, %esi
	addl	$OFFSET, %edi
')

C------------------------------------------------------------------------------
L(corner):
	C esi	&src[size]
	C edi	&dst[2*size-5]

	movl	-12(%esi), %ebp
	movl	-8(%esi), %eax
	movl	%eax, %ecx

	mull	%ebp

	addl	%eax, -4(%edi)
	movl	-4(%esi), %eax

	adcl	$0, %edx
	movl	%edx, %ebx
	movl	%eax, %esi

	mull	%ebp

	addl	%ebx, %eax

	adcl	$0, %edx
	addl	%eax, (%edi)
	movl	%esi, %eax

	adcl	$0, %edx
	movl	%edx, %ebx

	mull	%ecx

	addl	%ebx, %eax
	movl	%eax, 4(%edi)

	adcl	$0, %edx
	movl	%edx, 8(%edi)

C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
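C
C In C, this phase is roughly (a sketch only, assuming 32-bit limbs):
C
C	uint32_t cy = 0, x;
C	for (i = 1; i <= 2*size-2; i++) {
C	  x = dst[i];
C	  dst[i] = (x << 1) | cy;	/* the rcll pair below handles */
C	  cy = x >> 31;			/* two limbs per iteration     */
C	}
C	dst[2*size-1] = cy;
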
L(lshift_start):
	movl	PARAM_SIZE, %eax
	movl	PARAM_DST, %edi
	xorl	%ecx, %ecx	C clear carry

	leal	(%edi,%eax,8), %edi
	notl	%eax		C -size-1, preserve carry

	leal	2(%eax), %eax	C -(size-1)

L(lshift):
	C eax	counter, negative
	C ebx
	C ecx
	C edx
	C esi
	C edi	dst, pointing just after last limb
	C ebp

	rcll	-4(%edi,%eax,8)
	rcll	(%edi,%eax,8)
	incl	%eax
	jnz	L(lshift)

	setc	%al

	movl	PARAM_SRC, %esi
	movl	%eax, -4(%edi)	C dst most significant limb

	movl	PARAM_SIZE, %ecx

C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
C low limb of src[0]^2.
	movl	(%esi), %eax	C src[0]

	mull	%eax

	leal	(%esi,%ecx,4), %esi	C src points just after last limb
	negl	%ecx

	movl	%eax, (%edi,%ecx,8)	C dst[0]
	incl	%ecx

L(diag):
	C eax	scratch
	C ebx	scratch
	C ecx	counter, negative
	C edx	carry
	C esi	src just after last limb
	C edi	dst just after last limb
	C ebp

	movl	(%esi,%ecx,4), %eax
	movl	%edx, %ebx

	mull	%eax

	addl	%ebx, -4(%edi,%ecx,8)
	adcl	%eax, (%edi,%ecx,8)
	adcl	$0, %edx

	incl	%ecx
	jnz	L(diag)

	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	addl	%edx, -4(%edi)	C dst most significant limb
	movl	SAVE_EDI, %edi

	movl	SAVE_EBP, %ebp
	addl	$FRAME, %esp

	ret

EPILOGUE()