beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium4 / sse2 / mode1o.asm
blobaa9ef31d826dd94a7185d987e2cc99d4ac64f035
1 dnl Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
3 dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C P4: 19.0 cycles/limb
37 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
38 C mp_limb_t divisor);
39 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
40 C mp_limb_t divisor, mp_limb_t carry);
43 defframe(PARAM_CARRY, 16)
44 defframe(PARAM_DIVISOR,12)
45 defframe(PARAM_SIZE, 8)
46 defframe(PARAM_SRC, 4)
48 TEXT
50 ALIGN(16)
51 PROLOGUE(mpn_modexact_1c_odd)
52 deflit(`FRAME',0)
54 movd PARAM_CARRY, %mm1
55 jmp L(start_1c)
57 EPILOGUE()
60 ALIGN(16)
61 PROLOGUE(mpn_modexact_1_odd)
62 deflit(`FRAME',0)
64 pxor %mm1, %mm1 C carry limb
65 L(start_1c):
66 movl PARAM_DIVISOR, %eax
68 movd PARAM_DIVISOR, %mm7
70 shrl %eax
72 andl $127, %eax C d/2, 7 bits
74 ifdef(`PIC',`
75 LEA( binvert_limb_table, %edx)
76 movzbl (%eax,%edx), %eax C inv 8 bits
77 ',`
78 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
83 movd %eax, %mm6 C inv
85 movd %eax, %mm0 C inv
87 pmuludq %mm6, %mm6 C inv*inv
91 pmuludq %mm7, %mm6 C inv*inv*d
92 paddd %mm0, %mm0 C 2*inv
96 psubd %mm6, %mm0 C inv = 2*inv - inv*inv*d
97 pxor %mm6, %mm6
99 paddd %mm0, %mm6
100 pmuludq %mm0, %mm0 C inv*inv
104 pmuludq %mm7, %mm0 C inv*inv*d
105 paddd %mm6, %mm6 C 2*inv
108 movl PARAM_SRC, %eax
109 movl PARAM_SIZE, %ecx
113 psubd %mm0, %mm6 C inv = 2*inv - inv*inv*d
115 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
116 pushl %eax FRAME_pushl()
117 movd %mm6, %eax
118 imul PARAM_DIVISOR, %eax
119 cmpl $1, %eax
120 popl %eax FRAME_popl()')
122 pxor %mm0, %mm0 C carry bit
125 C The dependent chain here is as follows.
127 C latency
128 C psubq s = (src-cbit) - climb 2
129 C pmuludq q = s*inverse 8
130 C pmuludq prod = q*divisor 8
131 C psrlq climb = high(prod) 2
132 C --
133 C 20
135 C Yet the loop measures 19.0 c/l, so obviously there's something gained
136 C there over a straight reading of the chip documentation.
138 L(top):
139 C eax src, incrementing
140 C ebx
141 C ecx counter, limbs
142 C edx
144 C mm0 carry bit
145 C mm1 carry limb
146 C mm6 inverse
147 C mm7 divisor
149 movd (%eax), %mm2
150 addl $4, %eax
152 psubq %mm0, %mm2 C src - cbit
154 psubq %mm1, %mm2 C src - cbit - climb
155 movq %mm2, %mm0
156 psrlq $63, %mm0 C new cbit
158 pmuludq %mm6, %mm2 C s*inverse
160 movq %mm7, %mm1
161 pmuludq %mm2, %mm1 C q*divisor
162 psrlq $32, %mm1 C new climb
164 subl $1, %ecx
165 jnz L(top)
168 L(done):
169 paddq %mm1, %mm0
170 movd %mm0, %eax
171 emms
174 EPILOGUE()
175 ASM_END()