beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium4 / sse2 / mod_1_4.asm
blob eb2edb6297d144226502063804483103e7d0e6f7
1 dnl x86-32 mpn_mod_1s_4p for Pentium 4 and P6 models with SSE2 (i.e. 9,D,E,F).
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2009, 2010 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C TODO:
36 C * Optimize. The present code was written quite straightforwardly.
37 C * Optimize post-loop reduction code.
38 C * Write a cps function that uses sse2 insns.
40 C cycles/limb
41 C P6 model 0-8,10-12 -
42 C P6 model 9 (Banias) ?
43 C P6 model 13 (Dothan) 3.4
44 C P4 model 0-1 (Willamette) ?
45 C P4 model 2 (Northwood) 4
46 C P4 model 3-4 (Prescott) 4.5
48 C INPUT PARAMETERS
49 C ap sp + 4
50 C n sp + 8
51 C b sp + 12
52 C cps sp + 16
C Symbolic register names used by mpn_mod_1s_4p.

C General registers that receive the first two stack arguments.
define(`ap',     `%edx')
define(`n',      `%eax')

C MMX registers that receive the words at byte offsets 8, 12, 16, 20 and 24
C of the cps table.
define(`B1modb', `%mm1')
define(`B2modb', `%mm2')
define(`B3modb', `%mm3')
define(`B4modb', `%mm4')
define(`B5modb', `%mm5')

ASM_START()
	TEXT
C -----------------------------------------------------------------------------
C mpn_mod_1s_4p
C
C Folds the n limbs at ap into a single-limb remainder with respect to b,
C consuming four limbs per loop iteration, and returns it in %eax.
C
C Arguments are on the stack as listed under INPUT PARAMETERS: ap, n, b, cps.
C cps is the table written by mpn_mod_1s_4p_cps (byte offsets):
C	0 bi   4 cnt   8 B1modb   12 B2modb   16 B3modb   20 B4modb   24 B5modb
C
C %ebx is saved and restored.  %eax, %ecx, %edx, %mm0-%mm7 and the flags are
C overwritten; emms is executed on both return paths.
C
C NOTE(review): the final shift right by cnt suggests b is passed already
C shifted left by cnt -- confirm against the caller.
C
C Both exits end in an explicit ret.  Without it the normal exit would run on
C into L(fix), and L(fix) would run past EPILOGUE into the next function.
C -----------------------------------------------------------------------------
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p)
	push	%ebx			C saved here, so arguments sit 4 bytes higher
	mov	8(%esp), ap
	mov	12(%esp), n
	mov	20(%esp), %ecx		C cps table

	movd	8(%ecx), B1modb
	movd	12(%ecx), B2modb
	movd	16(%ecx), B3modb
	movd	20(%ecx), B4modb
	movd	24(%ecx), B5modb

	mov	n, %ebx
	lea	-4(ap,n,4), ap		C ap = address of the most significant limb
	and	$3, %ebx		C n mod 4 selects the entry sequence
	je	L(b0)			C n mod 4 = 0
	cmp	$2, %ebx
	jc	L(b1)			C below 2 and nonzero, so n mod 4 = 1
	je	L(b2)			C n mod 4 = 2; fall through for 3

C n mod 4 = 3: fold the three top limbs into the 64-bit accumulator mm7.
L(b3):	movd	-4(ap), %mm7
	pmuludq	B1modb, %mm7		C middle limb * B1modb
	movd	-8(ap), %mm6
	paddq	%mm6, %mm7		C + lowest of the three limbs
	movd	(ap), %mm6
	pmuludq	B2modb, %mm6
	paddq	%mm6, %mm7		C + top limb * B2modb
	lea	-24(ap), ap		C 0..12(ap) now cover the next four limbs down
	add	$-3, n
	jz	L(end)
	jmp	L(top)

C n mod 4 = 0: fold the four top limbs into mm7.
L(b0):	movd	-8(ap), %mm7
	pmuludq	B1modb, %mm7
	movd	-12(ap), %mm6
	paddq	%mm6, %mm7		C + lowest of the four limbs
	movd	-4(ap), %mm6
	pmuludq	B2modb, %mm6
	paddq	%mm6, %mm7
	movd	(ap), %mm6
	pmuludq	B3modb, %mm6
	paddq	%mm6, %mm7		C + top limb * B3modb
	lea	-28(ap), ap
	add	$-4, n
	jz	L(end)
	jmp	L(top)

C n mod 4 = 1: the accumulator starts as the top limb alone.
L(b1):	movd	(ap), %mm7
	lea	-16(ap), ap
	dec	n
	jz	L(x)			C single limb: skip the fold at L(end)
	jmp	L(top)

C n mod 4 = 2: the accumulator starts as the two top limbs.
L(b2):	movd	-4(ap), %mm7		C rl
	punpckldq (ap), %mm7		C rh
	lea	-20(ap), ap
	add	$-2, n
	jz	L(end)

C Main loop.  mm7 holds the running value rh:rl.  Each pass computes
C   mm7 = ap[0] + ap[1]*B1modb + ap[2]*B2modb + ap[3]*B3modb
C	+ rl*B4modb + rh*B5modb
C and then steps ap down by four limbs.
	ALIGN(8)
L(top):	movd	4(ap), %mm0
	pmuludq	B1modb, %mm0
	movd	0(ap), %mm6
	paddq	%mm6, %mm0

	movd	8(ap), %mm6
	pmuludq	B2modb, %mm6
	paddq	%mm6, %mm0

	movd	12(ap), %mm6
	pmuludq	B3modb, %mm6
	paddq	%mm6, %mm0

	movq	%mm7, %mm6		C copy; pmuludq reads only the low dword (rl)
	psrlq	$32, %mm7		C rh
	pmuludq	B5modb, %mm7
	pmuludq	B4modb, %mm6

	paddq	%mm0, %mm7
	paddq	%mm6, %mm7

	add	$-16, ap
	add	$-4, n
	jnz	L(top)

C Fold the accumulator once more: mm7 = rh*B1modb + rl.
C mm4 (B4modb) is no longer needed and is reused as scratch from here on.
L(end):	pcmpeqd	%mm4, %mm4
	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
	pand	%mm7, %mm4		C rl
	psrlq	$32, %mm7		C rh
	pmuludq	B1modb, %mm7		C rh,cl
	paddq	%mm4, %mm7		C rh,rl

C Final reduction using bi and cnt from the cps table.
L(x):	movd	4(%ecx), %mm4		C cnt
	psllq	%mm4, %mm7		C rh,rl normalized
	movq	%mm7, %mm2		C rl in low half
	psrlq	$32, %mm7		C rh
	movd	(%ecx), %mm1		C bi
	pmuludq	%mm7, %mm1		C qh,ql
	paddq	%mm2, %mm1		C qh-1,ql
	movd	%mm1, %ecx		C ql
	psrlq	$32, %mm1		C qh-1
	movd	16(%esp), %mm3		C b
	pmuludq	%mm1, %mm3		C (qh-1) * b
	psubq	%mm3, %mm2		C r in low half (could use psubd)
	movd	%mm2, %eax		C r
	mov	16(%esp), %ebx		C b
	sub	%ebx, %eax		C r
	cmp	%eax, %ecx		C carry when ql < r
	lea	(%eax,%ebx), %edx	C r + b, flags untouched
	cmovc(	%edx, %eax)		C on carry take r + b
	movd	%mm4, %ecx		C cnt
	cmp	%ebx, %eax
	jae	L(fix)			C r >= b needs one more subtraction
	emms
	pop	%ebx
	shr	%cl, %eax		C shift the remainder back down by cnt
	ret

L(fix):	sub	%ebx, %eax
	emms
	pop	%ebx
	shr	%cl, %eax		C shift the remainder back down by cnt
	ret
EPILOGUE()
C -----------------------------------------------------------------------------
C mpn_mod_1s_4p_cps
C
C Fills the seven-word table that mpn_mod_1s_4p reads.
C
C Stack arguments at entry: 4(%esp) = table pointer, 8(%esp) = b.
C Words stored, by byte offset into the table:
C	0 bi   4 cnt   8 B1modb   12 B2modb   16 B3modb   20 B4modb   24 B5modb
C
C cnt is 31 minus the index of the highest set bit of b, so b << cnt has its
C top bit set.  All B*modb values are shifted right by cnt before the store.
C
C %ebp, %edi, %esi and %ebx are saved and restored.  %eax, %ecx, %edx and
C the flags are overwritten.
C
C The function ends in an explicit ret; without it execution would continue
C past EPILOGUE into whatever follows.
C -----------------------------------------------------------------------------
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p_cps)
C CAUTION: This is the same code as in k7/mod_1_4.asm
	push	%ebp
	push	%edi
	push	%esi
	push	%ebx
	mov	20(%esp), %ebp		C FIXME: avoid bp for 0-idx
	mov	24(%esp), %ebx		C b
	bsr	%ebx, %ecx		C index of the highest set bit
	xor	$31, %ecx		C cnt = 31 - index
	sal	%cl, %ebx		C b << cnt
	mov	%ebx, %edx
	not	%edx
	mov	$-1, %eax
	div	%ebx			C eax = (~b:0xFFFFFFFF) / b; ~b < b, no overflow
	xor	%edi, %edi
	sub	%ebx, %edi		C edi = -b
	mov	$1, %esi
	mov	%eax, (%ebp)		C store bi
	mov	%ecx, 4(%ebp)		C store cnt
	shld	%cl, %eax, %esi		C esi = 1:bi shifted left by cnt, low word kept
	imul	%edi, %esi		C esi = -b * esi
	mov	%eax, %edi		C edi = bi for the remaining steps
	mul	%esi			C edx:eax = bi * esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 8(%ebp)		C store B1modb

C Each of the next steps derives the following value from the product above:
C   edx = ~edx * b; take edx when eax >= edx, otherwise edx + b.
	not	%edx
	imul	%ebx, %edx
	lea	(%edx,%ebx), %esi	C edx + b, flags untouched
	cmp	%edx, %eax		C carry when eax < edx
	cmovnc(	%edx, %esi)
	mov	%edi, %eax
	mul	%esi			C edx:eax = bi * esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 12(%ebp)		C store B2modb

	not	%edx
	imul	%ebx, %edx
	lea	(%edx,%ebx), %esi
	cmp	%edx, %eax
	cmovnc(	%edx, %esi)
	mov	%edi, %eax
	mul	%esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 16(%ebp)		C store B3modb

	not	%edx
	imul	%ebx, %edx
	lea	(%edx,%ebx), %esi
	cmp	%edx, %eax
	cmovnc(	%edx, %esi)
	mov	%edi, %eax
	mul	%esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 20(%ebp)		C store B4modb

C Last step: b is not needed afterwards, so the result is built in ebx.
	not	%edx
	imul	%ebx, %edx
	add	%edx, %ebx
	cmp	%edx, %eax
	cmovnc(	%edx, %ebx)

	shr	%cl, %ebx
	mov	%ebx, 24(%ebp)		C store B5modb

	pop	%ebx
	pop	%esi
	pop	%edi
	pop	%ebp
	ret
EPILOGUE()