beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium4 / sse2 / mod_1_1.asm
blobee88babeeea3355e5d839ef5c70fb807908e3b1a
1 dnl x86-32 mpn_mod_1_1p for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2009, 2010 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C TODO:
36 C * Optimize. The present code was written quite straightforwardly.
37 C * Optimize post-loop reduction code; it is from mod_1s_4p, thus overkill.
38 C * Write a cps function that uses sse2 insns.
40 C cycles/limb
41 C P6 model 0-8,10-12 -
42 C P6 model 9 (Banias) ?
43 C P6 model 13 (Dothan) ?
44 C P4 model 0-1 (Willamette) ?
45 C P4 model 2 (Northwood) 16
46 C P4 model 3-4 (Prescott) 18
C INPUT PARAMETERS (stack offsets at function entry, before any push)
C ap		sp + 4
C n		sp + 8
C b		sp + 12
C cps		sp + 16	(pointer to 4 words: bi, cnt, B1modb, B2modb)
C Register aliases used by mpn_mod_1_1p.
C General purpose registers:
define(`ap',     `%edx')	C current position in the limb array
define(`n',      `%eax')	C remaining limb count
C MMX registers holding the two precomputed multipliers:
define(`B1modb', `%mm1')	C loaded from cps offset 8
define(`B2modb', `%mm2')	C loaded from cps offset 12
59 TEXT
60 ALIGN(16)
61 PROLOGUE(mpn_mod_1_1p)
62 push %ebx
63 mov 8(%esp), ap
64 mov 12(%esp), n
65 mov 20(%esp), %ecx
66 movd 8(%ecx), B1modb
67 movd 12(%ecx), B2modb
69 lea -4(ap,n,4), ap
71 C FIXME: See comment in generic/mod_1_1.c.
72 movd (ap), %mm7
73 movd -4(ap), %mm4
74 pmuludq B1modb, %mm7
75 paddq %mm4, %mm7
76 add $-2, n
77 jz L(end)
79 ALIGN(8)
80 L(top): movq %mm7, %mm6
81 psrlq $32, %mm7 C rh
82 movd -8(ap), %mm0
83 add $-4, ap
84 pmuludq B2modb, %mm7
85 pmuludq B1modb, %mm6
86 add $-1, n
87 paddq %mm0, %mm7
88 paddq %mm6, %mm7
89 jnz L(top)
91 L(end): pcmpeqd %mm4, %mm4
92 psrlq $32, %mm4 C 0x00000000FFFFFFFF
93 pand %mm7, %mm4 C rl
94 psrlq $32, %mm7 C rh
95 pmuludq B1modb, %mm7 C rh,cl
96 paddq %mm4, %mm7 C rh,rl
97 movd 4(%ecx), %mm4 C cnt
98 psllq %mm4, %mm7 C rh,rl normalized
99 movq %mm7, %mm2 C rl in low half
100 psrlq $32, %mm7 C rh
101 movd (%ecx), %mm1 C bi
102 pmuludq %mm7, %mm1 C qh,ql
103 paddq %mm2, %mm1 C qh-1,ql
104 movd %mm1, %ecx C ql
105 psrlq $32, %mm1 C qh-1
106 movd 16(%esp), %mm3 C b
107 pmuludq %mm1, %mm3 C (qh-1) * b
108 psubq %mm3, %mm2 C r in low half (could use psubd)
109 movd %mm2, %eax C r
110 mov 16(%esp), %ebx
111 sub %ebx, %eax C r
112 cmp %eax, %ecx
113 lea (%eax,%ebx), %edx
114 cmovc( %edx, %eax)
115 movd %mm4, %ecx C cnt
116 cmp %ebx, %eax
117 jae L(fix)
118 emms
119 pop %ebx
120 shr %cl, %eax
123 L(fix): sub %ebx, %eax
124 emms
125 pop %ebx
126 shr %cl, %eax
128 EPILOGUE()
PROLOGUE(mpn_mod_1_1p_cps)
C CAUTION: This is the same code as in k7/mod_1_1.asm
C
C Computes the four words consumed by mpn_mod_1_1p and stores them
C through the cps pointer: offset 0 bi, 4 cnt, 8 B1modb, 12 B2modb.
C
C Stack arguments (offsets at entry): cps sp+4, b sp+8.
C b must be nonzero, since bsr leaves its destination undefined for a
C zero source.
C
C Registers: %ebp, %esi and %ebx are saved and restored.  %eax, %ecx
C and %edx are used as scratch.
	push	%ebp
	mov	12(%esp), %ebp		C b
	push	%esi
	bsr	%ebp, %ecx		C index of the highest set bit of b
	push	%ebx
	xor	$31, %ecx		C cnt = 31 - index = leading zero bits
	mov	16(%esp), %esi		C cps pointer
	sal	%cl, %ebp		C shift b so that its top bit is set
	mov	%ebp, %edx
	not	%edx
	mov	$-1, %eax
	div	%ebp			C (~b : 0xFFFFFFFF) / b, with ~b < b so no overflow
	mov	%eax, (%esi)		C store bi
	mov	%ecx, 4(%esi)		C store cnt
	xor	%ebx, %ebx
	sub	%ebp, %ebx		C -b
	mov	$1, %edx
	shld	%cl, %eax, %edx		C (1 << cnt) | top cnt bits of bi
	imul	%edx, %ebx		C B1modb before the final right shift
	mul	%ebx			C edx:eax = bi * ebx
	add	%ebx, %edx
	not	%edx
	imul	%ebp, %edx		C candidate value in edx
	add	%edx, %ebp		C candidate + b
	cmp	%edx, %eax		C carry set when low product < candidate
	cmovc(	%ebp, %edx)		C then use candidate + b instead
	shr	%cl, %ebx
	mov	%ebx, 8(%esi)		C store B1modb
	shr	%cl, %edx
	mov	%edx, 12(%esi)		C store B2modb
	pop	%ebx
	pop	%esi
	pop	%ebp
	ret
EPILOGUE()