beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium / mode1o.asm
bloba90abca5a5668bc51135d51f95df4cf9efe0bb56
1 dnl Intel Pentium mpn_modexact_1_odd -- exact division style remainder.
3 dnl Copyright 2000-2002, 2014 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C P5: 23.0 cycles/limb
37 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
38 C mp_limb_t divisor);
39 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
40 C mp_limb_t divisor, mp_limb_t carry);
42 C There seems no way to pair up the two lone instructions in the main loop.
44 C The special case for size==1 saves about 20 cycles (non-PIC), making it
45 C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at
46 C all sizes.
48 C Alternatives:
50 C Using mmx for the multiplies might be possible, with pmullw and pmulhw
51 C having just 3 cycle latencies, but carry bit handling would probably be
52 C complicated.
54 defframe(PARAM_CARRY, 16)
55 defframe(PARAM_DIVISOR,12)
56 defframe(PARAM_SIZE, 8)
57 defframe(PARAM_SRC, 4)
59 dnl re-using parameter space
60 define(VAR_INVERSE,`PARAM_SIZE')
62 TEXT
64 ALIGN(16)
65 PROLOGUE(mpn_modexact_1c_odd)
66 deflit(`FRAME',0)
68 movl PARAM_DIVISOR, %eax
69 movl PARAM_CARRY, %edx
71 jmp L(start_1c)
73 EPILOGUE()
75 ALIGN(16)
76 PROLOGUE(mpn_modexact_1_odd)
77 deflit(`FRAME',0)
79 movl PARAM_DIVISOR, %eax
80 xorl %edx, %edx C carry
82 L(start_1c):
84 ifdef(`PIC',`
85 ifdef(`DARWIN',`
86 shrl %eax C d/2
87 LEA( binvert_limb_table, %ecx)
88 pushl %ebx FRAME_pushl()
89 movl PARAM_SIZE, %ebx
91 andl $127, %eax
92 subl $2, %ebx
94 movb (%eax,%ecx), %cl
95 jc L(one_limb)
96 ',`
97 call L(here) FRAME_pushl()
98 L(here):
100 shrl %eax C d/2
101 movl (%esp), %ecx C eip
103 addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx
104 movl %ebx, (%esp) C push ebx
106 andl $127, %eax
107 movl PARAM_SIZE, %ebx
109 movl binvert_limb_table@GOT(%ecx), %ecx
110 subl $2, %ebx
112 movb (%eax,%ecx), %cl C inv 8 bits
113 jc L(one_limb)
116 dnl non-PIC
117 shrl %eax C d/2
118 pushl %ebx FRAME_pushl()
120 movl PARAM_SIZE, %ebx
121 andl $127, %eax
123 subl $2, %ebx
124 jc L(one_limb)
126 movb binvert_limb_table(%eax), %cl C inv 8 bits
129 movl %ecx, %eax
130 addl %ecx, %ecx C 2*inv
132 imull %eax, %eax C inv*inv
134 imull PARAM_DIVISOR, %eax C inv*inv*d
136 subl %eax, %ecx C inv = 2*inv - inv*inv*d
138 movl %ecx, %eax
139 addl %ecx, %ecx C 2*inv
141 imull %eax, %eax C inv*inv
143 imull PARAM_DIVISOR, %eax C inv*inv*d
145 subl %eax, %ecx C inv = 2*inv - inv*inv*d
146 pushl %esi FRAME_pushl()
148 ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
149 movl %ecx, %eax
150 imull PARAM_DIVISOR, %eax
151 cmpl $1, %eax')
153 movl PARAM_SRC, %esi
154 movl %ecx, VAR_INVERSE
156 movl (%esi), %eax C src[0]
157 leal 4(%esi,%ebx,4), %esi C &src[size-1]
159 xorl $-1, %ebx C -(size-1)
160 ASSERT(nz)
161 jmp L(entry)
164 C The use of VAR_INVERSE means only a store is needed for that value, rather
165 C than a push and pop of say %edi.
167 ALIGN(16)
168 L(top):
169 C eax scratch, low product
170 C ebx counter, limbs, negative
171 C ecx carry bit
172 C edx scratch, high product
173 C esi &src[size-1]
174 C edi
175 C ebp
177 mull PARAM_DIVISOR C h:dummy = q*d
179 movl (%esi,%ebx,4), %eax C src[i]
180 subl %ecx, %edx C h -= -c
182 L(entry):
183 subl %edx, %eax C s = src[i] - h
185 sbbl %ecx, %ecx C new -c (0 or -1)
187 imull VAR_INVERSE, %eax C q = s*i
189 incl %ebx
190 jnz L(top)
193 mull PARAM_DIVISOR
195 movl (%esi), %eax C src high
196 subl %ecx, %edx C h -= -c
198 cmpl PARAM_DIVISOR, %eax
200 jbe L(skip_last)
201 deflit(FRAME_LAST,FRAME)
204 subl %edx, %eax C s = src[i] - h
205 popl %esi FRAME_popl()
207 sbbl %ecx, %ecx C c (0 or -1)
208 popl %ebx FRAME_popl()
210 imull VAR_INVERSE, %eax C q = s*i
212 mull PARAM_DIVISOR C h:dummy = q*d
214 movl %edx, %eax
216 subl %ecx, %eax
221 C When high<divisor can skip last step.
223 L(skip_last):
224 deflit(`FRAME',FRAME_LAST)
225 C eax src high
226 C ebx
227 C ecx
228 C edx r
229 C esi
231 subl %eax, %edx C r-s
232 popl %esi FRAME_popl()
234 sbbl %eax, %eax C -1 if underflow
235 movl PARAM_DIVISOR, %ebx
237 andl %ebx, %eax C divisor if underflow
238 popl %ebx FRAME_popl()
240 addl %edx, %eax C addback if underflow
245 C Special case for size==1 using a division for r = c-a mod d.
246 C Could look for a-c<d and save a division sometimes, but that doesn't seem
247 C worth bothering about.
249 L(one_limb):
250 deflit(`FRAME',4)
251 C eax
252 C ebx size-2 (==-1)
253 C ecx
254 C edx carry
255 C esi src end
256 C edi
257 C ebp
259 movl %edx, %eax
260 movl PARAM_SRC, %edx
262 movl PARAM_DIVISOR, %ecx
263 popl %ebx FRAME_popl()
265 subl (%edx), %eax C c-a
267 sbbl %edx, %edx
268 decl %ecx C d-1
270 andl %ecx, %edx C b*d+c-a if c<a, or c-a if c>=a
272 divl PARAM_DIVISOR
274 movl %edx, %eax
278 EPILOGUE()
279 ASM_END()