dnl  beta-0.89.2
dnl  [luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k6 / aors_n.asm
dnl  blob 168f9b4ae4a80967e5c9e45f9b9f045b3e17effd
dnl  AMD K6 mpn_add/sub_n -- mpn addition or subtraction.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.


C Pick the carry-propagating instruction (M4_inst), the two entry point
C names and the verb used in the documentation, depending on which of
C OPERATION_add_n or OPERATION_sub_n is defined when this file is processed.
C If neither is defined, m4_error is invoked with the message below.
ifdef(`OPERATION_add_n', `
	define(M4_inst,        adcl)
	define(M4_function_n,  mpn_add_n)
	define(M4_function_nc, mpn_add_nc)
	define(M4_description, add)
',`ifdef(`OPERATION_sub_n', `
	define(M4_inst,        sbbl)
	define(M4_function_n,  mpn_sub_n)
	define(M4_function_nc, mpn_sub_nc)
	define(M4_description, subtract)
',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
')')')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                          mp_size_t size);
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                           mp_size_t size, mp_limb_t carry);
C
C Calculate src1,size M4_description src2,size, and store the result in
C dst,size.  The return value is the carry bit from the top of the result
C (1 or 0).
C
C The _nc version accepts 1 or 0 for an initial carry into the low limb of
C the calculation.  Note values other than 1 or 0 here will lead to garbage
C results.
C
C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
C an in-place dst+=src to 2.5 c/l.  The unrolled loops have 1 cycle/loop of
C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
C Stack locations of the arguments, relative to esp.  FRAME sits inside the
C quoted macro bodies, so it is expanded at each point of use rather than
C here; it starts at 0 and is presumably bumped by every FRAME_pushl (that
C macro is defined outside this file, to be confirmed there) so that the
C same PARAM_xxx name stays valid after registers have been pushed.
define(PARAM_CARRY, `FRAME+20(%esp)')
define(PARAM_SIZE,  `FRAME+16(%esp)')
define(PARAM_SRC2,  `FRAME+12(%esp)')
define(PARAM_SRC1,  `FRAME+8(%esp)')
define(PARAM_DST,   `FRAME+4(%esp)')
deflit(`FRAME',0)

dnl  minimum 5 because the unrolled code can't handle less
deflit(UNROLL_THRESHOLD, 5)
C Code section, with the entry points aligned as requested by ALIGN(32).
	TEXT
	ALIGN(32)

C mp_limb_t M4_function_nc (dst, src1, src2, size, carry)
C
C Carry-in entry point.  Loads the carry argument into eax and joins the
C common code at L(start) inside M4_function_n, which expects the initial
C carry in eax.  Nothing has been pushed yet, so FRAME is still 0 here.
PROLOGUE(M4_function_nc)
	movl	PARAM_CARRY, %eax
	jmp	L(start)
EPILOGUE()
C mp_limb_t M4_function_n (dst, src1, src2, size)
C
C Entry point without carry-in: eax is cleared so that the shared code at
C L(start) begins with a carry of 0.  M4_function_nc jumps to L(start) with
C the carry supplied by its caller in eax instead.
C
C Three code paths follow, each ending in its own ret:
C   L(simple)   size below UNROLL_THRESHOLD, one limb per iteration
C   L(unroll)   dst different from the sources, four limbs per iteration
C   L(inplace)  dst==src1 (or dst==src2 when adding), four limbs per iteration
C
C The running carry/borrow lives in the CPU carry flag between successive
C M4_inst instructions, so everything placed between them must leave CF
C untouched: leal for pointer and counter arithmetic, loop for loop control,
C decl for the tail count, and movl $0 (not xorl) to clear eax before setc.
C
C ebx, edi (and esi on the unrolled paths) are saved and restored; eax, ecx
C and edx are used freely.  The return value in eax is the final carry flag.

PROLOGUE(M4_function_n)
	xorl	%eax, %eax
L(start):
	C eax	initial carry, 0 or 1

	movl	PARAM_SIZE, %ecx
	pushl	%ebx
FRAME_pushl()

	movl	PARAM_SRC1, %ebx
	pushl	%edi
FRAME_pushl()

	movl	PARAM_SRC2, %edx
	cmpl	$UNROLL_THRESHOLD, %ecx

	C movl leaves the flags from the cmpl intact for the jae below
	movl	PARAM_DST, %edi
	jae	L(unroll)


	shrl	%eax		C initial carry flag

	C offset 0x21 here, close enough to aligned
L(simple):
	C eax	scratch
	C ebx	src1
	C ecx	counter
	C edx	src2
	C esi
	C edi	dst
	C ebp
	C
	C The store to (%edi) could be done with a stosl; it'd be smaller
	C code, but there's no speed gain and a cld would have to be added
	C (per mpn/x86/README).
	C
	C loop decrements ecx before testing it, so a size of 0 is not
	C handled by this path.

	movl	(%ebx), %eax
	leal	4(%ebx), %ebx

	M4_inst	(%edx), %eax

	movl	%eax, (%edi)
	leal	4(%edi), %edi

	leal	4(%edx), %edx
	loop	L(simple)


	C return the carry flag as 0 or 1 without disturbing it first
	movl	$0, %eax
	popl	%edi

	setc	%al

	popl	%ebx
	ret


C -----------------------------------------------------------------------------
L(unroll):
	C eax	carry
	C ebx	src1
	C ecx	counter
	C edx	src2
	C esi
	C edi	dst
	C ebp

	cmpl	%edi, %ebx
	pushl	%esi

	je	L(inplace)

C Addition is commutative, so dst==src2 can use the in-place code as well
C once the roles of the two sources are swapped.  Subtraction cannot.
ifdef(`OPERATION_add_n',`
	cmpl	%edi, %edx

	je	L(inplace_reverse)
')

	C esi = size mod 4, limbs left over for the tail
	C ecx = size rounded down to a multiple of 4, limbs for the loop
	movl	%ecx, %esi

	andl	$-4, %ecx
	andl	$3, %esi

	C point past the limbs the loop will process and index them with a
	C negative counter running up towards zero
	leal	(%ebx,%ecx,4), %ebx
	leal	(%edx,%ecx,4), %edx
	leal	(%edi,%ecx,4), %edi

	negl	%ecx
	shrl	%eax		C initial carry flag

	ALIGN(32)
L(normal_top):
	C eax	scratch
	C ebx	src1
	C ecx	counter, limbs, negative
	C edx	src2
	C esi	size mod 4
	C edi	dst
	C ebp
	C
	C ecx is advanced by 5 early on and loop takes 1 back, for a net
	C step of 4 limbs per pass.  The -20 in the displacements below
	C cancels the 5 limbs (20 bytes) already added.

	movl	(%ebx,%ecx,4), %eax
	leal	5(%ecx), %ecx
	M4_inst	-20(%edx,%ecx,4), %eax
	movl	%eax, -20(%edi,%ecx,4)

	movl	4-20(%ebx,%ecx,4), %eax
	M4_inst	4-20(%edx,%ecx,4), %eax
	movl	%eax, 4-20(%edi,%ecx,4)

	movl	8-20(%ebx,%ecx,4), %eax
	M4_inst	8-20(%edx,%ecx,4), %eax
	movl	%eax, 8-20(%edi,%ecx,4)

	movl	12-20(%ebx,%ecx,4), %eax
	M4_inst	12-20(%edx,%ecx,4), %eax
	movl	%eax, 12-20(%edi,%ecx,4)

	loop	L(normal_top)


	C ecx is 0 here and esi holds 0 to 3 limbs still to do.
	C decl sets ZF and SF for the branches but keeps the carry flag.

	decl	%esi
	jz	L(normal_finish_one)
	js	L(normal_done)

	C two or three more limbs

	movl	(%ebx), %eax
	M4_inst	(%edx), %eax
	movl	%eax, (%edi)

	movl	4(%ebx), %eax
	M4_inst	4(%edx), %eax
	decl	%esi
	movl	%eax, 4(%edi)

	jz	L(normal_done)
	movl	$2, %ecx

L(normal_finish_one):
	C ecx	index of the last limb, 0 or 2

	movl	(%ebx,%ecx,4), %eax
	M4_inst	(%edx,%ecx,4), %eax
	movl	%eax, (%edi,%ecx,4)

L(normal_done):
	C also reached from the in-place tail, which has the same registers
	C on the stack

	popl	%esi
	popl	%edi

	movl	$0, %eax
	popl	%ebx

	setc	%al

	ret


C -----------------------------------------------------------------------------

ifdef(`OPERATION_add_n',`
L(inplace_reverse):
	C dst==src2

	movl	%ebx, %edx
')

L(inplace):
	C eax	initial carry
	C ebx
	C ecx	size
	C edx	src
	C esi
	C edi	dst
	C ebp
	C
	C dst is also the first operand here, so ebx is free to hold data.
	C The loop works one limb ahead: ebx always carries the next src
	C limb, already loaded.  Hence size-1 is what gets split into
	C   ecx = (size-1) rounded down to a multiple of 4, for the loop
	C   esi = (size-1) mod 4, for the tail

	leal	-1(%ecx), %esi
	decl	%ecx

	andl	$-4, %ecx
	andl	$3, %esi

	movl	(%edx), %ebx		C src low limb
	leal	(%edx,%ecx,4), %edx

	leal	(%edi,%ecx,4), %edi
	negl	%ecx

	shrl	%eax		C initial carry flag


	ALIGN(32)
L(inplace_top):
	C eax	scratch
	C ebx	next src limb
	C ecx	counter, limbs, negative
	C edx	src
	C esi	(size-1) mod 4
	C edi	dst
	C ebp
	C
	C Same counter scheme as the normal loop: 5 added by the leal, 1
	C removed by the loop, -20 in the later displacements to compensate.

	M4_inst	%ebx, (%edi,%ecx,4)

	movl	4(%edx,%ecx,4), %eax
	leal	5(%ecx), %ecx

	M4_inst	%eax, 4-20(%edi,%ecx,4)

	movl	8-20(%edx,%ecx,4), %eax
	movl	12-20(%edx,%ecx,4), %ebx

	M4_inst	%eax, 8-20(%edi,%ecx,4)
	M4_inst	%ebx, 12-20(%edi,%ecx,4)

	C preload the first limb of the next pass, or of the tail
	movl	16-20(%edx,%ecx,4), %ebx
	loop	L(inplace_top)


	C now %esi is 0 to 3 representing respectively 1 to 4 limbs more

	M4_inst	%ebx, (%edi)

	decl	%esi
	jz	L(inplace_finish_one)
	js	L(inplace_done)

	C two or three more limbs

	movl	4(%edx), %eax
	movl	8(%edx), %ebx
	M4_inst	%eax, 4(%edi)
	M4_inst	%ebx, 8(%edi)

	decl	%esi
	movl	$2, %ecx

	jz	L(normal_done)

L(inplace_finish_one):
	C ecx	0 or 2, selecting the limb at byte offset 4 or 12

	movl	4(%edx,%ecx,4), %eax
	M4_inst	%eax, 4(%edi,%ecx,4)

L(inplace_done):
	popl	%esi
	popl	%edi

	movl	$0, %eax
	popl	%ebx

	setc	%al

	ret

EPILOGUE()