beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k7 / aors_n.asm
blob1a08072029f0be551318ded718a7d53e2c482458
1 dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
3 dnl Copyright 1999-2003 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C K7: 1.64 cycles/limb (at 16 limbs/loop).
38 dnl K7: UNROLL_COUNT cycles/limb
39 dnl 8 1.9
40 dnl 16 1.64
41 dnl 32 1.7
42 dnl 64 2.0
43 dnl Maximum possible with the current code is 64.
dnl  Limbs handled per pass of the unrolled loop: the forloop at L(entry)
dnl  emits UNROLL_COUNT/CHUNK_COUNT chunks of CHUNK_COUNT limbs each.
dnl  16 is the fastest setting in the table above.
45 deflit(UNROLL_COUNT, 16)
dnl  Select the operation this build of the file implements.  Exactly one of
dnl  OPERATION_add_n or OPERATION_sub_n is expected to be defined by the
dnl  build.  The choice fixes:
dnl    M4_inst         the carry-propagating instruction, adcl or sbbl
dnl    M4_function_n   the entry point without a carry argument
dnl    M4_function_nc  the entry point taking an initial carry
dnl    M4_description  the word used in the comments below
dnl  If neither symbol is defined, m4_error is invoked with the message given.
48 ifdef(`OPERATION_add_n', `
49 define(M4_inst, adcl)
50 define(M4_function_n, mpn_add_n)
51 define(M4_function_nc, mpn_add_nc)
52 define(M4_description, add)
53 ',`ifdef(`OPERATION_sub_n', `
54 define(M4_inst, sbbl)
55 define(M4_function_n, mpn_sub_n)
56 define(M4_function_nc, mpn_sub_nc)
57 define(M4_description, subtract)
58 ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
59 ')')')
dnl  Lists all four functions this one source file can produce.
dnl  NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably it
dnl  is what the build scans to learn which functions live here -- confirm.
61 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
64 C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
65 C mp_size_t size);
66 C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
67 C mp_size_t size, mp_limb_t carry);
69 C Calculate src1,size M4_description src2,size, and store the result in
70 C dst,size. The return value is the carry bit from the top of the result (1
71 C or 0).
73 C The _nc version accepts 1 or 0 for an initial carry into the low limb of
74 C the calculation. Note values other than 1 or 0 here will lead to garbage
75 C results.
77 C This code runs at 1.64 cycles/limb, which might be the best possible with
78 C plain integer operations. Each limb is 2 loads and 1 store, any 2 of
79 C which can be done each cycle, leading to 1.5 c/l.
81 dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
dnl  Sizes below UNROLL_THRESHOLD take the simple loop; sizes at or above it
dnl  take the unrolled loop (see the cmpl/jae after L(start)).  PIC and
dnl  non-PIC builds each get their own value, presumably so they can be tuned
dnl  separately; both are currently 8.
82 ifdef(`PIC',`
83 deflit(UNROLL_THRESHOLD, 8)
84 ',`
85 deflit(UNROLL_THRESHOLD, 8)
86 ')
C Stack frame layout.
C
C PARAM_* name the five stack arguments, at offsets 4 to 20.  SAVE_* name
C four register save slots at offsets -4 to -16, which together fill the
C STACK_SPACE bytes that the function body subtracts from esp.
C
C NOTE(review): defframe is defined outside this file.  The offsets look
C like they are relative to esp at function entry, adjusted by the current
C FRAME value -- confirm in x86-defs.m4.
88 defframe(PARAM_CARRY,20)
89 defframe(PARAM_SIZE, 16)
90 defframe(PARAM_SRC2, 12)
91 defframe(PARAM_SRC1, 8)
92 defframe(PARAM_DST, 4)
94 defframe(SAVE_EBP, -4)
95 defframe(SAVE_ESI, -8)
96 defframe(SAVE_EBX, -12)
97 defframe(SAVE_EDI, -16)
98 deflit(STACK_SPACE, 16)
C Both entry points are emitted after TEXT and ALIGN(32).  FRAME is reset
C to 0 here, before anything has been subtracted from esp.
100 TEXT
101 ALIGN(32)
102 deflit(`FRAME',0)
C M4_function_nc: entry point that takes an initial carry.
C
C Loads the carry argument into eax and jumps to L(start), the shared body
C inside M4_function_n, which expects the initial carry in eax.  The carry
C must be 0 or 1; other values lead to garbage results.
104 PROLOGUE(M4_function_nc)
105 movl PARAM_CARRY, %eax  C eax = initial carry
106 jmp L(start)  C shared body in M4_function_n
107 EPILOGUE()
C M4_function_n: dst = src1 M4_description src2 over size limbs, low limb
C first, with the carry or borrow propagated through the CPU carry flag.
C
C Arguments are read from the stack through PARAM_DST, PARAM_SRC1,
C PARAM_SRC2 and PARAM_SIZE.  The result is the final carry, 0 or 1, in eax.
C M4_function_nc jumps to L(start) with its initial carry already in eax.
C
C ebx and edi, plus esi and ebp in the unrolled path, are saved in the
C STACK_SPACE bytes reserved below the entry esp and restored before
C returning.  The unrolled path also overwrites the PARAM_SIZE stack slot
C with the low bit of size.
C
C The carry flag is live from the shrl that loads it until the final setc.
C Apart from M4_inst itself, everything in between is limited to
C instructions that leave CF alone: movl, leal, incl, decl and the jumps.
C
C NOTE(review): size is presumably always >= 1.  With size 0 the simple
C loop would start at index 0 and keep counting up until ecx wraps.
C
C NOTE(review): UNROLL_LOG2, UNROLL_MASK and UNROLL_BYTES are not defined in
C this file.  Presumably they are log2 of UNROLL_COUNT, UNROLL_COUNT-1 and
C UNROLL_COUNT*4, derived elsewhere -- confirm in x86-defs.m4.  The comments
C in the unrolled path assume that.
109 PROLOGUE(M4_function_n)
111 xorl %eax, %eax  C carry in = 0
112 L(start):
113 movl PARAM_SIZE, %ecx  C ecx = size in limbs
114 subl $STACK_SPACE, %esp  C room for the register save slots
115 deflit(`FRAME',STACK_SPACE)
117 movl %edi, SAVE_EDI
118 movl %ebx, SAVE_EBX
119 cmpl $UNROLL_THRESHOLD, %ecx  C flags are consumed by the jae below
121 movl PARAM_SRC2, %edx  C movl leaves the cmpl flags intact
122 movl PARAM_SRC1, %ebx
123 jae L(unroll)  C size >= UNROLL_THRESHOLD, unsigned
C Small sizes.  Point ebx, edx and edi just past the end of each operand
C and run ecx from -size up to 0, so incl/jnz is the only loop control.
125 movl PARAM_DST, %edi
126 leal (%ebx,%ecx,4), %ebx
127 leal (%edx,%ecx,4), %edx
129 leal (%edi,%ecx,4), %edi
130 negl %ecx
131 shrl %eax  C carry in: bit 0 of eax moves to CF
133 C This loop is in a single 16 byte code block already, so no
134 C alignment necessary.
135 L(simple):
136 C eax scratch
137 C ebx src1
138 C ecx counter
139 C edx src2
140 C esi
141 C edi dst
142 C ebp
144 movl (%ebx,%ecx,4), %eax
145 M4_inst (%edx,%ecx,4), %eax
146 movl %eax, (%edi,%ecx,4)
147 incl %ecx  C sets ZF for the jnz, leaves CF alone
148 jnz L(simple)
150 movl $0, %eax  C not xorl: CF must survive until setc
151 movl SAVE_EDI, %edi
153 movl SAVE_EBX, %ebx
154 setc %al  C return value = carry out
155 addl $STACK_SPACE, %esp
C Return here.  Without this the small-size path would run on into
C L(unroll) with the stack already released.
157 ret
160 C -----------------------------------------------------------------------------
C Unrolled path, size >= UNROLL_THRESHOLD.
C
C The low bit of size is split off and handled after the loop as a single
C odd limb.  The remaining even count is processed UNROLL_COUNT limbs per
C pass.  The first pass is usually partial: it is entered part way through
C the unrolled code by a computed jump, with the pointers moved back by the
C number of limbs skipped so that the first chunk executed addresses limb 0.
161 C This is at 0x55, close enough to aligned.
162 L(unroll):
163 deflit(`FRAME',STACK_SPACE)
164 movl %ebp, SAVE_EBP
165 andl $-2, %ecx C size low bit masked out
166 andl $1, PARAM_SIZE C size low bit kept
168 movl %ecx, %edi  C edi = even limb count
169 decl %ecx
170 movl PARAM_DST, %ebp
172 shrl $UNROLL_LOG2, %ecx  C ecx = passes through the loop, less one
173 negl %edi
174 movl %esi, SAVE_ESI
176 andl $UNROLL_MASK, %edi  C edi = limbs to skip in the first pass
C esi = L(entry) + 9*edi, the address at which to enter the unrolled code.
C The PIC build cannot use the absolute address of L(entry) and instead
C computes it in L(pic_calc) from the return address of the call.
178 ifdef(`PIC',`
179 call L(pic_calc)
180 L(here):
181 ',`
182 leal L(entry) (%edi,%edi,8), %esi C 9 code bytes per limb
183 ')
184 negl %edi
185 shrl %eax  C carry in: bit 0 of eax moves to CF
C Move each pointer back by the skipped limbs.  When UNROLL_BYTES is 256
C the pointers are also biased by +128, matched by a -128 on every
C displacement in the unrolled code.
C NOTE(review): the bias presumably keeps all displacements within a
C signed byte so that each limb stays at 9 code bytes -- confirm.
187 leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
188 leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
189 leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
191 jmp *%esi
194 ifdef(`PIC',`
195 L(pic_calc):
196 C See mpn/x86/README about old gas bugs
C The word at the top of the stack is the return address, L(here).
C Adding the link-time constant L(entry)-L(here) gives the run-time
C address of L(entry).
197 leal (%edi,%edi,8), %esi
198 addl $L(entry)-L(here), %esi
199 addl (%esp), %esi
200 ret_internal
201 ')
204 C -----------------------------------------------------------------------------
205 ALIGN(32)
206 L(top):
207 C eax zero
208 C ebx src1
209 C ecx counter
210 C edx src2
211 C esi scratch (was computed jump)
212 C edi dst
213 C ebp scratch
C src2 is advanced here at the top, src1 and dst at the bottom.  The
C computed jump enters at L(entry) and so skips this leal on the first
C pass, which leaves edx one pass behind ebx and edi once the loop exits.
215 leal UNROLL_BYTES(%edx), %edx
217 L(entry):
C Each chunk handles CHUNK_COUNT limbs: load from src1, apply M4_inst with
C src2, store to dst, using esi for the first limb and ebp for the second.
C NOTE(review): Zdisp is defined outside this file; presumably it forces a
C displacement byte even when disp0 is 0, so that every limb is the same
C 9 bytes the computed jump relies on -- confirm in x86-defs.m4.
218 deflit(CHUNK_COUNT, 2)
219 forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
220 deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
221 deflit(`disp1', eval(disp0 + 4))
223 Zdisp( movl, disp0,(%ebx), %esi)
224 movl disp1(%ebx), %ebp
225 Zdisp( M4_inst,disp0,(%edx), %esi)
226 Zdisp( movl, %esi, disp0,(%edi))
227 M4_inst disp1(%edx), %ebp
228 movl %ebp, disp1(%edi)
229 ')
231 decl %ecx  C sets SF for the jns, leaves CF alone
232 leal UNROLL_BYTES(%ebx), %ebx
233 leal UNROLL_BYTES(%edi), %edi
234 jns L(top)
C Loop finished.  Handle the odd limb, if any, then return the carry.
237 movl PARAM_SIZE, %esi  C 1 if size was odd, else 0
238 movl SAVE_EBP, %ebp
239 movl $0, %eax  C not xorl: CF still holds the carry
241 decl %esi  C SF set if size was even, CF untouched
242 js L(even)
C NOTE(review): when UNROLL_BYTES is 256 the pointers carry a +128 bias
C that the three operands below do not subtract.  This cannot matter with
C the current UNROLL_COUNT of 16, but check it before raising the count
C to 64.
244 movl (%ebx), %ecx
245 M4_inst UNROLL_BYTES(%edx), %ecx  C edx is one pass behind, see L(top)
246 movl %ecx, (%edi)
247 L(even):
249 movl SAVE_EDI, %edi
250 movl SAVE_EBX, %ebx
251 setc %al  C return value = carry out
253 movl SAVE_ESI, %esi
254 addl $STACK_SPACE, %esp
256 ret
258 EPILOGUE()