beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / p6 / aorsmul_1.asm
blobbc8c49c62e886a21bbe1f2a0f8157c94c038cb6e
1 dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
3 dnl Copyright 1999-2002, 2005 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C cycles/limb
35 C P5
36 C P6 model 0-8,10-12 6.44
37 C P6 model 9 (Banias) 6.15
38 C P6 model 13 (Dothan) 6.11
39 C P4 model 0 (Willamette)
40 C P4 model 1 (?)
41 C P4 model 2 (Northwood)
42 C P4 model 3 (Prescott)
43 C P4 model 4 (Nocona)
44 C AMD K6
45 C AMD K7
46 C AMD K8
49 dnl P6 UNROLL_COUNT cycles/limb
50 dnl 8 6.7
51 dnl 16 6.35
52 dnl 32 6.3
53 dnl 64 6.3
54 dnl Maximum possible with the current code is 64.
56 deflit(UNROLL_COUNT, 16) C limbs handled per pass of the unrolled loop, the 6.35 row above. UNROLL_LOG2, UNROLL_MASK and UNROLL_BYTES used later are not defined in this view, presumably derived from this value elsewhere -- confirm
59 ifdef(`OPERATION_addmul_1', `
60 define(M4_inst, addl) C instruction applied to the dst limbs in both loops
61 define(M4_function_1, mpn_addmul_1) C name passed to PROLOGUE for the plain entry
62 define(M4_function_1c, mpn_addmul_1c) C name passed to PROLOGUE for the carry-in entry
63 define(M4_description, add it to) C wording referenced by the interface comment below
64 define(M4_desc_retval, carry) C wording referenced by the return value comment below
65 ',`ifdef(`OPERATION_submul_1', `
66 define(M4_inst, subl) C instruction applied to the dst limbs in both loops
67 define(M4_function_1, mpn_submul_1) C name passed to PROLOGUE for the plain entry
68 define(M4_function_1c, mpn_submul_1c) C name passed to PROLOGUE for the carry-in entry
69 define(M4_description, subtract it from) C wording referenced by the interface comment below
70 define(M4_desc_retval, borrow) C wording referenced by the return value comment below
71 ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
72 ')')')
74 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) C the four names M4_function_1 and M4_function_1c can take across the two OPERATION settings
77 C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
78 C mp_limb_t mult);
79 C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
80 C mp_limb_t mult, mp_limb_t carry);
82 C Calculate src,size multiplied by mult and M4_description dst,size.
83 C Return the M4_desc_retval limb from the top of the result.
85 C This code is pretty much the same as the K6 code. The unrolled loop is
86 C the same, but there are just a few scheduling tweaks in the setups and the
87 C simple loop.
89 C A number of variations have been tried for the unrolled loop, with one or
90 C two carries, and with loads scheduled earlier, but nothing faster than 6
91 C cycles/limb has been found.
93 ifdef(`PIC',`
94 deflit(UNROLL_THRESHOLD, 5) C smallest size that is sent to the unrolled code
95 ',`
96 deflit(UNROLL_THRESHOLD, 5) C same value as the PIC branch. NOTE(review): the closing of this ifdef is not visible, numbering jumps from 96 to 99 -- confirm against the original file
99 defframe(PARAM_CARRY, 20) C fifth argument, read only by the M4_function_1c entry
100 defframe(PARAM_MULTIPLIER,16) C fourth argument, mult
101 defframe(PARAM_SIZE, 12) C third argument, size
102 defframe(PARAM_SRC, 8) C second argument, src
103 defframe(PARAM_DST, 4) C first argument, dst
105 TEXT
106 ALIGN(32)
108 PROLOGUE(M4_function_1c) C entry point taking an explicit initial carry as fifth argument
109 pushl %ebx C save ebx, it holds the running carry from here on
110 deflit(`FRAME',4) C 4 bytes pushed so far
111 movl PARAM_CARRY, %ebx C ebx = caller supplied initial carry
112 jmp L(start_nc) C join the common setup inside M4_function_1
113 EPILOGUE()
115 PROLOGUE(M4_function_1) C plain entry point, no carry argument
116 push %ebx C save ebx, restored by the popl at the end of each path
117 deflit(`FRAME',4) C 4 bytes pushed so far
118 xorl %ebx, %ebx C initial carry is zero for this entry
120 L(start_nc): C common setup, M4_function_1c jumps here with ebx = its carry argument
121 movl PARAM_SIZE, %ecx C ecx = size
122 pushl %esi
123 deflit(`FRAME',8)
125 movl PARAM_SRC, %esi C esi = src
126 pushl %edi
127 deflit(`FRAME',12)
129 movl PARAM_DST, %edi C edi = dst
130 pushl %ebp
131 deflit(`FRAME',16)
132 cmpl $UNROLL_THRESHOLD, %ecx C flags are consumed by the jae two instructions later
134 movl PARAM_MULTIPLIER, %ebp C ebp = mult, movl leaves the cmpl flags intact
135 jae L(unroll) C unsigned size >= UNROLL_THRESHOLD takes the unrolled code
138 C simple loop
139 C this is offset 0x22, so close enough to aligned
140 L(simple):
141 C eax scratch
142 C ebx carry
143 C ecx counter
144 C edx scratch
145 C esi src
146 C edi dst
147 C ebp multiplier
149 movl (%esi), %eax C eax = current src limb
150 addl $4, %edi C advance dst early, the store below uses -4(%edi)
152 mull %ebp C edx:eax = src limb * multiplier
154 addl %ebx, %eax C add incoming carry to the low half
155 adcl $0, %edx C propagate into the high half
157 M4_inst %eax, -4(%edi) C apply low half to the dst limb, sets carry or borrow flag
158 movl %edx, %ebx C next carry = high half, movl keeps the flag from M4_inst
160 adcl $0, %ebx C fold that flag into the carry
161 decl %ecx C sets ZF for the jnz below
163 leal 4(%esi), %esi C advance src without touching flags
164 jnz L(simple) C repeat until ecx reaches zero
167 popl %ebp
168 popl %edi
170 popl %esi
171 movl %ebx, %eax C return value = final carry limb
173 popl %ebx C NOTE(review): no ret is visible after this, numbering jumps from 173 to 178 -- a line was probably lost in extraction, confirm against the original file
178 C------------------------------------------------------------------------------
179 C VAR_JUMP holds the computed jump temporarily because there's not enough
180 C registers when doing the mul for the initial two carry limbs.
182 C The add/adc for the initial carry in %ebx is necessary only for the
183 C mpn_add/submul_1c entry points. Duplicating the startup code to
184 C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
185 C idea.
187 dnl overlapping with parameters already fetched
188 define(VAR_COUNTER,`PARAM_SIZE') C reuses the size argument slot, size was already loaded into ecx at L(start_nc)
189 define(VAR_JUMP, `PARAM_DST') C reuses the dst argument slot, dst was already loaded into edi at L(start_nc)
191 C this is offset 0x43, so close enough to aligned
192 L(unroll):
193 C eax
194 C ebx initial carry
195 C ecx size
196 C edx
197 C esi src
198 C edi dst
199 C ebp multiplier
201 movl %ecx, %edx
202 decl %ecx
204 subl $2, %edx C edx = size-2
205 negl %ecx C ecx = -(size-1)
207 shrl $UNROLL_LOG2, %edx C edx = (size-2) >> UNROLL_LOG2
208 andl $UNROLL_MASK, %ecx C ecx = (-(size-1)) & UNROLL_MASK, called k in the notes below
210 movl %edx, VAR_COUNTER C counter later consumed by decl and jns at the bottom of the unrolled loop
211 movl %ecx, %edx C edx = k, scaled below into a code offset
213 C 15 code bytes per limb
214 ifdef(`PIC',`
215 call L(pic_calc) C comes back with edx = jump target and ecx = -k
216 L(here): C NOTE(review): the separator between the PIC and non-PIC branches is not visible, numbering jumps from 216 to 218 -- confirm against the original file
218 shll $4, %edx C edx = 16*k
219 negl %ecx C ecx = -k
221 leal L(entry) (%edx,%ecx,1), %edx C edx = L(entry) + 16*k - k = L(entry) + 15*k
223 movl (%esi), %eax C src low limb
225 movl %edx, VAR_JUMP C park the jump target, edx is overwritten by the mull below
226 leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi C esi = src + 4 - 4*k, plus 128 when UNROLL_BYTES is 256
228 mull %ebp C edx:eax = low limb * multiplier
230 addl %ebx, %eax C initial carry (from _1c)
231 adcl $0, %edx
233 movl %edx, %ebx C high carry
234 leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi C edi = dst - 4*k, plus 128 when UNROLL_BYTES is 256
236 movl VAR_JUMP, %edx C reload the jump target
237 testl $1, %ecx C ZF reflects the low bit of ecx, the movl below leaves flags alone
238 movl %eax, %ecx C low carry
240 cmovnz( %ebx, %ecx) C high,low carry other way around
241 cmovnz( %eax, %ebx) C completes the swap: ebx = low carry when the low bit was set
243 jmp *%edx C enter the unrolled code at the computed address
246 ifdef(`PIC',`
247 L(pic_calc):
248 shll $4, %edx C edx = 16*k
249 negl %ecx C ecx = -k
251 C See mpn/x86/README about old gas bugs
252 leal (%edx,%ecx,1), %edx C edx = 16*k - k = 15*k
253 addl $L(entry)-L(here), %edx C add the distance from L(here) to L(entry)
255 addl (%esp), %edx C top of stack is the return address of the call, which is L(here)
257 ret_internal C NOTE(review): the closing of this ifdef is not visible, numbering jumps from 257 to 261 -- confirm against the original file
261 C -----------------------------------------------------------
262 ALIGN(32)
263 L(top):
264 deflit(`FRAME',16)
265 C eax scratch
266 C ebx carry hi
267 C ecx carry lo
268 C edx scratch
269 C esi src
270 C edi dst
271 C ebp multiplier
273 C VAR_COUNTER loop counter
275 C 15 code bytes per limb
277 addl $UNROLL_BYTES, %edi C advance dst by one full pass, src is advanced by the leal at the bottom
279 L(entry):
280 deflit(CHUNK_COUNT,2) C limbs emitted per forloop iteration
281 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
282 deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
283 deflit(`disp1', eval(disp0 + 4))
285 Zdisp( movl, disp0,(%esi), %eax) C load the src limb at disp0
286 mull %ebp C edx:eax = src limb * multiplier
287 Zdisp( M4_inst,%ecx, disp0,(%edi)) C apply carry lo to the dst limb at disp0, sets carry or borrow flag
288 adcl %eax, %ebx C carry hi += low product half + that flag
289 movl %edx, %ecx C ecx takes the high product half, movl keeps the flag from adcl
290 adcl $0, %ecx C fold the carry out of the previous adcl into ecx
292 movl disp1(%esi), %eax C second limb of the pair, ebx and ecx swap roles
293 mull %ebp
294 M4_inst %ebx, disp1(%edi)
295 adcl %eax, %ecx
296 movl %edx, %ebx
297 adcl $0, %ebx
300 decl VAR_COUNTER C sets SF for the jns below. NOTE(review): the line closing the forloop body is not visible, numbering jumps from 297 to 300 -- confirm against the original file
301 leal UNROLL_BYTES(%esi), %esi C advance src by one full pass without touching flags
303 jns L(top) C another pass while the counter is still non-negative
306 deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
308 M4_inst %ecx, disp0(%edi) C apply the remaining carry lo to the last dst limb
309 movl %ebx, %eax C return value starts as carry hi
311 popl %ebp
312 popl %edi
314 popl %esi
315 popl %ebx
316 adcl $0, %eax C add the flag left by the final M4_inst, the movl and popl instructions in between do not change it. NOTE(review): no ret is visible before EPILOGUE, numbering jumps from 316 to 320 -- confirm against the original file
320 EPILOGUE()