beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k6 / mul_1.asm
blob3ef7ec24fe70cf4f2a574041ecdf15c9cb74de79
1 dnl AMD K6 mpn_mul_1 -- mpn by limb multiply.
3 dnl Copyright 1999, 2000, 2002, 2005 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C cycles/limb
35 C P5
36 C P6 model 0-8,10-12 5.5
37 C P6 model 9 (Banias)
38 C P6 model 13 (Dothan) 4.87
39 C P4 model 0 (Willamette)
40 C P4 model 1 (?)
41 C P4 model 2 (Northwood)
42 C P4 model 3 (Prescott)
43 C P4 model 4 (Nocona)
44 C AMD K6 6.25
45 C AMD K7
46 C AMD K8
49 C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
50 C mp_limb_t multiplier);
51 C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
52 C mp_limb_t multiplier, mp_limb_t carry);
54 C Multiply src,size by mult and store the result in dst,size.
55 C Return the carry limb from the top of the result.
57 C mpn_mul_1c() accepts an initial carry for the calculation; it is added into
58 C the low limb of the result.
60 defframe(PARAM_CARRY, 20)   C fifth argument (carry); only mpn_mul_1c reads it in this file.  Offsets step by 4 in prototype order; presumably bytes from the entry stack pointer (defframe is defined outside this file)
61 defframe(PARAM_MULTIPLIER,16)   C fourth argument (multiplier)
62 defframe(PARAM_SIZE, 12)   C third argument (size, the limb count)
63 defframe(PARAM_SRC, 8)   C second argument (src)
64 defframe(PARAM_DST, 4)   C first argument (dst)
66 dnl minimum 5 because the unrolled code can't handle less
67 deflit(UNROLL_THRESHOLD, 5)   C compared against size after L(start_nc): below this runs L(simple), otherwise L(unroll)
69 TEXT
70 ALIGN(32)   C the "code offset" remarks further down are relative to this alignment point
72 PROLOGUE(mpn_mul_1c)   C entry point taking a carry-in: loads it into esi and joins mpn_mul_1 at L(start_nc)
73 pushl %esi   C save esi; the exit paths reached from L(start_nc) pop it again
74 deflit(`FRAME',4)   C 4 bytes pushed so far; presumably what keeps the PARAM_ offsets valid (macro defined outside this file)
75 movl PARAM_CARRY, %esi   C esi = caller-supplied initial carry (mpn_mul_1 zeroes esi at the same point)
76 jmp L(start_nc)   C continue in the body shared with mpn_mul_1
77 EPILOGUE()
80 PROLOGUE(mpn_mul_1)   C entry point without carry-in; everything from L(start_nc) on is shared with mpn_mul_1c
81 push %esi   C save esi, which carries the running high limb below
82 deflit(`FRAME',4)   C 4 bytes pushed so far; presumably what keeps the PARAM_ offsets valid (macro defined outside this file)
83 xorl %esi, %esi C initial carry
85 L(start_nc):   C mpn_mul_1c jumps here with its carry-in already in esi
86 mov PARAM_SIZE, %ecx   C ecx = size
87 push %ebx
88 FRAME_pushl()
90 movl PARAM_SRC, %ebx   C ebx = src
91 push %edi
92 FRAME_pushl()
94 movl PARAM_DST, %edi   C edi = dst
95 pushl %ebp
96 FRAME_pushl()
98 cmpl $UNROLL_THRESHOLD, %ecx
99 movl PARAM_MULTIPLIER, %ebp   C mov leaves the flags from the cmpl untouched
101 jae L(unroll)   C unsigned size >= UNROLL_THRESHOLD takes the unrolled code
104 C code offset 0x22 here, close enough to aligned
105 L(simple):
106 C eax scratch
107 C ebx src
108 C ecx counter
109 C edx scratch
110 C esi carry
111 C edi dst
112 C ebp multiplier
114 C this loop 8 cycles/limb
116 movl (%ebx), %eax   C eax = next source limb
117 addl $4, %ebx
119 mull %ebp   C edx:eax = limb * multiplier
121 addl %esi, %eax   C add carry-in to the low half, CF = overflow
122 movl $0, %esi   C zero with mov rather than xor so CF survives
124 adcl %edx, %esi   C new carry = high half + CF
126 movl %eax, (%edi)   C store the result limb
127 addl $4, %edi
129 loop L(simple)   C decrement ecx and repeat while non-zero; the size reaching here is assumed to be at least 1
132 popl %ebp
134 popl %edi
135 popl %ebx
137 movl %esi, %eax   C return value = final carry limb
138 popl %esi
140 ret   C without this the simple path falls through into L(unroll) with the stack already unwound
143 C -----------------------------------------------------------------------------
144 C The code for each limb is 6 cycles, with instruction decoding being the
145 C limiting factor. At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
146 C cycles/limb in total.
148 C The secret ingredient to get 6.25 is to start the loop with the mul and
149 C have the load/store pair at the end. Rotating the load/store to the top
150 C is a 0.5 c/l slowdown. (Some address generation effect probably.)
152 C The whole unrolled loop fits nicely in exactly 80 bytes.
155 ALIGN(16) C already aligned to 16 here actually
156 L(unroll):
157 movl (%ebx), %eax   C preload src[0]; every later step loads the limb for the step after it
158 leal -16(%ebx,%ecx,4), %ebx   C ebx = &src[size-4]
160 leal -16(%edi,%ecx,4), %edi   C edi = &dst[size-4]
161 subl $4, %ecx
163 negl %ecx   C ecx = 4-size, negative here, so (base,ecx,4) starts at limb 0
166 ALIGN(16) C one byte nop for this alignment
167 L(top):
168 C eax scratch
169 C ebx &src[size-4]
170 C ecx counter
171 C edx scratch
172 C esi carry
173 C edi &dst[size-4]
174 C ebp multiplier
176 mull %ebp   C limb 0 of this group of four
178 addl %esi, %eax
179 movl $0, %esi   C mov keeps CF for the adcl
181 adcl %edx, %esi
183 movl %eax, (%edi,%ecx,4)
184 movl 4(%ebx,%ecx,4), %eax
187 mull %ebp   C limb 1
189 addl %esi, %eax
190 movl $0, %esi
192 adcl %edx, %esi
194 movl %eax, 4(%edi,%ecx,4)
195 movl 8(%ebx,%ecx,4), %eax
198 mull %ebp   C limb 2
200 addl %esi, %eax
201 movl $0, %esi
203 adcl %edx, %esi
205 movl %eax, 8(%edi,%ecx,4)
206 movl 12(%ebx,%ecx,4), %eax
209 mull %ebp   C limb 3
211 addl %esi, %eax
212 movl $0, %esi
214 adcl %edx, %esi
216 movl %eax, 12(%edi,%ecx,4)
217 movl 16(%ebx,%ecx,4), %eax   C preload the first limb of the next group, or of the tail
220 addl $4, %ecx
221 js L(top)   C loop while the counter is still negative
225 C eax next src limb
226 C ebx &src[size-4]
227 C ecx 0 to 3 representing respectively 4 to 1 further limbs
228 C edx
229 C esi carry
230 C edi &dst[size-4]
232 testb $2, %cl
233 jnz L(finish_not_two)   C ecx 2 or 3: only 2 or 1 limbs left, skip the pair
235 mull %ebp   C ecx 0 or 1: do two limbs, indexed through ecx
237 addl %esi, %eax
238 movl $0, %esi
240 adcl %edx, %esi
242 movl %eax, (%edi,%ecx,4)
243 movl 4(%ebx,%ecx,4), %eax
246 mull %ebp
248 addl %esi, %eax
249 movl $0, %esi
251 adcl %edx, %esi
253 movl %eax, 4(%edi,%ecx,4)
254 movl 8(%ebx,%ecx,4), %eax
256 addl $2, %ecx   C ecx is now 2 or 3
257 L(finish_not_two):
260 testb $1, %cl
261 jnz L(finish_not_one)   C ecx 3: a single limb left
263 mull %ebp   C ecx 2: limb size-2, addressed with fixed offsets
265 addl %esi, %eax
266 movl $0, %esi
268 adcl %edx, %esi
270 movl %eax, 8(%edi)   C dst[size-2]
271 movl 12(%ebx), %eax   C src[size-1]
272 L(finish_not_one):
275 mull %ebp   C last limb; its high half becomes the return value
277 addl %esi, %eax
278 popl %ebp   C multiplier no longer needed; pop does not disturb CF
280 adcl $0, %edx   C fold the carry from the addl into the high half
282 movl %eax, 12(%edi)   C dst[size-1]
283 popl %edi
285 popl %ebx
286 movl %edx, %eax   C return value = final carry limb
288 popl %esi
290 ret   C without this, execution runs off the end of the function
292 EPILOGUE()