beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium / mmx / mul_1.asm
blob4ced577b13ae013874f41e800bfa600e369b6881
1 dnl Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
3 dnl Copyright 2000-2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C cycles/limb
35 C P5: 12.0 for 32-bit multiplier
36 C 7.0 for 16-bit multiplier
39 C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
40 C mp_limb_t multiplier);
42 C When the multiplier is 16 bits some special case MMX code is used. Small
43 C multipliers might arise reasonably often from mpz_mul_ui etc. If the size
44 C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
45 C size==8 end up being quite close. If src isn't aligned to an 8 byte
46 C boundary then one limb is processed separately with roughly a 5 cycle
47 C penalty, so in that case it's say size==8 and size==9 which are close.
49 C Alternatives:
51 C MMX is not believed to be of any use for 32-bit multipliers, since for
52 C instance the current method would just have to be more or less duplicated
53 C for the high and low halves of the multiplier, and would probably
54 C therefore run at about 14 cycles, which is slower than the plain integer
55 C at 12.
57 C Adding the high and low MMX products using integer code seems best. An
58 C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
59 C any joy. Perhaps something could be done keeping the values signed and
60 C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
61 C perhaps not.
63 C Future:
65 C An mpn_mul_1c entrypoint would need a double carry out of the low result
66 C limb in the 16-bit code, unless it could be assumed the carry fits in 16
67 C bits, possibly as carry<multiplier, this being true of a big calculation
68 C done piece by piece. But let's worry about that if/when mul_1c is
69 C actually used.
71 defframe(PARAM_MULTIPLIER,16)
72 defframe(PARAM_SIZE, 12)
73 defframe(PARAM_SRC, 8)
74 defframe(PARAM_DST, 4)
76 TEXT
78 ALIGN(8)
79 PROLOGUE(mpn_mul_1)
80 deflit(`FRAME',0)
82 movl PARAM_SIZE, %ecx
83 movl PARAM_SRC, %edx
85 cmpl $1, %ecx
86 jne L(two_or_more)
88 C one limb only
90 movl PARAM_MULTIPLIER, %eax
91 movl PARAM_DST, %ecx
93 mull (%edx)
95 movl %eax, (%ecx)
96 movl %edx, %eax
98 ret
101 L(two_or_more):
102 C eax size
103 C ebx
104 C ecx carry
105 C edx
106 C esi src
107 C edi
108 C ebp
110 pushl %esi FRAME_pushl()
111 pushl %edi FRAME_pushl()
113 movl %edx, %esi C src
114 movl PARAM_DST, %edi
116 movl PARAM_MULTIPLIER, %eax
117 pushl %ebx FRAME_pushl()
119 leal (%esi,%ecx,4), %esi C src end
120 leal (%edi,%ecx,4), %edi C dst end
122 negl %ecx C -size
124 pushl %ebp FRAME_pushl()
125 cmpl $65536, %eax
127 jb L(small)
130 L(big):
131 xorl %ebx, %ebx C carry limb
132 sarl %ecx C -size/2
134 jnc L(top) C with carry flag clear
137 C size was odd, process one limb separately
139 mull 4(%esi,%ecx,8) C m * src[0]
141 movl %eax, 4(%edi,%ecx,8)
142 incl %ecx
144 orl %edx, %ebx C carry limb, and clear carry flag
147 L(top):
148 C eax
149 C ebx carry
150 C ecx counter, negative
151 C edx
152 C esi src end
153 C edi dst end
154 C ebp (scratch carry)
156 adcl $0, %ebx
157 movl (%esi,%ecx,8), %eax
159 mull PARAM_MULTIPLIER
161 movl %edx, %ebp
162 addl %eax, %ebx
164 adcl $0, %ebp
165 movl 4(%esi,%ecx,8), %eax
167 mull PARAM_MULTIPLIER
169 movl %ebx, (%edi,%ecx,8)
170 addl %ebp, %eax
172 movl %eax, 4(%edi,%ecx,8)
173 incl %ecx
175 movl %edx, %ebx
176 jnz L(top)
179 adcl $0, %ebx
180 popl %ebp
182 movl %ebx, %eax
183 popl %ebx
185 popl %edi
186 popl %esi
191 L(small):
192 C Special case for 16-bit multiplier.
194 C eax multiplier
195 C ebx
196 C ecx -size
197 C edx src
198 C esi src end
199 C edi dst end
200 C ebp multiplier
202 C size<3 not supported here. At size==3 we're already a couple of
203 C cycles faster, so there's no threshold as such, just use the MMX
204 C as soon as possible.
206 cmpl $-3, %ecx
207 ja L(big)
209 movd %eax, %mm7 C m
210 pxor %mm6, %mm6 C initial carry word
212 punpcklwd %mm7, %mm7 C m replicated 2 times
213 addl $2, %ecx C -size+2
215 punpckldq %mm7, %mm7 C m replicated 4 times
216 andl $4, %edx C test alignment, clear carry flag
218 movq %mm7, %mm0 C m
219 jz L(small_entry)
222 C Source is unaligned, process one limb separately.
224 C Plain integer code is used here, since it's smaller and is about
225 C the same 13 cycles as an mmx block would be.
227 C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
228 C the use of separate incl and orl.
230 mull -8(%esi,%ecx,4) C m * src[0]
232 movl %eax, -8(%edi,%ecx,4) C dst[0]
233 incl %ecx C one limb processed
235 movd %edx, %mm6 C initial carry
237 orl %eax, %eax C clear carry flag
238 jmp L(small_entry)
241 C The scheduling here is quite tricky, since so many instructions have
242 C pairing restrictions. In particular the js won't pair with a movd, and
243 C can't be paired with an adc since it wants flags from the inc, so
244 C instructions are rotated to the top of the loop to find somewhere useful
245 C for it.
247 C Trouble has been taken to avoid overlapping successive loop iterations,
248 C since that would greatly increase the size of the startup and finishup
249 C code. Actually there's probably not much advantage to be had from
250 C overlapping anyway, since the difficulties are mostly with pairing, not
251 C with latencies as such.
253 C In the comments x represents the src data and m the multiplier (16
254 C bits, but replicated 4 times).
256 C The m signs calculated in %mm3 are a loop invariant and could be held in
257 C say %mm5, but that would save only one instruction and hence be no faster.
259 L(small_top):
260 C eax l.low, then l.high
261 C ebx (h.low)
262 C ecx counter, -size+2 to 0 or 1
263 C edx (h.high)
264 C esi &src[size]
265 C edi &dst[size]
266 C ebp
268 C %mm0 (high products)
269 C %mm1 (low products)
270 C %mm2 (adjust for m using x signs)
271 C %mm3 (adjust for x using m signs)
272 C %mm4
273 C %mm5
274 C %mm6 h.low, then carry
275 C %mm7 m replicated 4 times
277 movd %mm6, %ebx C h.low
278 psrlq $32, %mm1 C l.high
280 movd %mm0, %edx C h.high
281 movq %mm0, %mm6 C new c
283 adcl %eax, %ebx
284 incl %ecx
286 movd %mm1, %eax C l.high
287 movq %mm7, %mm0
289 adcl %eax, %edx
290 movl %ebx, -16(%edi,%ecx,4)
292 movl %edx, -12(%edi,%ecx,4)
293 psrlq $32, %mm6 C c
295 L(small_entry):
296 pmulhw -8(%esi,%ecx,4), %mm0 C h = (x*m).high
297 movq %mm7, %mm1
299 pmullw -8(%esi,%ecx,4), %mm1 C l = (x*m).low
300 movq %mm7, %mm3
302 movq -8(%esi,%ecx,4), %mm2 C x
303 psraw $15, %mm3 C m signs
305 pand -8(%esi,%ecx,4), %mm3 C x selected by m signs
306 psraw $15, %mm2 C x signs
308 paddw %mm3, %mm0 C add x to h if m neg
309 pand %mm7, %mm2 C m selected by x signs
311 paddw %mm2, %mm0 C add m to h if x neg
312 incl %ecx
314 movd %mm1, %eax C l.low
315 punpcklwd %mm0, %mm6 C c + h.low << 16
317 psrlq $16, %mm0 C h.high
318 js L(small_top)
323 movd %mm6, %ebx C h.low
324 psrlq $32, %mm1 C l.high
326 adcl %eax, %ebx
327 popl %ebp FRAME_popl()
329 movd %mm0, %edx C h.high
330 psrlq $32, %mm0 C l.high
332 movd %mm1, %eax C l.high
334 adcl %eax, %edx
335 movl %ebx, -12(%edi,%ecx,4)
337 movd %mm0, %eax C c
339 adcl $0, %eax
340 movl %edx, -8(%edi,%ecx,4)
342 orl %ecx, %ecx
343 jnz L(small_done) C final %ecx==1 means even, ==0 odd
346 C Size odd, one extra limb to process.
347 C Plain integer code is used here, since it's smaller and is about
348 C the same speed as another mmx block would be.
350 movl %eax, %ecx
351 movl PARAM_MULTIPLIER, %eax
353 mull -4(%esi)
355 addl %ecx, %eax
357 adcl $0, %edx
358 movl %eax, -4(%edi)
360 movl %edx, %eax
361 L(small_done):
362 popl %ebx
364 popl %edi
365 popl %esi
367 emms
371 EPILOGUE()