1 dnl Intel Pentium MMX mpn_mul_1
-- mpn by limb multiplication.
3 dnl Copyright
2000-2002 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
31 include(`..
/config.m4
')
35 C P5: 12.0 for 32-bit multiplier
36 C 7.0 for 16-bit multiplier
39 C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
40 C mp_limb_t multiplier);
42 C When the multiplier is 16 bits some special case MMX code is used. Small
43 C multipliers might arise reasonably often from mpz_mul_ui etc. If the size
44 C is odd there's roughly a
5 cycle penalty
, so times for say
size==7 and
45 C
size==8 end up being quite close. If src isn
't aligned to an 8 byte
46 C boundary then one limb is processed separately with roughly a 5 cycle
47 C penalty, so in that case it's say
size==8 and size==9 which are close.
51 C MMX is
not believed to be of any use for
32-bit multipliers
, since for
52 C instance the current method would just have to be more
or less duplicated
53 C for the
high and low halves of the multiplier
, and would probably
54 C therefore run at about
14 cycles
, which is slower than the plain integer
57 C Adding the
high and low MMX products using integer code seems best. An
58 C attempt at using paddd
and carry bit propagation with pcmpgtd didn
't give
59 C any joy. Perhaps something could be done keeping the values signed and
60 C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
65 C An mpn_mul_1c entrypoint would need a double carry out of the low result
66 C limb in the 16-bit code, unless it could be assumed the carry fits in 16
67 C bits, possibly as carry<multiplier, this being true of a big calculation
68 C done piece by piece. But let's worry about that if
/when mul_1c is
71 defframe
(PARAM_MULTIPLIER
,16)
72 defframe
(PARAM_SIZE
, 12)
73 defframe
(PARAM_SRC
, 8)
74 defframe
(PARAM_DST
, 4)
90 movl PARAM_MULTIPLIER, %eax
110 pushl %esi FRAME_pushl()
111 pushl %edi FRAME_pushl()
113 movl %edx, %esi C src
116 movl PARAM_MULTIPLIER, %eax
117 pushl %ebx FRAME_pushl()
119 leal (%esi,%ecx,4), %esi C src end
120 leal (%edi,%ecx,4), %edi C dst end
124 pushl %ebp FRAME_pushl()
131 xorl %ebx, %ebx C carry limb
134 jnc L(top) C with carry flag clear
137 C size was odd, process one limb separately
139 mull 4(%esi,%ecx,8) C m * src[0]
141 movl %eax, 4(%edi,%ecx,8)
144 orl %edx, %ebx C carry limb, and clear carry flag
150 C ecx counter, negative
154 C ebp (scratch carry)
157 movl (%esi,%ecx,8), %eax
159 mull PARAM_MULTIPLIER
165 movl 4(%esi,%ecx,8), %eax
167 mull PARAM_MULTIPLIER
169 movl %ebx, (%edi,%ecx,8)
172 movl %eax, 4(%edi,%ecx,8)
192 C Special case for 16-bit multiplier.
202 C size<3 not supported here. At size==3 we're already a couple of
203 C cycles faster
, so there
's no threshold as such, just use the MMX
204 C as soon as possible.
210 pxor %mm6, %mm6 C initial carry word
212 punpcklwd %mm7, %mm7 C m replicated 2 times
213 addl $2, %ecx C -size+2
215 punpckldq %mm7, %mm7 C m replicated 4 times
216 andl $4, %edx C test alignment, clear carry flag
222 C Source is unaligned, process one limb separately.
224 C Plain integer code is used here, since it's smaller
and is about
225 C the same
13 cycles as an mmx block would be.
227 C An
"addl $1,%ecx" doesn
't clear the carry flag when size==3, hence
228 C the use of separate incl and orl.
230 mull -8(%esi,%ecx,4) C m * src[0]
232 movl %eax, -8(%edi,%ecx,4) C dst[0]
233 incl %ecx C one limb processed
235 movd %edx, %mm6 C initial carry
237 orl %eax, %eax C clear carry flag
241 C The scheduling here is quite tricky, since so many instructions have
242 C pairing restrictions. In particular the js won't pair with a movd
, and
243 C can
't be paired with an adc since it wants flags from the inc, so
244 C instructions are rotated to the top of the loop to find somewhere useful
247 C Trouble has been taken to avoid overlapping successive loop iterations,
248 C since that would greatly increase the size of the startup and finishup
249 C code. Actually there's probably
not much advantage to be had from
250 C overlapping anyway
, since the difficulties are mostly with pairing
, not
251 C with latencies as such.
253 C
In the comments x represents the src data
and m the multiplier
(16
254 C bits
, but replicated
4 times
).
256 C The m signs calculated
in %mm3 are a
loop invariant
and could be held
in
257 C say
%mm5
, but that would save only one instruction
and hence be no faster.
260 C
eax l.
low, then l.
high
262 C
ecx counter
, -size+2 to
0 or 1
268 C
%mm0
(high products
)
269 C
%mm1
(low products
)
270 C
%mm2
(adjust for m using x signs
)
271 C
%mm3
(adjust for x using m signs
)
274 C
%mm6 h.
low, then carry
275 C
%mm7 m replicated
4 times
277 movd
%mm6
, %ebx C h.
low
278 psrlq
$32, %mm1 C l.
high
280 movd
%mm0
, %edx C h.
high
281 movq
%mm0
, %mm6 C new c
286 movd
%mm1
, %eax C l.
high
290 movl
%ebx, -16(%edi,%ecx,4)
292 movl
%edx, -12(%edi,%ecx,4)
296 pmulhw
-8(%esi,%ecx,4), %mm0 C h
= (x
*m
).
high
299 pmullw
-8(%esi,%ecx,4), %mm1 C l
= (x
*m
).
low
302 movq
-8(%esi,%ecx,4), %mm2 C x
303 psraw
$15, %mm3 C m signs
305 pand
-8(%esi,%ecx,4), %mm3 C x selected by m signs
306 psraw
$15, %mm2 C x signs
308 paddw
%mm3
, %mm0 C
add x to h if m
neg
309 pand
%mm7
, %mm2 C m selected by x signs
311 paddw
%mm2
, %mm0 C
add m to h if x
neg
314 movd
%mm1
, %eax C l.
low
315 punpcklwd
%mm0
, %mm6 C c
+ h.
low << 16
317 psrlq
$16, %mm0 C h.
high
323 movd
%mm6
, %ebx C h.
low
324 psrlq
$32, %mm1 C l.
high
327 popl
%ebp FRAME_popl
()
329 movd
%mm0
, %edx C h.
high
330 psrlq
$32, %mm0 C l.
high
332 movd
%mm1
, %eax C l.
high
335 movl
%ebx, -12(%edi,%ecx,4)
340 movl
%edx, -8(%edi,%ecx,4)
343 jnz L
(small_done
) C final
%ecx==1 means even
, ==0 odd
346 C
Size odd
, one extra limb to process.
347 C Plain integer code is used here
, since it
's smaller and is about
348 C the same speed as another mmx block would be.
351 movl PARAM_MULTIPLIER, %eax