1 dnl Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and
2 dnl subtract the result from a second limb vector.
4 dnl Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
15 dnl or
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
21 dnl or both in parallel, as here.
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`..
/config.m4
')
36 C P6 model 0-8,10-12 -
37 C P6 model 9 (Banias) 6.8
38 C P6 model 13 (Dothan) 6.9
39 C P4 model 0-1 (Willamette) ?
40 C P4 model 2 (Northwood) 5.87
41 C P4 model 3-4 (Prescott) 6.5
43 C This code represents a step forwards compared to the code available before
44 C GMP 5.1, but it is not carefully tuned for either P6 or P4. In fact, it is
45 C not good for P6. For P4 it saved a bit over 1 c/l for both Northwood and
46 C Prescott compared to the old code.
48 C The arrangements made here to get a two instruction dependent chain are
49 C slightly subtle. In the loop the carry (or borrow rather) is a negative so
50 C that a paddq can be used to give a low limb ready to store, and a high limb
51 C ready to become the new carry after a psrlq.
53 C If the carry was a simple twos complement negative then the psrlq shift would
54 C need to bring in 0 bits or 1 bits according to whether the high was zero or
55 C non-zero, since a non-zero value would represent a negative needing sign
56 C extension. That wouldn't be particularly easy to arrange and certainly would
57 C add an instruction to the dependent chain, so instead an offset is applied so
58 C that the high limb will be 0xFFFFFFFF+c. With c in the range -0xFFFFFFFF to
59 C 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and is therefore
60 C always positive and can always have 0 bits shifted in, which is what psrlq
61 C does.
63 C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be
64 C done off the dependent chain. The total adjustment then is to add
65 C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF
66 C to remove the offset from the current carry, for a net add of
67 C 0xFFFFFFFE00000001. In the code this is applied to the destination limb when
68 C fetched.
70 C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement
71 C negative, which is how it's undone for the return value, but that doesn't
72 C seem as clear.
74 defframe(PARAM_CARRY, 20)
75 defframe(PARAM_MULTIPLIER,16)
76 defframe(PARAM_SIZE, 12)
77 defframe(PARAM_SRC, 8)
78 defframe(PARAM_DST, 4)
83 PROLOGUE(mpn_submul_1c)
85 movd PARAM_CARRY
, %mm1
89 PROLOGUE
(mpn_submul_1
)
91 pxor %mm1, %mm1 C initial borrow
97 movd PARAM_MULTIPLIER, %mm7
101 psrlq $32, %mm0 C 0x00000000FFFFFFFF
104 psllq $32, %mm6 C 0xFFFFFFFF00000000
106 psubq %mm0, %mm6 C 0xFFFFFFFE00000001
108 psubq %mm1, %mm0 C 0xFFFFFFFF - borrow
111 movd (%eax), %mm3 C up
112 movd (%edx), %mm4 C rp
115 paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
118 psubq %mm3, %mm4 C prod
119 paddq %mm4, %mm0 C borrow
120 movd %mm0, (%edx) C result
123 L(gt1): movd 4(%eax), %mm1 C up
124 movd 4(%edx), %mm2 C rp
130 L(top): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001
132 psubq %mm3, %mm4 C prod
133 movd 8(%eax), %mm3 C up
134 paddq %mm4, %mm0 C borrow
135 movd 8(%edx), %mm4 C rp
136 movd %mm0, (%edx) C result
142 paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
144 psubq %mm1, %mm2 C prod
145 movd 12(%eax), %mm1 C up
146 paddq %mm2, %mm0 C borrow
147 movd 12(%edx), %mm2 C rp
148 movd %mm0, 4(%edx) C result
157 L(eev): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001
159 psubq %mm3, %mm4 C prod
160 paddq %mm4, %mm0 C borrow
161 movd %mm0, (%edx) C result
163 psubq %mm1, %mm2 C prod
164 paddq %mm2, %mm0 C borrow
165 movd %mm0, 4(%edx) C result
166 L(rt): psrlq $32, %mm0
172 L(eod): paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
174 psubq %mm1, %mm2 C prod
175 paddq %mm2, %mm0 C borrow
176 movd %mm0, 4(%edx) C result
178 psubq %mm3, %mm4 C prod
179 paddq %mm4, %mm0 C borrow
180 movd %mm0, 8(%edx) C result