dnl  x86-64 mpn_rsh1add_n/mpn_rsh1sub_n optimized for Pentium 4.
dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl  Pull in config.m4 from the parent directory.  The macros used further
dnl  down but not defined in this file (MULFUNC_PROLOGUE, IFDOS, R32, R8, L)
dnl  are presumably provided through it -- confirm against the build setup.
include(`../config.m4')
C TODO
C  * Try to make this smaller; 746 bytes seems excessive for this second-class
C    function.  Less software pipelining would help, and since we now probably
C    pipeline somewhat too deeply, it might not affect performance too much.
C  * A separate small-n loop might speed things up as well as make things
C    smaller.  That loop should be selected before pushing registers.
C Add flavour, selected when OPERATION_rsh1add_n is defined for this build.
C   ADDSUB   limb instruction used wherever the code below writes ADDSUB
C   func     name of the plain variant
C   func_nc  name of the _nc variant
C Each macro name must be followed directly by its opening parenthesis,
C otherwise m4 does not collect the arguments.
ifdef(`OPERATION_rsh1add_n', `
	define(ADDSUB,	      add)
	define(func,	      mpn_rsh1add_n)
	define(func_nc,	      mpn_rsh1add_nc)')
C Subtract flavour, selected when OPERATION_rsh1sub_n is defined for this build.
C   ADDSUB   limb instruction used wherever the code below writes ADDSUB
C   func     name of the plain variant
C   func_nc  name of the _nc variant
C Each macro name must be followed directly by its opening parenthesis,
C otherwise m4 does not collect the arguments.
ifdef(`OPERATION_rsh1sub_n', `
	define(ADDSUB,	      sub)
	define(func,	      mpn_rsh1sub_n)
	define(func_nc,	      mpn_rsh1sub_nc)')
C Names the four entry points this one source can be built as.  A given build
C produces one flavour only, chosen by the OPERATION_* symbol that is defined.
C NOTE(review): MULFUNC_PROLOGUE is defined outside this file, presumably for
C the benefit of the build system -- confirm there.
MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
C Emitted only where IFDOS expands its argument: load the quadword at
C 56(%rsp) into %r8.
C NOTE(review): presumably this fetches the fifth argument, which the Windows
C x64 convention passes on the stack; the offset 56 depends on what the entry
C sequence has already pushed -- confirm against the function entry code.
IFDOS(`	mov	56(%rsp), %r8	')
96 mov R32(%r8), R32(%rbx) C n = 0, 4, 8, ...
101 ADDSUB %rbx, %r15 C return bit
123 L(n00): cmp $2, R32(%rax)
125 xor R32(%rbx), R32(%rbx) C n = 1, 5, 9, ...
127 mov R32(%r8), R32(%rax)
139 L(gt1): mov 8(up), %r8
166 lea -16(rp), rp C n = 2, 6, 10, ...
167 mov R32(%r8), R32(%rbx)
187 L(gt2): mov 16(up), %r8
206 L(n10): xor R32(%rbx), R32(%rbx) C n = 3, 7, 11, ...
208 mov R32(%r8), R32(%rax)
232 L(gt3): mov 24(up), %r8
243 L(c0): mov $1, R8(%rbx)
245 L(c1): mov $1, R8(%rax)
247 L(c2): mov $1, R8(%rbx)
251 L(top): mov (up), %r8 C not on critical path
253 ADDSUB %r9, %r11 C not on critical path
254 mov (vp), %r9 C not on critical path
255 setc R8(%rbx) C save carry out
256 mov %r12, %r13 C new for later
257 shl $63, %r12 C shift new right
258 shr %r14 C shift old left
260 L(L11): ADDSUB %rax, %r11 C apply previous carry out
261 jc L(c0) C jump if ripple
262 L(rc0): mov 8(up), %r10
271 L(L10): ADDSUB %rbx, %r8
273 L(rc1): mov 16(up), %r12
282 L(L01): ADDSUB %rax, %r10
284 L(rc2): mov 24(up), %r11
296 L(L00): ADDSUB %rbx, %r12
301 L(end): or %r13, %r10
308 L(cj3): ADDSUB %rax, %r11
316 L(cj2): or %r13, %r11
320 L(cj1): or %r14, %rbx
323 mov R32(%r15), R32(%rax)
332 L(c3): mov $1, R8(%rax)