1 dnl AMD64 mpn_addlsh_n -- rp[] = up[] + (vp[] << k)
2 dnl AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[]
3 dnl Optimised for Nehalem.
5 dnl Contributed to the GNU project by Torbjorn Granlund.
7 dnl Copyright 2011, 2012 Free Software Foundation, Inc.
9 dnl This file is part of the GNU MP Library.
11 dnl The GNU MP Library is free software; you can redistribute it and/or modify
12 dnl it under the terms of either:
14 dnl * the GNU Lesser General Public License as published by the Free
15 dnl Software Foundation; either version 3 of the License, or (at your
16 dnl option) any later version.
20 dnl * the GNU General Public License as published by the Free Software
21 dnl Foundation; either version 2 of the License, or (at your option) any
24 dnl or both in parallel, as here.
26 dnl The GNU MP Library is distributed in the hope that it will be useful, but
27 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
31 dnl You should have received copies of the GNU General Public License and the
32 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
33 dnl see https://www.gnu.org/licenses/.
dnl Include config.m4 from the parent directory.  The quoted path is kept on
dnl one line so that no line breaks end up inside the file name.
35 include(`../config.m4')
47 C The inner-loop probably runs close to optimally on Nehalem (using 4-way
48 C unrolling). The rest of the code is quite crude, and could perhaps be made
49 C both smaller and faster.
C cy names the register that holds the carry-in argument of the _nc variant.
C The quoted register name is kept on one line so that no line breaks are
C substituted into operands that use cy.
57 define(`cy', `%r9') C for _nc variant
C addlsh variant: when OPERATION_addlsh_n is defined, bind the generic names
C func_n and func_nc to the mpn_addlsh_n and mpn_addlsh_nc symbols.
C Each define call is written on one line, with the parenthesis directly after
C the macro name, matching the rsblsh block that follows; m4 only collects
C arguments when the parenthesis immediately follows the name.
C NOTE(review): embedded numbers 60-62 are absent from this excerpt, so more
C defines may belong inside this ifdef -- confirm against the full file.
59 ifdef(`OPERATION_addlsh_n', `
63 define(func_n, mpn_addlsh_n)
64 define(func_nc, mpn_addlsh_nc)')
C rsblsh variant: when OPERATION_rsblsh_n is defined, bind the generic names
C func_n and func_nc to the mpn_rsblsh_n and mpn_rsblsh_nc symbols.
C NOTE(review): embedded numbers 66-68 are absent from this excerpt, so more
C defines may belong inside this ifdef -- confirm against the full file.
65 ifdef(`OPERATION_rsblsh_n', `
69 define(func_n, mpn_rsblsh_n)
70 define(func_nc, mpn_rsblsh_nc)')
C MULFUNC_PROLOGUE is defined outside this excerpt.  It is passed the list of
C entry points provided by this file; mpn_rsblsh_nc is left out of that list.
C NOTE(review): the sentence on the next comment line is cut off in this
C excerpt (embedded number 73 is absent) -- restore its ending from the full
C file.
72 C mpn_rsblsh_nc removed below, its idea of carry-in is inconsistent with
74 MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
C Start of the visible function body.
C NOTE(review): presumably the func_n entry point; the PROLOGUE line and
C embedded numbers 85, 87-90 and 92-95 are absent from this excerpt.
C IFDOS is defined outside this excerpt; presumably it emits its argument only
C for the DOS64 ABI, where cnt would be fetched from the stack -- confirm.
C The IFDOS call is written on one line, with the parenthesis directly after
C the macro name and no line breaks inside the quoted instruction, matching
C the single-line cnt load that appears further down in this file.
84 IFDOS(` mov 56(%rsp), %r8d ') C cnt
86 xor R32(%rbx), R32(%rbx) C clear CF save register
C Copy the shift count into ecx so that the shrd instructions can use %cl.
91 mov R32(cnt), R32(%rcx)
C Rebase rp; later stores address it as disp(rp,%rax,8).
96 lea -40(rp,%rax,8), rp
C Code at label b3.
C NOTE(review): presumably one of the feed-in paths chosen by n mod 4 before
C the main loop -- confirm.  Embedded numbers 108-109 and 115-117 are absent,
C so this path is incomplete as shown.
C rbx is the CF save register: it holds 0 or -1 between flag-clobbering
C sections.  ADCSBB is defined outside this excerpt; presumably adc for the
C addlsh build and sbb for the rsblsh build -- confirm.
C Zero r9 (R32 presumably names the 32-bit form, whose write clears the full
C 64-bit register).
105 L(b3): xor R32(%r9), R32(%r9)
106 mov 8(vp,%rax,8), %r10
107 mov 16(vp,%rax,8), %r11
C Regenerate CF from the saved 0/-1 value in rbx.
110 add R32(%rbx), R32(%rbx)
111 ADCSBB 8(up,%rax,8), %r9
112 mov 24(vp,%rax,8), %r8
113 ADCSBB 16(up,%rax,8), %r10
C Capture CF back into rbx as 0 or -1.
114 sbb R32(%rbx), R32(%rbx)
C Code at label b0.
C NOTE(review): presumably one of the feed-in paths chosen by n mod 4 before
C the main loop -- confirm.  Embedded numbers 120, 123-124 and 132-134 are
C absent, so this path is incomplete as shown.
C rbx is the CF save register: it holds 0 or -1 between flag-clobbering
C sections.  ADCSBB is defined outside this excerpt; presumably adc for the
C addlsh build and sbb for the rsblsh build -- confirm.
118 L(b0): mov 8(vp,%rax,8), %r9
C Zero r8 (R32 presumably names the 32-bit form, whose write clears the full
C 64-bit register).
119 xor R32(%r8), R32(%r8)
121 mov 16(vp,%rax,8), %r10
122 mov 24(vp,%rax,8), %r11
C Regenerate CF from the saved 0/-1 value in rbx.
125 add R32(%rbx), R32(%rbx)
126 ADCSBB 8(up,%rax,8), %r8
C Store the finished limb, then reuse r8 for the next vp load below.
127 mov %r8, 40(rp,%rax,8) C offset 40
128 ADCSBB 16(up,%rax,8), %r9
129 mov 32(vp,%rax,8), %r8
130 ADCSBB 24(up,%rax,8), %r10
C Capture CF back into rbx as 0 or -1.
131 sbb R32(%rbx), R32(%rbx)
C Code at labels b1 and 1.
C NOTE(review): presumably the feed-in path for n mod 4 == 1 -- confirm.
C Embedded numbers 136-137, 140 and 142-143 are absent, so this path is
C incomplete as shown.  Both visible loads read 8(vp,%rax,8); presumably rax
C or vp is changed on one of the missing lines in between -- confirm.
135 L(b1): mov 8(vp,%rax,8), %r8
138 mov 8(vp,%rax,8), %r9
C Zero rbp (R32 presumably names the 32-bit form, whose write clears the full
C 64-bit register).
139 xor R32(%rbp), R32(%rbp)
C Zero r11 in the same way.
141 L(1): xor R32(%r11), R32(%r11)
C Code at label b2.
C NOTE(review): presumably the feed-in path for n mod 4 == 2 -- confirm.
C Embedded numbers 146 and 151-154 are absent, so this path is incomplete as
C shown.
C rbx is the CF save register: it holds 0 or -1 between flag-clobbering
C sections.  ADCSBB is defined outside this excerpt; presumably adc for the
C addlsh build and sbb for the rsblsh build -- confirm.
C Zero r10.  This uses the 64-bit xor form, whereas the other paths wrap the
C register in R32(); either form leaves the whole register zero.
144 L(b2): xor %r10, %r10
145 mov 8(vp,%rax,8), %r11
C Regenerate CF from the saved 0/-1 value in rbx.  The mov that follows does
C not modify flags, so CF is still intact at the ADCSBB.
147 add R32(%rbx), R32(%rbx)
148 mov 16(vp,%rax,8), %r8
149 ADCSBB 8(up,%rax,8), %r10
C Capture CF back into rbx as 0 or -1.
150 sbb R32(%rbx), R32(%rbx)
C Main loop body; lo2, lo1 and lo3 are further labels inside it.
C NOTE(review): presumably those labels are where the feed-in paths join the
C loop -- confirm.  Embedded numbers 156, 159, 162-163, 172, 174 and 176-177
C are absent, so the loop is incomplete as shown.
C rbx is the CF save register: it holds 0 or -1 between flag-clobbering
C sections.  ADCSBB is defined outside this excerpt; presumably adc for the
C addlsh build and sbb for the rsblsh build -- confirm.
155 L(top): mov 8(vp,%rax,8), %r9
157 L(lo2): mov %r10, 24(rp,%rax,8) C offset 24
C Shift rbp right by %cl, filling the vacated high bits from the low bits of
C r8.
158 L(lo1): shrd %cl, %r8, %rbp
160 mov 16(vp,%rax,8), %r10
161 mov 24(vp,%rax,8), %r11
C Regenerate CF from the saved 0/-1 value in rbx.
164 add R32(%rbx), R32(%rbx)
C Carry chain over four consecutive up limbs; the interleaved mov
C instructions do not modify flags.
165 ADCSBB (up,%rax,8), %rbp
166 ADCSBB 8(up,%rax,8), %r8
167 mov %r8, 40(rp,%rax,8) C offset 40
168 ADCSBB 16(up,%rax,8), %r9
169 mov 32(vp,%rax,8), %r8
170 ADCSBB 24(up,%rax,8), %r10
C Capture CF back into rbx as 0 or -1.
171 sbb R32(%rbx), R32(%rbx)
C NOTE(review): the next two stores are annotated offset 32 and offset 48 but
C use displacements 0 and 16; presumably rax is advanced by 4 limbs on a line
C not shown here (embedded number 172) -- confirm.
173 mov %rbp, (rp,%rax,8) C offset 32
175 L(lo3): mov %r9, 16(rp,%rax,8) C offset 48
C Wind-down after the loop: store r10, form one more limb in r11 and store it,
C then convert the final CF into a 0/1 value in eax.
C NOTE(review): presumably that value is the function result; the epilogue
C (embedded numbers 184 onward) is absent from this excerpt -- confirm.
C ADCSBB is defined outside this excerpt; presumably adc for the addlsh build
C and sbb for the rsblsh build -- confirm.
178 L(end): mov %r10, 24(rp,%rax,8)
C Shift r11 right by %cl, filling the vacated high bits from the low bits of
C r8.
179 L(wd1): shrd %cl, %r8, %r11
C Regenerate CF from the saved 0/-1 value in rbx.
180 add R32(%rbx), R32(%rbx)
181 ADCSBB (up,%rax,8), %r11
182 mov %r11, 32(rp,%rax,8) C offset 32
C With rax equal to zero, this leaves eax = CF (0 or 1); the preceding mov
C does not modify flags.
183 adc R32(%rax), R32(%rax) C rax is zero after loop
C Entry sequence that takes a carry-in.
C NOTE(review): presumably the func_nc entry point; its PROLOGUE line and
C embedded numbers 196-197 are absent from this excerpt.
C IFDOS is defined outside this excerpt; presumably it emits its argument only
C for the DOS64 ABI, where cnt and cy would be fetched from the stack --
C confirm.
C Both IFDOS calls are written on one line each, with the parenthesis directly
C after the macro name and no line breaks inside the quoted instruction.
194 IFDOS(` mov 56(%rsp), %r8d ') C cnt
195 IFDOS(` mov 64(%rsp), %r9 ') C cy
C Capture CF into rbx as 0 or -1.
C NOTE(review): presumably CF was derived from cy on a line not shown here --
C confirm.
198 sbb R32(%rbx), R32(%rbx) C initialise CF save register