1 dnl AMD64 mpn_addlsh_n -- rp[] = up[] + (vp[] << k)
2 dnl AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[]
3 dnl Optimised for Sandy Bridge.
5 dnl Contributed to the GNU project by Torbjorn Granlund.
7 dnl Copyright 2011, 2012 Free Software Foundation, Inc.
9 dnl This file is part of the GNU MP Library.
11 dnl The GNU MP Library is free software; you can redistribute it and/or modify
12 dnl it under the terms of either:
14 dnl * the GNU Lesser General Public License as published by the Free
15 dnl Software Foundation; either version 3 of the License, or (at your
16 dnl option) any later version.
20 dnl * the GNU General Public License as published by the Free Software
21 dnl Foundation; either version 2 of the License, or (at your option) any
24 dnl or both in parallel, as here.
26 dnl The GNU MP Library is distributed in the hope that it will be useful, but
27 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
31 dnl You should have received copies of the GNU General Public License and the
32 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
33 dnl see https://www.gnu.org/licenses/.
35 include(`..
/config.m4
')
47 C The inner-loop probably runs close to optimally on Sandy Bridge (using 4-way
48 C unrolling). The rest of the code is quite crude, and could perhaps be made
49 C both smaller and faster.
C Register that carries the carry-in argument; per the note on the define it
C is used by the _nc variant only.
57 define(`cy', `
%r9
') C for _nc variant
C Choose the exported symbol names for whichever OPERATION_* symbol is defined:
C func_n names the plain entry point and func_nc its _nc companion.
59 ifdef(`OPERATION_addlsh_n', `
63 define
(func_n
, mpn_addlsh_n
)
64 define
(func_nc
, mpn_addlsh_nc
)')
65 ifdef(`OPERATION_rsblsh_n', `
69 define(func_n, mpn_rsblsh_n)
70 define(func_nc, mpn_rsblsh_nc)')
75 C mpn_rsblsh_nc removed below, its idea of carry-in is inconsistent with
C Entry points this multi-function file is declared to provide; mpn_rsblsh_nc
C is not among them.
77 MULFUNC_PROLOGUE
(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n
)
C NOTE(review): the instruction stream below looks discontinuous in this copy
C (no loads, add-with-carry steps or branches appear between the lines shown),
C so each comment describes the visible instruction on its own.
C
C Conventions visible in this fragment:
C   - %rbx is the CF save register.  It is zeroed by xor, or set to 0 / all
C     ones by sbb of the register with itself; adding the register to itself
C     then turns that value back into CF (0 gives CF clear, all ones gives CF
C     set).
C   - shrd count, src, dst shifts dst right by the count and fills the vacated
C     high bits from the low bits of src.
C
C Under IFDOS, cnt is fetched from the stack slot at 56(%rsp).
84 IFDOS
(`
mov 56(%rsp
), %r8d
') C cnt
86 xor R32(%rbx), R32(%rbx) C clear CF save register
90 mov R32(cnt), R32(%rcx) C %rcx supplies the count operand of the shrd steps
C Adjust vp, up and rp by 8*%rbp - 32 bytes each.
94 lea -32(vp,%rbp,8), vp
95 lea -32(up,%rbp,8), up
96 lea -32(rp,%rbp,8), rp
C Shift the register chain: %r8 takes bits from %r9, %r9 from %r10, %r10 from
C %r11.  The order matters: each source is read before it is itself shifted.
104 shrd R8(%rcx), %r9, %r8
105 shrd R8(%rcx), %r10, %r9
107 shrd R8(%rcx), %r11, %r10
110 add R32(%rbx), R32(%rbx) C reload CF from the save register
117 L(3): add R32(%rbx), R32(%rbx) C reload CF from the save register
126 xor R32(%rbp), R32(%rbp) C zero %rbp
129 L(b1): xor %r10, %r10 C start from zero so only bits of %r11 land in %r10
131 shrd R8(%rcx), %r11, %r10
134 add R32(%rbx), R32(%rbx) C reload CF from the save register
140 L(1): add R32(%rbx), R32(%rbx) C reload CF from the save register
C Two-register variant of the shift chain: %r9 from %r10, then %r10 from %r11.
146 shrd R8(%rcx), %r10, %r9
148 shrd R8(%rcx), %r11, %r10
151 add R32(%rbx), R32(%rbx) C reload CF from the save register
157 L(2): add R32(%rbx), R32(%rbx) C reload CF from the save register
C Presumably the head of the 4-way unrolled inner loop mentioned in the file
C header; the backward branch is not visible in this copy.
162 ALIGN(32) C 16-byte alignment is not enough!
163 L(top): shrd R8(%rcx), %r11, %r10
164 add R32(%rbx), R32(%rbx) C reload CF from the save register
C Store registers to rp at offsets 8, 16 and 24.  The lo3/lo2/lo1 labels allow
C entry part-way through the store sequence (the jumps that target them are
C not shown here).
172 L(lo3): mov %r8, 8(rp)
173 L(lo2): mov %r9, 16(rp)
175 L(lo1): mov %r10, 24(rp)
179 sbb R32(%rbx), R32(%rbx) C save CF: register becomes 0 or all ones
C Full four-step shift chain, starting with %rbp taking bits from %r8.
180 L(lo0): shrd R8(%rcx), %r8, %rbp
182 shrd R8(%rcx), %r9, %r8
183 shrd R8(%rcx), %r10, %r9
188 shrd R8(%rcx), %r11, %r10
189 add R32(%rbx), R32(%rbx) C reload CF from the save register
C Wind-down stores; wd3/wd2/wd1 mirror the lo3/lo2/lo1 entry points above.
195 L(wd3): mov %r8, 8(rp)
196 L(wd2): mov %r9, 16(rp)
197 L(wd1): mov %r10, 24(rp)
C With %rax zero, adc of the register with itself leaves just CF (0 or 1) in it.
198 adc R32(%rax), R32(%rax) C rax is zero after loop
C Presumably the start of the carry-in (_nc) entry point; its prologue and the
C instructions that follow are not visible in this copy.
C Under IFDOS, cnt and cy are fetched from the stack slots at 56(%rsp) and
C 64(%rsp).
209 IFDOS(` mov 56(%rsp), %r8d ') C cnt
210 IFDOS
(`
mov 64(%rsp
), %r9
') C cy
C sbb of the register with itself yields 0 when CF is clear and all ones when
C CF is set.  NOTE(review): the instruction that derives CF from cy is not
C shown here -- confirm against the full file.
213 sbb R32(%rbx), R32(%rbx) C initialise CF save register