beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / aorrlsh_n.asm
blob5ca128fbf35233e1d2585a9be2ca0f25b973b606
1 dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U.
3 dnl Copyright 2006, 2010-2012 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C cycles/limb
35 C AMD K8,K9 3.1 < 3.85 for lshift + add_n
36 C AMD K10 3.1 < 3.85 for lshift + add_n
37 C Intel P4 14.6 > 7.33 for lshift + add_n
38 C Intel core2 3.87 > 3.27 for lshift + add_n
39 C Intel NHM 4 > 3.75 for lshift + add_n
40 C Intel SBR (5.8) > 3.46 for lshift + add_n
41 C Intel atom (7.75) < 8.75 for lshift + add_n
42 C VIA nano 4.7 < 6.25 for lshift + add_n
44 C This was written quickly and not optimized at all. Surely one could get
45 C closer to 3 c/l or perhaps even under 3 c/l. Ideas:
46 C 1) Use indexing to save the 3 LEA
47 C 2) Write reasonable feed-in code
48 C 3) Be more clever about register usage
49 C 4) Unroll more, handling CL negation, carry save/restore cost much now
50 C 5) Reschedule
C INPUT PARAMETERS
C Symbolic names for the registers the five arguments arrive in.  The code
C below uses rp, up, vp and n by name; the shift count is read directly from
C r8, so the cnt name is informational only.
define(`rp',	`%rdi')		C result limbs are stored here
define(`up',	`%rsi')		C operand combined through ADCSBB
define(`vp',	`%rdx')		C operand that gets shifted left
define(`n',	`%rcx')		C number of limbs
define(`cnt',	`%r8')		C shift count

C Select the carry-propagating instruction and the exported name according
C to which OPERATION_ symbol is defined when this file is processed.
C Each ifdef body is one quoted argument and must be closed again,
C otherwise m4 treats the remainder of the file as part of the string.
ifdef(`OPERATION_addlsh_n',`
  define(ADCSBB,	`adc')
  define(func, mpn_addlsh_n)
')
ifdef(`OPERATION_rsblsh_n',`
  define(ADCSBB,	`sbb')
  define(func, mpn_rsblsh_n)
')

C NOTE(review): presumably this lists the entry points this one source file
C can be built as -- confirm against the build system macros.
MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
C func(rp, up, vp, n, cnt) -- func is mpn_addlsh_n or mpn_rsblsh_n
C
C Walks n limbs starting at the lowest address.  Each stored limb is
C   (vp[i] << cnt  |  bits shifted out of vp[i-1])  ADCSBB  up[i]
C so with ADCSBB = adc the result is U + V*2^cnt, and with ADCSBB = sbb it is
C V*2^cnt - U.
C
C Return value in rax: the bits shifted out of the last V limb, adjusted by
C the final carry or borrow via ADCSBB $0.
C
C NOTE(review): the right shifts use the negated count in cl, so this looks
C like it expects 0 < cnt < 64 -- confirm against the callers.
C
C Register use:
C   rax      limbs still to process, finally the return value
C   rbx      saved carry, 0 or -1; add rbx,rbx turns it back into CF
C   rcx      shift count in cl (n is copied to rax before it is overwritten)
C   rbp      high bits shifted out of the previous V limb
C   r8-r11   V limbs being shifted, then the results
C   r12-r14  unshifted copies of V limbs for the right shifts
C   r11      doubles as the counter of the feed-in loop
C rbx, rbp and r12-r14 are pushed on entry and popped before returning.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
C NOTE(review): presumably the fifth argument lives on the stack under the
C DOS64 ABI and 56 accounts for what FUNC_ENTRY pushed -- confirm against the
C DOS64 macro definitions.
IFDOS(`	mov	56(%rsp), %r8d	')
	push	%r12
	push	%r13
	push	%r14
	push	%rbp
	push	%rbx

	mov	n, %rax
	xor	R32(%rbx), R32(%rbx)	C clear carry save register
	mov	R32(%r8), R32(%rcx)	C shift count
	xor	R32(%rbp), R32(%rbp)	C limb carry

C Feed-in: handle n mod 4 limbs one at a time so that the main loop only
C ever sees groups of four.
	mov	R32(%rax), R32(%r11)
	and	$3, R32(%r11)
	je	L(4)			C n mod 4 = 0, nothing to feed in
	sub	$1, R32(%r11)		C loop below runs until this borrows

L(012):	mov	(vp), %r8
	mov	%r8, %r12
	shl	R8(%rcx), %r8
	or	%rbp, %r8		C bring in bits from the previous limb
	neg	R8(%rcx)		C cl = -cnt, used as the right shift count
	mov	%r12, %rbp
	shr	R8(%rcx), %rbp		C bits to carry into the next limb
	neg	R8(%rcx)		C back to the left shift count
	add	R32(%rbx), R32(%rbx)	C restore carry flag
	ADCSBB	(up), %r8
	mov	%r8, (rp)
	sbb	R32(%rbx), R32(%rbx)	C save carry flag before sub clobbers it
	lea	8(up), up
	lea	8(vp), vp
	lea	8(rp), rp
	sub	$1, R32(%r11)
	jnc	L(012)

L(4):	sub	$4, %rax
	jc	L(end)			C fewer than four limbs remain

C Main loop: four limbs per iteration.
	ALIGN(16)
L(top):	mov	(vp), %r8
	mov	%r8, %r12
	mov	8(vp), %r9
	mov	%r9, %r13
	mov	16(vp), %r10
	mov	%r10, %r14
	mov	24(vp), %r11

	shl	R8(%rcx), %r8
	shl	R8(%rcx), %r9
	shl	R8(%rcx), %r10
	or	%rbp, %r8		C bits carried in from the previous group
	mov	%r11, %rbp
	shl	R8(%rcx), %r11

	neg	R8(%rcx)		C switch to the right shift count

	shr	R8(%rcx), %r12
	shr	R8(%rcx), %r13
	shr	R8(%rcx), %r14
	shr	R8(%rcx), %rbp		C used next iteration

	or	%r12, %r9
	or	%r13, %r10
	or	%r14, %r11

	neg	R8(%rcx)		C back to the left shift count

	add	R32(%rbx), R32(%rbx)	C restore carry flag

	ADCSBB	(up), %r8
	ADCSBB	8(up), %r9
	ADCSBB	16(up), %r10
	ADCSBB	24(up), %r11

	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)

	sbb	R32(%rbx), R32(%rbx)	C save carry flag

	lea	32(up), up
	lea	32(vp), vp
	lea	32(rp), rp

	sub	$4, %rax
	jnc	L(top)

C Fold the saved carry into the bits shifted out of the last limb and
C return the sum or difference.
L(end):	add	R32(%rbx), R32(%rbx)
	ADCSBB	$0, %rbp
	mov	%rbp, %rax
	pop	%rbx
	pop	%rbp
	pop	%r14
	pop	%r13
	pop	%r12
	FUNC_EXIT()
	ret				C without this, control runs past the end
EPILOGUE()