beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / cnd_aors_n.asm
blob13a2ab3be9503f5c3f4f5c1e0de544676423094f
1 dnl AMD64 mpn_cnd_add_n, mpn_cnd_sub_n
3 dnl Copyright 2011-2013 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C AMD K8,K9 2
35 C AMD K10 2
36 C AMD bd1 2.32
37 C AMD bobcat 3
38 C Intel P4 13
39 C Intel core2 2.9
40 C Intel NHM 2.8
41 C Intel SBR 2.4
42 C Intel atom 5.33
43 C VIA nano 3
45 C NOTES
46 C * It might seem natural to use the cmov insn here, but since this function
47 C is supposed to have the exact same execution pattern for cnd true and
48 C false, and since cmov's documentation is not clear about whether it
49 C actually reads both source operands and writes the register for a false
50 C condition, we cannot use it.
51 C * Two cases could be optimised: (1) cnd_add_n could use ADCSBB-from-memory
52 C to save one insn/limb, and (2) when up=rp cnd_add_n and cnd_sub_n could use
53 C ADCSBB-to-memory, again saving 1 insn/limb.
54 C * This runs optimally at decoder bandwidth on K10. It has not been tuned
55 C for any other processor.
57 C INPUT PARAMETERS
dnl Symbolic names for the five arguments.  Each define maps a name to the
dnl register the code below uses for it: cnd=%rdi, rp=%rsi, up=%rdx, vp=%rcx,
dnl n=%r8.
dnl NOTE(review): the trailing dnl note on each line appears to give where the
dnl same argument arrives under the DOS64 ABI (rcx, rdx, r8, r9, stack slot);
dnl presumably FUNC_ENTRY moves those into the registers named here -- confirm
dnl against the macro definitions.
58 define(`cnd', `%rdi') dnl rcx
59 define(`rp', `%rsi') dnl rdx
60 define(`up', `%rdx') dnl r8
61 define(`vp', `%rcx') dnl r9
62 define(`n', `%r8') dnl rsp+40
dnl Select which of the two entry points this expansion of the file produces.
dnl When OPERATION_cnd_add_n is defined: ADDSUB=add, ADCSBB=adc and func is
dnl mpn_cnd_add_n.  When OPERATION_cnd_sub_n is defined: ADDSUB=sub,
dnl ADCSBB=sbb and func is mpn_cnd_sub_n.  ADDSUB is used for the first limb
dnl of a feed-in block (no carry-in), ADCSBB for every limb that consumes the
dnl carry flag.
dnl NOTE(review): the OPERATION_* symbols are not defined in this file;
dnl presumably the build supplies exactly one of them -- confirm.
64 ifdef(`OPERATION_cnd_add_n', `
65 define(ADDSUB, add)
66 define(ADCSBB, adc)
67 define(func, mpn_cnd_add_n)')
68 ifdef(`OPERATION_cnd_sub_n', `
69 define(ADDSUB, sub)
70 define(ADCSBB, sbb)
71 define(func, mpn_cnd_sub_n)')
dnl Build and assembler set-up preceding the function body.  All of these are
dnl macros defined outside this file.
dnl NOTE(review): MULFUNC_PROLOGUE presumably declares the two function names
dnl this multi-function file provides, and ABI_SUPPORT the calling conventions
dnl it is written for (DOS64 and STD64) -- confirm against the m4 definitions.
dnl ALIGN(16) requests 16-byte alignment for the entry point that follows.
73 MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
75 ABI_SUPPORT(DOS64)
76 ABI_SUPPORT(STD64)
78 ASM_START()
79 TEXT
80 ALIGN(16)
C func (mpn_cnd_add_n or mpn_cnd_sub_n, per the OPERATION_* selection):
C   rp[0..n-1] = up[0..n-1] ADDSUB (vp[0..n-1] AND mask)
C where mask is all ones when cnd is nonzero and zero when cnd is zero.
C Returns the final carry (add) or borrow (sub) as 0 or 1 in %rax.
C
C The same instruction sequence executes for cnd true and cnd false; only
C the mask value differs.
C
C Register roles after set-up:
C   cnd                    0 or -1 mask ANDed into every vp limb
C   rp, up, vp             point just past the end of each operand
C   n                      negative limb index, counts up towards zero
C   %rax                   n mod 4 for dispatch, then saved carry (0 or -1)
C   %r12 %r13 %r14 %r11    masked vp limbs
C   %r10 %rbx %rbp %r9     up limbs, then result limbs
C
C %rbx, %rbp, %r12, %r13, %r14 are saved on entry and restored before return.
C
C NOTE(review): n = 0 takes the jz L(top) arc and would process four limbs;
C presumably callers guarantee n >= 1 -- confirm.
PROLOGUE(func)
        FUNC_ENTRY(4)
C NOTE(review): under DOS64 the fifth argument is fetched from the stack; the
C 56 presumably accounts for the documented rsp+40 slot plus 16 bytes pushed
C by FUNC_ENTRY -- confirm.  Only the low 32 bits of n are loaded.
IFDOS(` mov     56(%rsp), R32(%r8)')
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14

        neg     cnd                     C CF = (cnd != 0)
        sbb     cnd, cnd                C make cnd mask

        lea     (vp,n,8), vp            C advance all three pointers to the
        lea     (up,n,8), up            C end of their operands so they can
        lea     (rp,n,8), rp            C be indexed by a negative n

        mov     R32(n), R32(%rax)
        neg     n                       C n = -n, index runs -n .. -1
        and     $3, R32(%rax)           C rax = n mod 4 = limbs to feed in
        jz      L(top)                  C carry-save reg rax = 0 in this arc
        cmp     $2, R32(%rax)
        jc      L(b1)                   C n mod 4 = 1
        jz      L(b2)                   C n mod 4 = 2

C n mod 4 = 3: handle three limbs, then enter the 4-way loop if any remain.
L(b3):  mov     (vp,n,8), %r12
        mov     8(vp,n,8), %r13
        mov     16(vp,n,8), %r14
        and     cnd, %r12
        mov     (up,n,8), %r10
        and     cnd, %r13
        mov     8(up,n,8), %rbx
        and     cnd, %r14
        mov     16(up,n,8), %rbp
        ADDSUB  %r12, %r10              C first limb, no carry-in
        mov     %r10, (rp,n,8)
        ADCSBB  %r13, %rbx
        mov     %rbx, 8(rp,n,8)
        ADCSBB  %r14, %rbp
        mov     %rbp, 16(rp,n,8)
        sbb     R32(%rax), R32(%rax)    C save carry
        add     $3, n
        js      L(top)                  C n still negative: limbs remain
        jmp     L(end)

C n mod 4 = 2: handle two limbs.
L(b2):  mov     (vp,n,8), %r12
        mov     8(vp,n,8), %r13
        mov     (up,n,8), %r10
        and     cnd, %r12
        mov     8(up,n,8), %rbx
        and     cnd, %r13
        ADDSUB  %r12, %r10              C first limb, no carry-in
        mov     %r10, (rp,n,8)
        ADCSBB  %r13, %rbx
        mov     %rbx, 8(rp,n,8)
        sbb     R32(%rax), R32(%rax)    C save carry
        add     $2, n
        js      L(top)                  C n still negative: limbs remain
        jmp     L(end)

C n mod 4 = 1: handle one limb, then fall through into the loop if any remain.
L(b1):  mov     (vp,n,8), %r12
        mov     (up,n,8), %r10
        and     cnd, %r12
        ADDSUB  %r12, %r10              C first limb, no carry-in
        mov     %r10, (rp,n,8)
        sbb     R32(%rax), R32(%rax)    C save carry
        add     $1, n
        jns     L(end)                  C n reached zero: done

C Main loop: four limbs per iteration.  The carry lives in rax (0 or -1)
C between iterations because add $4, n destroys the carry flag.
        ALIGN(16)
L(top): mov     (vp,n,8), %r12
        mov     8(vp,n,8), %r13
        mov     16(vp,n,8), %r14
        mov     24(vp,n,8), %r11
        and     cnd, %r12
        mov     (up,n,8), %r10
        and     cnd, %r13
        mov     8(up,n,8), %rbx
        and     cnd, %r14
        mov     16(up,n,8), %rbp
        and     cnd, %r11
        mov     24(up,n,8), %r9
        add     R32(%rax), R32(%rax)    C restore carry
        ADCSBB  %r12, %r10
        mov     %r10, (rp,n,8)
        ADCSBB  %r13, %rbx
        mov     %rbx, 8(rp,n,8)
        ADCSBB  %r14, %rbp
        mov     %rbp, 16(rp,n,8)
        ADCSBB  %r11, %r9
        mov     %r9, 24(rp,n,8)
        sbb     R32(%rax), R32(%rax)    C save carry
        add     $4, n
        js      L(top)                  C n still negative: limbs remain

L(end): neg     R32(%rax)               C 0 or -1 becomes the 0 or 1 return value
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        FUNC_EXIT()
        ret                             C explicit return so execution cannot
                                        C run past the end of the function
EPILOGUE()