beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / mulx / aorsmul_1.asm
blob285c07335e88c4d074f4751eef1f2ada5b1de197
1 dnl AMD64 mpn_addmul_1 and mpn_submul_1 for CPUs with mulx.
3 dnl Copyright 2012, 2013 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 -
C AMD K10	 -
C AMD bd1	 -
C AMD bd2	 ?
C AMD bobcat	 -
C AMD jaguar	 ?
C Intel P4	 -
C Intel PNR	 -
C Intel NHM	 -
C Intel SBR	 -
C Intel HWL	 ?
C Intel BWL	 ?
C Intel atom	 -
C VIA nano	 -

C Incoming argument registers.  The trailing comment on each line names a
C second register; presumably it is the register carrying the same argument
C under the DOS64 calling convention (TODO confirm).
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0_param',`%rcx')   C r9

C Working registers after the entry sequence has exchanged rcx and rdx:
C the limb count lives in rcx and the multiplier limb in rdx, where mulx
C reads its implicit factor.
define(`n',       `%rcx')
define(`v0',      `%rdx')
C Pick the instruction pair and the entry point name for the variant being
C built.  ADDSUB starts a carry chain into the destination, ADCSBB continues
C it, and func is the symbol handed to PROLOGUE.
C Each ifdef body is a quoted string and must be closed before the next
C top-level statement.  Keep comments out of the quoted bodies.

ifdef(`OPERATION_addmul_1',`
  define(`ADDSUB',        `add')
  define(`ADCSBB',        `adc')
  define(`func',  `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
  define(`ADDSUB',        `sub')
  define(`ADCSBB',        `sbb')
  define(`func',  `mpn_submul_1')
')
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C Register renames that take effect only where IFDOS expands its argument.
C The doubled quotes keep each replacement from being rescanned, so the r8
C and r9 renames do not rewrite the n and vl definitions.
C NOTE(review): vl is not referenced by any instruction in this file, and
C renaming r8 to r11 would alias the r11 that the code below names directly.
C Confirm whether the DOS64 path is actually supported here.
IFDOS(` define(`up', ``%rsi'') ') dnl
IFDOS(` define(`rp', ``%rcx'') ') dnl
IFDOS(` define(`vl', ``%r9'') ') dnl
IFDOS(` define(`r9', ``rdi'') ') dnl
IFDOS(` define(`n', ``%r8'') ') dnl
IFDOS(` define(`r8', ``r11'') ') dnl
ASM_START()
	TEXT
	ALIGN(16)

C func: multiply the limbs at up by v0 and ADDSUB the products into the
C limbs at rp, four limbs per loop iteration.
C
C In:    rp       destination limbs, updated in place
C        up       source limbs
C        n_param  limb count; assumed >= 1 since up[0] is loaded
C                 unconditionally
C        v0_param multiplier limb
C Out:   rax      carry limb (borrow limb when ADDSUB is sub)
C Saved: rbx, r12, r13 are pushed on entry and popped before returning
C Uses:  r8-r13, rbx, rax as product registers; flags
C
C mulx src, lo, hi computes rdx * src without touching the flags, which
C lets the ADCSBB chain into memory and the adc chain between product
C registers stay live across the multiplies.

PROLOGUE(func)
	mov	(up), %r8		C r8 = up[0], read before up is advanced

	push	%rbx
	push	%r12
	push	%r13

	lea	(up,n_param,8), up	C up = one past the last source limb
	lea	-32(rp,n_param,8), rp	C rp = four limbs before the destination end
	mov	R32(n_param), R32(%rax)	C low bits of the count, for the dispatch below
C Move v0 into rdx for mulx and the count into rcx.
	xchg	v0_param, v0		C FIXME: is this insn fast?

	neg	n			C n = -count, a negative index rising to zero

C Dispatch on count mod 4 to enter the 4-way unrolled loop at the right slot.
	and	$3, R8(%rax)
	jz	L(b0)
	cmp	$2, R8(%rax)
	jz	L(b2)
	jg	L(b3)

C In the b1, b2 and b3 preambles n is advanced with sub of a negative
C immediate rather than add: when the result is zero, sub leaves CF clear,
C which the ADCSBB at the L(wd*) target relies on.  When the result is
C non-zero CF is set, hence the test that clears it before joining the loop.

L(b1):	mulx	%r8, %rbx, %rax
	sub	$-1, n
	jz	L(wd1)
	mulx	(up,n,8), %r9, %r8
	mulx	8(up,n,8), %r11, %r10
	test	R32(%rax), R32(%rax)		C clear cy
	jmp	L(lo1)

L(b0):	mulx	%r8, %r9, %r8
	mulx	8(up,n,8), %r11, %r10
	mulx	16(up,n,8), %r13, %r12
	xor	R32(%rax), R32(%rax)		C clears CF; rax is overwritten at L(lo0)
	jmp	L(lo0)

L(b3):	mulx	%r8, %r11, %r10
	mulx	8(up,n,8), %r13, %r12
	mulx	16(up,n,8), %rbx, %rax
	add	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax
	sub	$-3, n
	jz	L(wd3)
	test	R32(%rax), R32(%rax)		C clear cy
	jmp	L(lo3)

L(b2):	mulx	%r8, %r13, %r12
	mulx	8(up,n,8), %rbx, %rax
	add	%r12, %rbx
	adc	$0, %rax
	sub	$-2, n
	jz	L(wd2)
	mulx	(up,n,8), %r9, %r8
	test	R32(%rax), R32(%rax)		C clear cy
	jmp	L(lo2)

C Main loop.  The four ADDSUB/ADCSBB stores retire the limbs prepared by the
C previous pass while the four mulx start the next group; the adc chain then
C folds each high half into the following low half.

L(top):	ADDSUB	%r9, (rp,n,8)
L(lo3):	mulx	(up,n,8), %r9, %r8
	ADCSBB	%r11, 8(rp,n,8)
L(lo2):	mulx	8(up,n,8), %r11, %r10
	ADCSBB	%r13, 16(rp,n,8)
L(lo1):	mulx	16(up,n,8), %r13, %r12
	ADCSBB	%rbx, 24(rp,n,8)
	adc	%rax, %r9		C previous carry limb plus the store carry or borrow
L(lo0):	mulx	24(up,n,8), %rbx, %rax
	adc	%r8, %r11
	adc	%r10, %r13
	adc	%r12, %rbx
	adc	$0, %rax		C rax = carry limb
	add	$4, n
	js	L(top)

C Wind-down: store the last pending limbs.  The short-count paths enter at
C L(wd3), L(wd2) or L(wd1) with CF clear.

L(end):	ADDSUB	%r9, (rp)
L(wd3):	ADCSBB	%r11, 8(rp)
L(wd2):	ADCSBB	%r13, 16(rp)
L(wd1):	ADCSBB	%rbx, 24(rp)
	adc	n, %rax			C n is zero here; folds the final carry or borrow into rax
	pop	%r13
	pop	%r12
	pop	%rbx
	ret				C without this, execution runs past the end of func
EPILOGUE()
ASM_END()