beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / coreisbr / aorrlshC_n.asm
blob23ace418895e559c7a935b8d52c55ac0793569ee
1 dnl AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C)
2 dnl AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[]
4 dnl Copyright 2009-2012 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
33 C cycles/limb
34 C AMD K8,K9 ?
35 C AMD K10 ?
36 C Intel P4 ?
37 C Intel core2 3.25
38 C Intel NHM 4
39 C Intel SBR 2 C (or 1.95 when L(top)'s alignment = 16 (mod 32))
40 C Intel atom ?
41 C VIA nano ?
43 C This code probably runs close to optimally on Sandy Bridge (using 4-way
44 C unrolling). It also runs reasonably well on Core 2, but it runs poorly on
45 C all other processors, including Nehalem.
47 C INPUT PARAMETERS
C Symbolic names for the argument registers used throughout this file.
C NOTE(review): rdi, rsi, rdx, rcx, r8 is the System V AMD64 integer
C argument order; for DOS64 builds FUNC_ENTRY/IFDOS (defined outside this
C file) are presumably what bring the arguments into these registers.
48 define(`rp', `%rdi') C result limbs are stored through this pointer
49 define(`up', `%rsi') C operand combined via ADCSBB, not shifted
50 define(`vp', `%rdx') C operand whose limbs are shifted before use
51 define(`n', `%rcx') C limb count, counted down as limbs are processed
52 define(`cy', `%r8') C carry-in, read only by the func_nc entry point
C ABI_SUPPORT is defined outside this file; presumably it records which
C calling conventions the code below handles -- confirm.
54 ABI_SUPPORT(DOS64)
55 ABI_SUPPORT(STD64)
57 ASM_START()
58 TEXT
59 ALIGN(16)
C func_nc -- entry point that accepts an incoming carry in cy.  (func_nc is
C a macro name supplied from outside this file.)  It performs the same setup
C as the start of func_n and then jumps to one of the L(b00)/L(b01)/L(b10)/
C L(b11) labels located after PROLOGUE(func_n); it has no return path of its
C own.
C
C State handed to the shared code:
C   rax  = -cy, which the shared code turns into CF with add eax,eax
C          (assumes cy is 0 or 1 -- confirm against callers)
C   rbp  = vp[0] << (64 - RSH), produced by shrd into a zeroed register
C          (RSH is defined outside this file, presumably as 64 - C)
C   r8   = vp[0], unshifted
C   r9d  = n mod 4, used only for the dispatch; rax cannot be used for this
C          here as in func_n because it already holds the carry
60 PROLOGUE(func_nc)
61 FUNC_ENTRY(4)
C DOS64 only: load cy from the stack.  NOTE(review): the offset 56 presumably
C accounts for registers pushed by FUNC_ENTRY -- confirm in its definition.
62 IFDOS(` mov 56(%rsp), %r8 ')
63 push %rbp
64 mov cy, %rax
65 neg %rax C set msb on carry
66 xor R32(%rbp), R32(%rbp) C limb carry
67 mov (vp), %r8
68 shrd $RSH, %r8, %rbp
69 mov R32(n), R32(%r9)
70 and $3, R32(%r9)
71 je L(b00)
72 cmp $2, R32(%r9)
73 jc L(b01)
74 je L(b10)
75 jmp L(b11)
76 EPILOGUE()
C func_n -- entry point without carry-in, followed by the code shared with
C func_nc.  Per the file header the result is rp[] = up[] + (vp[] << C) for
C the addlsh build and rp[] = (vp[] << C) - up[] for the rsblsh build.  The
C macros func_n, ADCSBB and RSH are defined outside this file; ADCSBB is
C presumably adc or sbb and RSH presumably 64 - C -- confirm in the
C including file.
C
C Register roles in the shared code:
C   rp, up, vp  limb pointers, advanced after every group
C   n           limbs still to process
C   rbp         first limb of the current group; between groups it holds the
C               last vp limb loaded, unshifted, to feed the next shrd
C   r8-r11      further vp limbs of the current group
C   rax         carry between groups: sbb eax,eax saves CF as 0 or -1 and
C               add eax,eax turns that back into CF
C Return value: rax = the top vp limb shifted right by RSH, adjusted by the
C final carry through ADCSBB $0.
C
C L(b01), L(b10) and L(b11) process the first n mod 4 limbs so that L(top)
C only ever runs whole groups of four; L(b00) enters the loop directly.
78 ALIGN(16)
79 PROLOGUE(func_n)
80 FUNC_ENTRY(4)
81 push %rbp C work register, restored before returning
82 xor R32(%rbp), R32(%rbp) C limb carry
83 mov (vp), %r8
84 shrd $RSH, %r8, %rbp C rbp = vp[0] << (64 - RSH)
85 mov R32(n), R32(%rax)
86 and $3, R32(%rax) C eax = n mod 4; at most 3, so add eax,eax clears CF
87 je L(b00)
88 cmp $2, R32(%rax)
89 jc L(b01)
90 je L(b10)
92 L(b11): mov 8(vp), %r9
93 shrd $RSH, %r9, %r8
94 mov 16(vp), %r10
95 shrd $RSH, %r10, %r9
96 add R32(%rax), R32(%rax) C init carry flag
97 ADCSBB (up), %rbp
98 ADCSBB 8(up), %r8
99 ADCSBB 16(up), %r9
100 mov %rbp, (rp)
101 mov %r8, 8(rp)
102 mov %r9, 16(rp)
103 mov %r10, %rbp C keep unshifted top limb for the next shrd
104 lea 24(up), up
105 lea 24(vp), vp
106 lea 24(rp), rp
107 sbb R32(%rax), R32(%rax) C save carry flag
108 sub $3, n
109 ja L(top)
110 jmp L(end)
112 L(b01): add R32(%rax), R32(%rax) C init carry flag
113 ADCSBB (up), %rbp
114 mov %rbp, (rp)
115 mov %r8, %rbp C keep unshifted top limb for the next shrd
116 lea 8(up), up
117 lea 8(vp), vp
118 lea 8(rp), rp
119 sbb R32(%rax), R32(%rax) C save carry flag
120 sub $1, n
121 ja L(top)
122 jmp L(end)
124 L(b10): mov 8(vp), %r9
125 shrd $RSH, %r9, %r8
126 add R32(%rax), R32(%rax) C init carry flag
127 ADCSBB (up), %rbp
128 ADCSBB 8(up), %r8
129 mov %rbp, (rp)
130 mov %r8, 8(rp)
131 mov %r9, %rbp C keep unshifted top limb for the next shrd
132 lea 16(up), up
133 lea 16(vp), vp
134 lea 16(rp), rp
135 sbb R32(%rax), R32(%rax) C save carry flag
136 sub $2, n
137 ja L(top)
138 jmp L(end)
140 ALIGN(16)
141 L(top): mov (vp), %r8
142 shrd $RSH, %r8, %rbp C merge previous top limb with the new low limb
143 L(b00): mov 8(vp), %r9
144 shrd $RSH, %r9, %r8
145 mov 16(vp), %r10
146 shrd $RSH, %r10, %r9
147 mov 24(vp), %r11
148 shrd $RSH, %r11, %r10
149 lea 32(vp), vp
150 add R32(%rax), R32(%rax) C restore carry flag
151 ADCSBB (up), %rbp
152 ADCSBB 8(up), %r8
153 ADCSBB 16(up), %r9
154 ADCSBB 24(up), %r10
155 lea 32(up), up
156 mov %rbp, (rp)
157 mov %r8, 8(rp)
158 mov %r9, 16(rp)
159 mov %r10, 24(rp)
160 mov %r11, %rbp C keep unshifted top limb for the next shrd
161 lea 32(rp), rp
162 sbb R32(%rax), R32(%rax) C save carry flag
163 sub $4, n
164 jnz L(top)
166 L(end): shr $RSH, %rbp C bits shifted out of the top vp limb
167 add R32(%rax), R32(%rax) C restore carry flag
168 ADCSBB $0, %rbp
169 mov %rbp, %rax C return value
170 pop %rbp
171 FUNC_EXIT()
172 ret C explicit return; without it execution runs past the function end
173 EPILOGUE()