dnl  (extraction note: the three lines below were gitweb viewer residue, not
dnl  part of the source file; kept here as m4 comments for provenance)
dnl  beta-0.89.2
dnl  [luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / core2 / sublshC_n.asm
dnl  blob 5acc46b032a49f6b7e8f6196fa4ed56fd6268f69
dnl  AMD64 mpn_sublshC_n -- rp[] = up[] - (vp[] << LSH), optimised for Core 2
dnl  and Core iN.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
C	     cycles/limb
C AMD K8,K9	 4.25
C AMD K10	 ?
C Intel P4	 ?
C Intel core2	 3
C Intel NHM	 3.1
C Intel SBR	 2.47
C Intel atom	 ?
C VIA nano	 ?
44 C INPUT PARAMETERS
45 define(`rp',`%rdi')
46 define(`up',`%rsi')
47 define(`vp',`%rdx')
48 define(`n', `%rcx')
50 ASM_START()
51 TEXT
52 ALIGN(8)
53 PROLOGUE(func)
54 FUNC_ENTRY(4)
55 push %rbx
56 push %r12
58 mov R32(%rcx), R32(%rax)
59 lea 24(up,n,8), up
60 lea 24(vp,n,8), vp
61 lea 24(rp,n,8), rp
62 neg n
64 xor R32(%r11), R32(%r11)
66 mov -24(vp,n,8), %r8 C do first limb early
67 shrd $RSH, %r8, %r11
69 and $3, R32(%rax)
70 je L(b0)
71 cmp $2, R32(%rax)
72 jc L(b1)
73 je L(b2)
75 L(b3): mov -16(vp,n,8), %r9
76 shrd $RSH, %r9, %r8
77 mov -8(vp,n,8), %r10
78 shrd $RSH, %r10, %r9
79 mov -24(up,n,8), %r12
80 ADDSUB %r11, %r12
81 mov %r12, -24(rp,n,8)
82 mov -16(up,n,8), %r12
83 ADCSBB %r8, %r12
84 mov %r12, -16(rp,n,8)
85 mov -8(up,n,8), %r12
86 ADCSBB %r9, %r12
87 mov %r12, -8(rp,n,8)
88 mov %r10, %r11
89 sbb R32(%rax), R32(%rax) C save cy
90 add $3, n
91 js L(top)
92 jmp L(end)
94 L(b1): mov -24(up,n,8), %r12
95 ADDSUB %r11, %r12
96 mov %r12, -24(rp,n,8)
97 mov %r8, %r11
98 sbb R32(%rax), R32(%rax) C save cy
99 inc n
100 js L(top)
101 jmp L(end)
103 L(b2): mov -16(vp,n,8), %r9
104 shrd $RSH, %r9, %r8
105 mov -24(up,n,8), %r12
106 ADDSUB %r11, %r12
107 mov %r12, -24(rp,n,8)
108 mov -16(up,n,8), %r12
109 ADCSBB %r8, %r12
110 mov %r12, -16(rp,n,8)
111 mov %r9, %r11
112 sbb R32(%rax), R32(%rax) C save cy
113 add $2, n
114 js L(top)
115 jmp L(end)
117 ALIGN(16)
118 L(top): mov -24(vp,n,8), %r8
119 shrd $RSH, %r8, %r11
120 L(b0): mov -16(vp,n,8), %r9
121 shrd $RSH, %r9, %r8
122 mov -8(vp,n,8), %r10
123 shrd $RSH, %r10, %r9
124 mov (vp,n,8), %rbx
125 shrd $RSH, %rbx, %r10
127 add R32(%rax), R32(%rax) C restore cy
129 mov -24(up,n,8), %r12
130 ADCSBB %r11, %r12
131 mov %r12, -24(rp,n,8)
133 mov -16(up,n,8), %r12
134 ADCSBB %r8, %r12
135 mov %r12, -16(rp,n,8)
137 mov -8(up,n,8), %r12
138 ADCSBB %r9, %r12
139 mov %r12, -8(rp,n,8)
141 mov (up,n,8), %r12
142 ADCSBB %r10, %r12
143 mov %r12, (rp,n,8)
145 mov %rbx, %r11
146 sbb R32(%rax), R32(%rax) C save cy
148 add $4, n
149 js L(top)
151 L(end): shr $RSH, %r11
152 pop %r12
153 pop %rbx
154 sub R32(%r11), R32(%rax)
155 neg R32(%rax)
156 FUNC_EXIT()
158 EPILOGUE()