beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / sublsh1_n.asm
blobc6d829fcb2d35bd7ab9c8a310826f99870c43c70
1 dnl AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
3 dnl Copyright 2003, 2005-2007, 2011, 2012 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C cycles/limb
35 C AMD K8,K9 2.2
36 C AMD K10 2.2
37 C Intel P4 12.75
38 C Intel core2 3.45
39 C Intel corei ?
40 C Intel atom ?
41 C VIA nano 3.25
43 C Sometimes speed degenerates, presumably because some operand alignments
44 C cause cache conflicts.
46 C The speed is limited by decoding/issue bandwidth. There are 26 instructions
47 C in the loop, which corresponds to 26/3/4 = 2.167 c/l.
C INPUT PARAMETERS
C   rp   destination limb vector, n limbs are written
C   up   minuend limb vector, n limbs are read
C   vp   limb vector that is doubled and then subtracted, n limbs are read
C   n    limb count
define(`rp',`%rdi')
define(`up',`%rsi')
define(`vp',`%rdx')
define(`n', `%rcx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C ---------------------------------------------------------------------------
C mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1), returning the limb that
C overflows off the top.
C
C Return value (%rax): shift carry + subtract borrow, i.e. 0, 1 or 2.  It is
C built at L(end) as -(scy_mask + acy_mask) with 32-bit operations, which
C also clear the upper half of %rax.
C
C Precondition: n >= 1.  With n = 0 the dispatch below would take the b00
C path and process four limbs.
C
C Register roles:
C   %rax       n mod 4 during dispatch, afterwards the saved shift carry
C              (scy) as a 0 / -1 mask
C   %rbp       saved subtract borrow (acy) as a 0 / -1 mask between chunks,
C              data temporary inside a chunk
C   %rbx       data temporary
C   %r8-%r11   doubled vp limbs
C   n          negative limb index counting up towards zero
C
C %rbx and %rbp are saved and restored here with push/pop.  FUNC_ENTRY and
C FUNC_EXIT come from the included m4 definitions, not from this file
C (NOTE(review): presumably they adapt the DOS64 argument registers and do
C not emit the return instruction themselves -- confirm against config.m4).
C
C Carry handling: the shift chain (adc) and the subtract chain (sbb) both
C need CF, so each chain parks its carry in a register with sbb reg,reg
C (reg = -CF) and revives it later with add reg,reg (CF = 1 iff reg = -1).
C ---------------------------------------------------------------------------

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sublsh1_n)
	FUNC_ENTRY(4)
	push	%rbx
	push	%rbp

	mov	(vp), %r8			C preload vp[0] for every entry path
	mov	R32(n), R32(%rax)
	lea	(rp,n,8), rp			C point all three vectors at their ends
	lea	(up,n,8), up
	lea	(vp,n,8), vp
	neg	n				C index runs from -n up to 0
	xor	R32(%rbp), R32(%rbp)		C acy mask = 0 for the b00 entry
	and	$3, R32(%rax)			C n mod 4; also leaves CF = 0
	je	L(b00)				C n mod 4 = 0: enter the loop body
	cmp	$2, R32(%rax)
	jc	L(b01)				C n mod 4 = 1
	je	L(b10)				C n mod 4 = 2
						C n mod 4 = 3 falls through

C Handle three leading limbs.
L(b11):	add	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	mov	16(vp,n,8), %r10
	adc	%r10, %r10
	sbb	R32(%rax), R32(%rax)		C save scy
	mov	(up,n,8), %rbp
	mov	8(up,n,8), %rbx
	sub	%r8, %rbp
	sbb	%r9, %rbx
	mov	%rbp, (rp,n,8)
	mov	%rbx, 8(rp,n,8)
	mov	16(up,n,8), %rbp
	sbb	%r10, %rbp
	mov	%rbp, 16(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)		C save acy
	add	$3, n
	jmp	L(ent)

C Handle two leading limbs.
L(b10):	add	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	sbb	R32(%rax), R32(%rax)		C save scy
	mov	(up,n,8), %rbp
	mov	8(up,n,8), %rbx
	sub	%r8, %rbp
	sbb	%r9, %rbx
	mov	%rbp, (rp,n,8)
	mov	%rbx, 8(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)		C save acy
	add	$2, n
	jmp	L(ent)

C Handle one leading limb.
L(b01):	add	%r8, %r8
	sbb	R32(%rax), R32(%rax)		C save scy
	mov	(up,n,8), %rbp
	sub	%r8, %rbp
	mov	%rbp, (rp,n,8)
	sbb	R32(%rbp), R32(%rbp)		C save acy
	inc	n
L(ent):	jns	L(end)				C index reached 0: nothing left

C Main loop, four limbs per iteration.
	ALIGN(16)
L(top):	add	R32(%rax), R32(%rax)		C restore scy

	mov	(vp,n,8), %r8
C Entry for n mod 4 = 0: CF is 0 here and %r8 already holds vp[0].
L(b00):	adc	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	mov	16(vp,n,8), %r10
	adc	%r10, %r10
	mov	24(vp,n,8), %r11
	adc	%r11, %r11

	sbb	R32(%rax), R32(%rax)		C save scy
	add	R32(%rbp), R32(%rbp)		C restore acy

	mov	(up,n,8), %rbp
	mov	8(up,n,8), %rbx
	sbb	%r8, %rbp
	sbb	%r9, %rbx
	mov	%rbp, (rp,n,8)
	mov	%rbx, 8(rp,n,8)
	mov	16(up,n,8), %rbp
	mov	24(up,n,8), %rbx
	sbb	%r10, %rbp
	sbb	%r11, %rbx
	mov	%rbp, 16(rp,n,8)
	mov	%rbx, 24(rp,n,8)

	sbb	R32(%rbp), R32(%rbp)		C save acy
	add	$4, n
	js	L(top)

C Combine the two 0 / -1 masks into the non-negative return value.
L(end):	add	R32(%rbp), R32(%rax)
	neg	R32(%rax)

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret					C return to the caller; was missing
EPILOGUE()