beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / lshsub_n.asm
blob4d428c0bd25f298665d899bdfc23be8a18121761
1 dnl AMD64 mpn_lshsub_n. R = 2^k(U - V).
3 dnl Copyright 2006, 2011, 2012 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C cycles/limb
35 C AMD K8,K9 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l)
36 C AMD K10 3.15 (mpn_sub_n + mpn_lshift costs about 4 c/l)
37 C Intel P4 16.5
38 C Intel core2 4.35
39 C Intel corei ?
40 C Intel atom ?
41 C VIA nano ?
43 C This was written quickly and not optimized at all, but it runs very well on
44 C K8. But perhaps one could get under 3 c/l. Ideas:
45 C 1) Use indexing to save the 3 LEA
46 C 2) Write reasonable feed-in code
47 C 3) Be more clever about register usage
48 C 4) Unroll more, handling CL negation, carry save/restore cost much now
49 C 5) Reschedule
C INPUT PARAMETERS
C
C Symbolic names for the incoming argument registers.  n and cnt are only
C meaningful on entry: the function copies n into %rax and the shift count
C into %rcx (which overwrites n), and afterwards reuses %r8 as a limb register.
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`vp',	`%rdx')
define(`n',	`%rcx')
define(`cnt',	`%r8')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C mpn_lshsub_n (rp, up, vp, n, cnt)
C
C Walks the n 64-bit limbs at up and vp from the lowest address upwards.  Each
C vp limb is subtracted with borrow from the matching up limb, the difference
C is shifted left by cnt bits, the bits shifted out of the previous difference
C limb are ORed into its low end, and the result is stored at rp.
C
C Returns in %rax the bits shifted out of the last difference limb plus the
C final borrow shifted left by cnt.
C
C Register roles after the set-up code:
C   %rax	limb count, counted down by 4 in the main loop; return value
C   %rbx	saved borrow, 0 or 0xffffffff (written by sbb of itself)
C   %cl		shift count; negated around each shr to shift right by 64-cnt
C   %r15	bits carried from the previous limb into the next result limb
C   %r8-%r11	difference limbs (%r11 is also the feed-in loop counter)
C   %r12-%r14	unshifted copies of the difference limbs, for the right shifts
C
C NOTE(review): the neg/shr pairs produce the carried-out bits only for
C 0 < cnt < 64; with cnt = 0 the right shift would be by 0 and the whole
C previous limb would be ORed in.  Confirm callers never pass cnt = 0.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_lshsub_n)
	FUNC_ENTRY(4)
C On the DOS64 ABI the fifth argument (shift count) is read from the stack.
C NOTE(review): the offset 56 depends on what FUNC_ENTRY pushes; that macro is
C defined outside this file - confirm.
IFDOS(`	mov	56(%rsp), %r8d	')

C Callee-saved registers used below; restored in mirror order before return.
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	push	%rbx

	mov	n, %rax			C free %rcx for the shift count
	xor	R32(%rbx), R32(%rbx)	C clear carry save register
	mov	R32(%r8), R32(%rcx)	C shift count
	xor	R32(%r15), R32(%r15)	C limb carry

C Feed-in: handle n mod 4 limbs one at a time so that the main loop can work
C on groups of four.  %r11 is preset to (n mod 4) - 1 and the loop exits when
C the decrement borrows.
	mov	R32(%rax), R32(%r11)
	and	$3, R32(%r11)
	je	L(4)
	sub	$1, R32(%r11)

L(oopette):
	add	R32(%rbx), R32(%rbx)	C restore carry flag
	mov	0(up), %r8
	lea	8(up), up		C mov and lea leave the carry flag alone
	sbb	0(vp), %r8
	mov	%r8, %r12		C keep the unshifted difference
	sbb	R32(%rbx), R32(%rbx)	C save carry flag
	shl	R8(%rcx), %r8
	or	%r15, %r8		C bits carried from the previous limb
	mov	%r12, %r15
	lea	8(vp), vp
	neg	R8(%rcx)		C shift count becomes 64-cnt
	shr	R8(%rcx), %r15		C bits to carry into the next limb
	neg	R8(%rcx)		C back to cnt
	mov	%r8, 0(rp)
	lea	8(rp), rp
	sub	$1, R32(%r11)
	jnc	L(oopette)

C %rax still holds the full n.  Subtracting 4 per pass makes the main loop run
C once per complete group of four limbs; the low two bits (already handled by
C the feed-in loop) never stop the borrow that ends the loop.
L(4):
	sub	$4, %rax
	jc	L(end)

	ALIGN(16)
L(oop):
	add	R32(%rbx), R32(%rbx)	C restore carry flag

	mov	0(up), %r8
	mov	8(up), %r9
	mov	16(up), %r10
	mov	24(up), %r11

	lea	32(up), up

C Borrow chain across the four limbs; %r12-%r14 keep unshifted copies of the
C three low differences (the fourth is copied into %r15 further down).
	sbb	0(vp), %r8
	mov	%r8, %r12
	sbb	8(vp), %r9
	mov	%r9, %r13
	sbb	16(vp), %r10
	mov	%r10, %r14
	sbb	24(vp), %r11

	sbb	R32(%rbx), R32(%rbx)	C save carry flag

	shl	R8(%rcx), %r8
	shl	R8(%rcx), %r9
	shl	R8(%rcx), %r10
	or	%r15, %r8		C carry-in from the previous pass
	mov	%r11, %r15		C unshifted top limb, before it is shifted
	shl	R8(%rcx), %r11

	lea	32(vp), vp

	neg	R8(%rcx)		C shift count becomes 64-cnt

	shr	R8(%rcx), %r12
	shr	R8(%rcx), %r13
	shr	R8(%rcx), %r14
	shr	R8(%rcx), %r15		C used next loop

	or	%r12, %r9
	or	%r13, %r10
	or	%r14, %r11

	neg	R8(%rcx)		C back to cnt

	mov	%r8, 0(rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)

	lea	32(rp), rp

	sub	$4, %rax
	jnc	L(oop)

C Build the return value from the saved borrow and the last carried-out bits.
L(end):
	neg	R32(%rbx)		C 0xffffffff becomes 1, 0 stays 0
	shl	R8(%rcx), %rbx		C borrow moved up by cnt bits
	adc	%r15, %rbx		C add carried-out bits (plus CF left by shl)
	mov	%rbx, %rax
	pop	%rbx
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12

	FUNC_EXIT()
C NOTE(review): FUNC_EXIT is defined outside this file and is expected only to
C undo FUNC_ENTRY, not to return; without this ret execution would run past
C the end of the function.
	ret
EPILOGUE()