new beta-0.90.0
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / sec_tabselect.asm
blobe8aed261ef35a1b68ea03ff305b18ca02fa4880f
1 dnl AMD64 mpn_sec_tabselect.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2011-2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
36 C cycles/limb good for cpu
37 C AMD K8,K9 1.5 Y
38 C AMD K10 1.4
39 C AMD bd1 2.64
40 C AMD bobcat 2.15 Y
41 C Intel P4 4
42 C Intel core2 1.38
43 C Intel NHM 1.75
44 C Intel SBR 1.25
45 C Intel atom 2.5 Y
46 C VIA nano 1.75 Y
48 C NOTES
49 C * This has not been tuned for any specific processor. Its speed should not
50 C be too bad, though.
51 C * Using SSE2/AVX2 could result in many-fold speedup.
52 C * Handles any n: 4 limbs per pass, then a 2-limb and a 1-limb tail pass.
54 C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
55 define(`rp', `%rdi') C destination pointer; receives the selected limbs
56 define(`tp', `%rsi') C table pointer; advanced while the entries are scanned
57 define(`n', `%rdx') C limbs per table entry (entries are n*8 bytes apart)
58 define(`nents', `%rcx') C number of table entries scanned per pass
59 define(`which', `%r8') C index of the entry to select
61 define(`i', `%rbp') C inner counter: entries left to scan in the current pass
62 define(`j', `%r9') C outer counter: starts at n-4 and steps by -4
64 C rax  rbx  rcx    rdx  rdi  rsi  rbp  r8     r9   r10  r11  r12  r13  r14  r15
65 C -    -    nents  n    rp   tab  i    which  j    *    *    *    *    *    *
67 ABI_SUPPORT(DOS64)
68 ABI_SUPPORT(STD64)
70 ASM_START()
71 TEXT
72 ALIGN(16)
C mpn_sec_tabselect (rp, tp, n, nents, which)
C
C Stores into rp[0..n-1] the n limbs of table entry number which.  The table
C at tp is read as nents entries of n limbs (8 bytes each), back to back.
C Every entry is loaded on every call; the wanted one is kept by AND-ing
C with an all-ones or all-zeros mask and OR-ing into accumulators, so no
C branch or load address depends on which.  If no entry index equals which,
C zeros are stored.
C
C Limbs are handled in column blocks: 4 per pass while at least 4 remain,
C then one 2-limb pass if bit 1 of n is set, then one 1-limb pass if bit 0
C is set.
C
C Registers: rbx, rbp, r12-r15 are saved and restored here; rax, rdi, rsi,
C r9, r10, r11 and the flags are modified.
C NOTE(review): the inner loops decrement before testing, so nents = 0 would
C wrap around; callers presumably guarantee nents >= 1 - confirm.
73 PROLOGUE(mpn_sec_tabselect)
74 FUNC_ENTRY(4) C presumably ABI-specific entry glue for 4 register arguments
75 IFDOS(` mov 56(%rsp), %r8d ') C which is read from the stack (32-bit load, zero-extended); IFDOS presumably emits this for DOS64 only
77 push %rbx C save the callee-saved registers used below
78 push %rbp
79 push %r12
80 push %r13
81 push %r14
82 push %r15
84 mov n, j
85 add $-4, j C j = n - 4
86 js L(outer_end) C fewer than 4 limbs: skip the 4-limb passes
C ---- 4-limb passes: one trip through the whole table per block of 4 limbs
88 L(outer_top):
89 mov nents, i C i = entries left to scan in this pass
90 push tp C remember the start of this column block
91 xor R32(%r12), R32(%r12) C clear the four accumulators
92 xor R32(%r13), R32(%r13)
93 xor R32(%r14), R32(%r14)
94 xor R32(%r15), R32(%r15)
95 mov which, %rbx C rbx counts down to the wanted entry
97 ALIGN(16)
98 L(top): sub $1, %rbx C borrows only if rbx was 0, i.e. at entry number which
99 sbb %rax, %rax C rax = all ones after a borrow, else 0
100 mov 0(tp), %r10 C load limbs 0 and 1 of this entry
101 mov 8(tp), %r11
102 and %rax, %r10 C keep them only for the wanted entry
103 and %rax, %r11
104 or %r10, %r12 C merge into the accumulators
105 or %r11, %r13
106 mov 16(tp), %r10 C same for limbs 2 and 3
107 mov 24(tp), %r11
108 and %rax, %r10
109 and %rax, %r11
110 or %r10, %r14
111 or %r11, %r15
112 lea (tp,n,8), tp C same limbs of the next entry, n*8 bytes further on
113 add $-1, i
114 jne L(top) C until all nents entries have been visited
116 mov %r12, 0(rp) C store the 4 selected limbs
117 mov %r13, 8(rp)
118 mov %r14, 16(rp)
119 mov %r15, 24(rp)
120 pop tp C back to the start of this column block
121 lea 32(tp), tp C advance source and destination by 4 limbs
122 lea 32(rp), rp
123 add $-4, j
124 jns L(outer_top) C another full block of 4 limbs remains
125 L(outer_end):
C ---- 2-limb tail pass
127 test $2, R8(n) C bit 1 of n set? (R8 presumably names the low byte of n)
128 jz L(b0x)
129 L(b1x): mov nents, i
130 push tp C tp is still needed by a possible 1-limb pass
131 xor R32(%r12), R32(%r12) C clear the two accumulators
132 xor R32(%r13), R32(%r13)
133 mov which, %rbx
134 ALIGN(16)
135 L(tp2): sub $1, %rbx C borrow marks entry number which
136 sbb %rax, %rax C rax = all ones after a borrow, else 0
137 mov 0(tp), %r10
138 mov 8(tp), %r11
139 and %rax, %r10
140 and %rax, %r11
141 or %r10, %r12
142 or %r11, %r13
143 lea (tp,n,8), tp C next entry
144 add $-1, i
145 jne L(tp2)
146 mov %r12, 0(rp) C store the 2 selected limbs
147 mov %r13, 8(rp)
148 pop tp
149 lea 16(tp), tp C advance source and destination by 2 limbs
150 lea 16(rp), rp
C ---- 1-limb tail pass
152 L(b0x): test $1, R8(n) C bit 0 of n set?
153 jz L(b00)
154 L(b01): mov nents, i C tp is not saved here: nothing reads it afterwards
155 xor R32(%r12), R32(%r12) C clear the accumulator
156 mov which, %rbx
157 ALIGN(16)
158 L(tp1): sub $1, %rbx C borrow marks entry number which
159 sbb %rax, %rax C rax = all ones after a borrow, else 0
160 mov 0(tp), %r10
161 and %rax, %r10
162 or %r10, %r12
163 lea (tp,n,8), tp C next entry
164 add $-1, i
165 jne L(tp1)
166 mov %r12, 0(rp) C store the last selected limb
168 L(b00): pop %r15 C restore in reverse order of the pushes
169 pop %r14
170 pop %r13
171 pop %r12
172 pop %rbp
173 pop %rbx
174 FUNC_EXIT() C presumably ABI-specific exit glue matching FUNC_ENTRY
175 ret C return to the caller
176 EPILOGUE()