beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / arm / neon / sec_tabselect.asm
blob69fceb00635cd7ac9b5b472f20eeaecb4a194a36
1 dnl ARM Neon mpn_sec_tabselect.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2011-2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
36 C cycles/limb
37 C StrongARM -
38 C XScale -
39 C Cortex-A7 ?
40 C Cortex-A8 ?
41 C Cortex-A9 1.15
42 C Cortex-A15 0.65
C Register aliases used by mpn_sec_tabselect below.
44 define(`rp', `r0') C result pointer, destination of the vst1 stores
45 define(`tp', `r1') C table pointer, source of the vld1 loads
46 define(`n', `r2') C 32-bit elements per table entry, n*4 is the byte stride between entries
47 define(`nents', `r3') C number of table entries, initial value of the inner loop counter
48 C define(`which', on stack)
C which is read from the stack in the prologue and compared against a running
C entry index to build the selection mask.
50 define(`i', `r4') C inner loop counter, counts entries down from nents
51 define(`j', `r5') C outer loop counter, n minus the elements handled so far, in steps of 8
53 define(`maskq', `q10') C 128-bit selection mask produced by vceq
54 define(`maskd', `d20') C 64-bit selection mask, d20 is the low half of q10
56 ASM_START()
C mpn_sec_tabselect
C
C Copies one entry of a table to rp. The table at tp holds nents entries of
C n 32-bit elements each, stored back to back. The entry whose index equals
C which is the one that ends up at rp.
C
C Inputs:  rp (r0), tp (r1), n (r2), nents (r3), which (first stack word).
C Output:  n elements written at rp.
C Saved and restored: r4, r5.
C Also written: r12, q0-q3, q10, q13-q15, flags.
C
C Method: the entry is processed in column slices of 8, 4, 2 and 1 elements.
C For each slice every one of the nents entries is loaded, and a lane mask
C (running index == which) decides whether the loaded bits are merged into
C the accumulator. No branch and no load address depends on which.
C
C NOTE(review): the inner loops decrement i before testing it, so nents is
C assumed to be nonzero on entry -- confirm against callers.
57 PROLOGUE(mpn_sec_tabselect)
58 push {r4-r5} C save i and j, restored at L(b000)
60 add r4, sp, #8 C address of which, just above the two words pushed
61 vld1.32 {d30[], d31[]}, [r4] C 4 `which' copies
62 vmov.i32 q14, #1 C 4 copies of 1
64 subs j, n, #8 C j = n - 8
65 bmi L(outer_end) C fewer than 8 elements, go straight to the tails
C Main loop: one pass per slice of 8 elements (32 bytes).
67 L(outer_top):
68 mov i, nents
69 mov r12, tp C preserve tp
70 veor q13, q13, q13 C 4 counter copies
71 veor q2, q2, q2 C clear accumulator, elements 0-3 of the slice
72 veor q3, q3, q3 C clear accumulator, elements 4-7 of the slice
73 ALIGN(16)
74 L(top): vceq.i32 maskq, q13, q15 C compare idx copies to `which' copies
75 vld1.32 {q0,q1}, [tp] C load this entry slice, 8 elements
76 vadd.i32 q13, q13, q14 C advance the entry index in every lane
77 vbit q2, q0, maskq C merge q0 into q2 where the mask bits are set
78 vbit q3, q1, maskq C merge q1 into q3 where the mask bits are set
79 add tp, tp, n, lsl #2 C step to the same slice of the next entry
80 subs i, i, #1
81 bne L(top)
82 vst1.32 {q2,q3}, [rp]! C store 8 selected elements, rp advances
83 add tp, r12, #32 C restore tp, point to next slice
84 subs j, j, #8
85 bpl L(outer_top) C continue while at least 8 elements remain
86 L(outer_end):
C Tail: bit 2 of n set means a 4-element slice remains.
88 tst n, #4
89 beq L(b0xx)
90 L(b1xx):mov i, nents
91 mov r12, tp C remember the slice start in the first entry
92 veor q13, q13, q13 C entry index = 0 in every lane
93 veor q2, q2, q2 C clear accumulator
94 ALIGN(16)
95 L(tp4): vceq.i32 maskq, q13, q15 C all ones in each lane when index == which
96 vld1.32 {q0}, [tp] C load 4 elements of this entry
97 vadd.i32 q13, q13, q14
98 vbit q2, q0, maskq
99 add tp, tp, n, lsl #2
100 subs i, i, #1
101 bne L(tp4)
102 vst1.32 {q2}, [rp]! C store 4 selected elements, rp advances
103 add tp, r12, #16 C back to the first entry, past these 4 elements
C Tail: bit 1 of n set means a 2-element slice remains. This part works on
C 64-bit halves: d26, d28, d30 and d4 are the low halves of q13, q14, q15 and q2.
105 L(b0xx):tst n, #2
106 beq L(b00x)
107 L(b01x):mov i, nents
108 mov r12, tp C remember the slice start in the first entry
109 veor d26, d26, d26 C entry index = 0 in both lanes
110 veor d4, d4, d4 C clear accumulator
111 ALIGN(16)
112 L(tp2): vceq.i32 maskd, d26, d30
113 vld1.32 {d0}, [tp] C load 2 elements of this entry
114 vadd.i32 d26, d26, d28
115 vbit d4, d0, maskd
116 add tp, tp, n, lsl #2
117 subs i, i, #1
118 bne L(tp2)
119 vst1.32 {d4}, [rp]! C store 2 selected elements, rp advances
120 add tp, r12, #8 C back to the first entry, past these 2 elements
C Tail: bit 0 of n set means a single element remains.
122 L(b00x):tst n, #1
123 beq L(b000)
124 L(b001):mov i, nents
125 mov r12, tp C r12 is not read again after this point
126 veor d26, d26, d26 C entry index = 0 in both lanes
127 veor d4, d4, d4 C clear accumulator
128 ALIGN(16)
129 L(tp1): vceq.i32 maskd, d26, d30
130 vld1.32 {d0[0]}, [tp] C load 1 element into lane 0, lane 1 of d0 is left as it was
131 vadd.i32 d26, d26, d28
132 vbit d4, d0, maskd
133 add tp, tp, n, lsl #2
134 subs i, i, #1
135 bne L(tp1)
136 vst1.32 {d4[0]}, [rp] C store lane 0 only, lane 1 of d4 is never written out
138 L(b000):pop {r4-r5} C restore the registers saved in the prologue
139 bx r14 C return through the link register
140 EPILOGUE()