beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / arm64 / sec_tabselect.asm
blobbf2962cfa1bb5ed191da4151f1b4dde397e8afa3
dnl  ARM64 Neon mpn_sec_tabselect.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2011-2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C	     cycles/limb
C Cortex-A53	 ?
C Cortex-A57	 ?

C void
C mpn_sec_tabselect (mp_ptr rp, mp_srcptr tab,
C		     mp_size_t n, mp_size_t nents, mp_size_t which)
C
C tab is one contiguous array of nents entries of n limbs each; the code
C below reads limbs straight through the table pointer and steps it by
C 8*n bytes per entry.

C Move the m4 comment delimiter away from the hash sign, so that operands
C and trailing C comments written after an immediate are still processed.
changecom(@&*$)

C Incoming arguments, in argument-register order.
define(`rp',     `x0')
define(`tp',     `x1')
define(`n',      `x2')
define(`nents',  `x3')
define(`which',  `x4')

C Loop counters.
define(`i',      `x5')
define(`j',      `x6')

C Vector register holding the per-entry selection mask.
define(`maskq',  `v4')

ASM_START()
C mpn_sec_tabselect: copy entry number which (counting from 0) of the table
C at tp to rp.  The table is read as nents entries of n limbs each, stored
C back to back.  Every entry is loaded whatever the value of which; the
C wanted one is kept by means of a compare mask, so the sequence of loads,
C stores and branches does not depend on which.
C
C The result is produced in column slices: 4 limbs per pass of the outer
C loop, followed by a 2-limb and a 1-limb tail chosen by bits 1 and 0 of n.
C For each slice the inner loop visits all nents entries, advancing tp by
C 8*n bytes per entry.
C
C Register use:
C   v7      which, replicated in both 64-bit lanes
C   v6      1, replicated in both 64-bit lanes
C   v5      index of the entry being visited, in both lanes
C   maskq   all ones in a lane when v5 equals v7 there, else zero
C   v2,v3   result limbs accumulated for the current slice
C   v0,v1   limbs loaded from the entry being visited
C   i       entries left to visit in the inner loop
C   j       limbs left beyond the current 4-limb slice; negative ends the loop
C   x10     scratch for the constant 1
C   x12     value of tp at the start of the current slice
C
C NOTE: the inner loops decrement i before testing it, so nents is assumed
C to be at least 1; nents = 0 would wrap the counter.

PROLOGUE(mpn_sec_tabselect)
	dup	v7.2d, which			C which in both lanes

	mov	x10, #1
	dup	v6.2d, x10			C index increment in both lanes

	subs	j, n, #4
	b.mi	L(outer_end)			C fewer than 4 limbs, tails only

L(outer_top):
	mov	i, nents
	mov	x12, tp				C remember start of this slice
	movi	v5.16b, #0			C entry index starts at 0
	movi	v2.16b, #0			C clear the accumulators
	movi	v3.16b, #0
	ALIGN(16)
L(tp4):	cmeq	maskq.2d, v5.2d, v7.2d		C mask = (index == which)
	ld1	{v0.2d,v1.2d}, [tp]		C 4 limbs of this entry
	add	v5.2d, v5.2d, v6.2d		C index += 1
	bit	v2.16b, v0.16b, maskq.16b	C take the limbs where mask is set
	bit	v3.16b, v1.16b, maskq.16b
	add	tp, tp, n, lsl #3		C same slice of the next entry
	sub	i, i, #1
	cbnz	i, L(tp4)
	st1	{v2.2d,v3.2d}, [rp], #32
	add	tp, x12, #32			C first entry again, next slice
	subs	j, j, #4
	b.pl	L(outer_top)
L(outer_end):

	tbz	n, #1, L(b0x)			C bit 1 of n clear, no 2-limb tail
	mov	i, nents
	mov	x12, tp				C remember start of this slice
	movi	v5.16b, #0			C entry index starts at 0
	movi	v2.16b, #0
	ALIGN(16)
L(tp2):	cmeq	maskq.2d, v5.2d, v7.2d
	ld1	{v0.2d}, [tp]			C 2 limbs of this entry
	add	v5.2d, v5.2d, v6.2d
	bit	v2.16b, v0.16b, maskq.16b
	add	tp, tp, n, lsl #3
	sub	i, i, #1
	cbnz	i, L(tp2)
	st1	{v2.2d}, [rp], #16
	add	tp, x12, #16			C first entry again, next slice

L(b0x):	tbz	n, #0, L(b00)			C bit 0 of n clear, no 1-limb tail
	mov	i, nents
	movi	v5.16b, #0			C entry index starts at 0
	movi	v2.16b, #0
	ALIGN(16)
L(tp1):	cmeq	maskq.2d, v5.2d, v7.2d
	ld1	{v0.1d}, [tp]			C 1 limb of this entry
	add	v5.2d, v5.2d, v6.2d		C FIXME only the low lane is needed
	bit	v2.8b, v0.8b, maskq.8b
	add	tp, tp, n, lsl #3
	sub	i, i, #1
	cbnz	i, L(tp1)
	st1	{v2.1d}, [rp]			C last store, no pointer update needed

L(b00):	ret
EPILOGUE()