beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / arm / v7a / cora15 / logops_n.asm
blob06026143e146df3d26a4a85080cfebdd551fc518
1 dnl ARM mpn_and_n, mpn_andn_n, mpn_nand_n, etc, optimised for A15.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C                 cycles/limb            cycles/limb
36 C               and andn ior xor      nand iorn nior xnor
37 C StrongARM            ?                      ?
38 C XScale               ?                      ?
39 C Cortex-A7            ?                      ?
40 C Cortex-A8            ?                      ?
41 C Cortex-A9           3.5                    3.56
42 C Cortex-A15          1.27                   1.64
44 C This is great A15 core register code, but it is a bit large.
45 C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling.
47 C Architecture requirements:
48 C v5 -
49 C v5t -
50 C v5te ldrd strd
51 C v6 -
52 C v6t2 -
53 C v7a -
C Build-time selectors.  FEEDIN_VARIANT picks which of the three
C ifelse(FEEDIN_VARIANT,...) entry sequences further down is used, and
C UNROLL picks which ifelse(UNROLL,...) loop body is used.
55 define(`FEEDIN_VARIANT', 1) C alternatives: 0 1 2
56 define(`UNROLL', 4x2) C alternatives: 4 4x2
C Symbolic names for the four registers the function reads without first
C writing them.  rp is the base for stores, up and vp are the bases for
C loads, and n is a count.
C NOTE(review): presumably these are the four arguments in procedure call
C standard order -- confirm against the C prototype.
58 define(`rp', `r0')
59 define(`up', `r1')
60 define(`vp', `r2')
61 define(`n', `r3')
C Default POSTOP has an empty body, so POSTOP(x) expands to nothing unless
C one of the operation blocks that follow redefines it.
63 define(`POSTOP')
C Operation selection.  Each ifdef tests one OPERATION_* symbol; none of
C them is defined in this file.  A matching block sets
C   func    the name handed to PROLOGUE
C   LOGOP   the instruction forming dst from two sources: LOGOP(dst, a, b)
C   POSTOP  an optional in-place mvn (bitwise NOT) applied before the store
C Where these are used, a is a limb loaded through up and b one loaded
C through vp.
C
C and_n: dst = a AND b
65 ifdef(`OPERATION_and_n',`
66 define(`func', `mpn_and_n')
67 define(`LOGOP', `and $1, $2, $3')')
C andn_n: dst = a AND NOT b (bic clears the bits of a that are set in b)
68 ifdef(`OPERATION_andn_n',`
69 define(`func', `mpn_andn_n')
70 define(`LOGOP', `bic $1, $2, $3')')
C nand_n: dst = NOT (a AND b)
71 ifdef(`OPERATION_nand_n',`
72 define(`func', `mpn_nand_n')
73 define(`POSTOP', `mvn $1, $1')
74 define(`LOGOP', `and $1, $2, $3')')
C ior_n: dst = a OR b
75 ifdef(`OPERATION_ior_n',`
76 define(`func', `mpn_ior_n')
77 define(`LOGOP', `orr $1, $2, $3')')
C iorn_n: dst = a OR NOT b, formed as NOT (b AND NOT a).  Note that the bic
C operands are deliberately swapped relative to the other blocks.
78 ifdef(`OPERATION_iorn_n',`
79 define(`func', `mpn_iorn_n')
80 define(`POSTOP', `mvn $1, $1')
81 define(`LOGOP', `bic $1, $3, $2')')
C nior_n: dst = NOT (a OR b)
82 ifdef(`OPERATION_nior_n',`
83 define(`func', `mpn_nior_n')
84 define(`POSTOP', `mvn $1, $1')
85 define(`LOGOP', `orr $1, $2, $3')')
C xor_n: dst = a XOR b
86 ifdef(`OPERATION_xor_n',`
87 define(`func', `mpn_xor_n')
88 define(`LOGOP', `eor $1, $2, $3')')
C xnor_n: dst = NOT (a XOR b)
89 ifdef(`OPERATION_xnor_n',`
90 define(`func', `mpn_xnor_n')
91 define(`POSTOP', `mvn $1, $1')
92 define(`LOGOP', `eor $1, $2, $3')')
C Names the eight entry points this one source can be built as.
C NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably
C the build scans this line to learn which functions are provided -- confirm.
94 MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
C Function body shared by all eight logical operations; the name comes from
C func and the per-limb work from LOGOP and POSTOP.
C
C Register use as it appears below:
C   rp     destination pointer, written with str/strd
C   up     first source pointer, loaded into r5 (single limb) or r4,r5
C   vp     second source pointer, loaded into r7 (single limb) or r6,r7
C   n      limb count on entry, then replaced by n / 4; r6 briefly holds
C          n mod 4 during feed-in
C   r8,r9  result pair computed by LOGOP and not yet stored
C A limb here is one 32-bit word: single limbs move with ldr/str and a step
C of 4, pairs with ldrd/strd and offsets that are multiples of 8.
C
C The loop stores each result pair one step after computing it, with the
C loads for the following pair issued in between.  The feed-in code
C therefore biases rp (and in some paths up and vp) downwards so that the
C fixed offsets inside the loop land on the right addresses when the loop
C is entered at L(mid) or L(lo).
C
C Only one ifelse(FEEDIN_VARIANT,...) body and one ifelse(UNROLL,...) body
C are selected; the defines earlier in this file choose variant 1 and 4x2.
C
C NOTE(review): no line in this listing closes the quoted ifelse bodies.
C The embedded line numbers skip at the points where a terminator would be
C expected, so it may simply have been lost.  Confirm against upstream.
96 ASM_START()
97 PROLOGUE(func)
C r4-r9 are used as scratch below; both return paths pop them again.
98 push { r4-r9 }
C ---- Feed-in variant 0 ----
100 ifelse(FEEDIN_VARIANT,0,`
C r6 = n mod 4 and n = n / 4.  The mov leaves the flags alone, so the beq
C that follows still acts on the result of the ands.
101 ands r6, n, #3
102 mov n, n, lsr #2
C n mod 4 == 0: go straight to the pair load.  NOTE(review): this bypasses
C the n == 0 test at L(b00), so this variant appears to rely on a nonzero
C limb count -- confirm.
103 beq L(b00a)
104 tst r6, #1
105 beq L(bx0)
C Odd count: process one limb and step all three pointers by 4.
106 ldr r5, [up], #4
107 ldr r7, [vp], #4
108 LOGOP( r9, r5, r7)
109 POSTOP( r9)
110 str r9, [rp], #4
C Bit 1 of n mod 4 clear means there is no leftover pair.
111 tst r6, #2
112 beq L(b00)
C Leftover pair: load it without moving up and vp, and bias rp by -8 because
C the pair is stored at offset 8 from rp after entering at L(lo).
113 L(bx0): ldrd r4, r5, [up, #0]
114 ldrd r6, r7, [vp, #0]
115 sub rp, rp, #8
116 b L(lo)
C No leftover pair: return if there are no groups of four either.
117 L(b00): tst n, n
118 beq L(wd1)
C Load the first pair, then step up and vp back by 8 and rp back by 16 to
C match the offset-16 accesses that follow L(mid).
119 L(b00a):ldrd r4, r5, [up], #-8
120 ldrd r6, r7, [vp], #-8
121 sub rp, rp, #16
122 b L(mid)
C ---- Feed-in variant 1 (the one selected by the define in this file) ----
124 ifelse(FEEDIN_VARIANT,1,`
C r6 = n mod 4 and n = n / 4.  Neither instruction sets flags; the tests
C below examine the two bits of r6 one at a time.
125 and r6, n, #3
126 mov n, n, lsr #2
127 tst r6, #1
128 beq L(bx0)
C Odd count: process one limb and step all three pointers by 4.
129 ldr r5, [up], #4
130 ldr r7, [vp], #4
131 LOGOP( r9, r5, r7)
132 POSTOP( r9)
133 str r9, [rp], #4
C Bit 1 of n mod 4 clear means there is no leftover pair.
134 L(bx0): tst r6, #2
135 beq L(b00)
C Leftover pair: load it without moving up and vp, and bias rp by -8 because
C the pair is stored at offset 8 from rp after entering at L(lo).
136 ldrd r4, r5, [up, #0]
137 ldrd r6, r7, [vp, #0]
138 sub rp, rp, #8
139 b L(lo)
C No leftover pair.  Every such path passes this test, so the function
C returns here when no groups of four remain, including for a zero count.
140 L(b00): tst n, n
141 beq L(wd1)
C Load the first pair, then step up and vp back by 8 and rp back by 16 to
C match the offset-16 accesses that follow L(mid).
142 ldrd r4, r5, [up], #-8
143 ldrd r6, r7, [vp], #-8
144 sub rp, rp, #16
145 b L(mid)
C ---- Feed-in variant 2: four-way dispatch on n mod 4 ----
147 ifelse(FEEDIN_VARIANT,2,`
C r6 = n mod 4 and n = n / 4; the mov leaves the flags from the ands intact.
148 ands r6, n, #3
149 mov n, n, lsr #2
C Remainder 0.  NOTE(review): L(b00) in this variant has no n == 0 test, so
C this variant appears to rely on a nonzero limb count -- confirm.
150 beq L(b00)
C Remainder below 2 goes to L(b01), equal to 2 goes to L(b10), and 3 falls
C through to L(b11).
151 cmp r6, #2
152 bcc L(b01)
153 beq L(b10)
C Remainder 3: one single limb, then a leftover pair.  The single result is
C computed before the ldrd overwrites r5.  The str steps rp back by 4 after
C storing, so offset 8 from rp is the slot right after the single limb.
155 L(b11): ldr r5, [up], #4
156 ldr r7, [vp], #4
157 LOGOP( r9, r5, r7)
158 ldrd r4, r5, [up, #0]
159 ldrd r6, r7, [vp, #0]
160 POSTOP( r9)
161 str r9, [rp], #-4
162 b L(lo)
C Remainder 0: load the first pair, step up and vp back by 8 and rp back by
C 16, and enter at L(mid).
164 L(b00): ldrd r4, r5, [up], #-8
165 ldrd r6, r7, [vp], #-8
166 sub rp, rp, #16
167 b L(mid)
C Remainder 1: one single limb.  up and vp step back by 4 and rp by 12 after
C the accesses, so offset 8 from up and offset 16 from rp both address the
C limb that follows the single one.
169 L(b01): ldr r5, [up], #-4
170 ldr r7, [vp], #-4
171 LOGOP( r9, r5, r7)
172 POSTOP( r9)
173 str r9, [rp], #-12
C Return if that single limb was all there was.
174 tst n, n
175 beq L(wd1)
C L(gt1) is not the target of any branch in this file.
176 L(gt1): ldrd r4, r5, [up, #8]
177 ldrd r6, r7, [vp, #8]
178 b L(mid)
C Remainder 2: a leftover pair only; bias rp by -8 and enter at L(lo).
180 L(b10): ldrd r4, r5, [up]
181 ldrd r6, r7, [vp]
182 sub rp, rp, #8
183 b L(lo)
C NOTE(review): ALIGN is defined outside this file; presumably it aligns the
C loop that follows -- confirm.
185 ALIGN(16)
C ---- Main loop, UNROLL 4: four limbs per iteration ----
186 ifelse(UNROLL,4,`
C Load the next pair, then finish and store the pair computed at L(lo).
187 L(top): ldrd r4, r5, [up, #8]
188 ldrd r6, r7, [vp, #8]
189 POSTOP( r8)
190 POSTOP( r9)
191 strd r8, r9, [rp, #8]
C Second half: same pattern at offset 16.  The forms ending in ! also write
C the new address back, advancing up, vp and rp by 16.
192 L(mid): LOGOP( r8, r4, r6)
193 LOGOP( r9, r5, r7)
194 ldrd r4, r5, [up, #16]!
195 ldrd r6, r7, [vp, #16]!
196 POSTOP( r8)
197 POSTOP( r9)
198 strd r8, r9, [rp, #16]!
C One group of four limbs accounted for.
199 sub n, n, #1
C Compute the pair just loaded; it is stored at L(top) or at L(end).
200 L(lo): LOGOP( r8, r4, r6)
201 LOGOP( r9, r5, r7)
202 tst n, n
203 bne L(top)
C ---- Main loop, UNROLL 4x2: eight limbs per full iteration ----
205 ifelse(UNROLL,4x2,`
C Step 1: load the pair at offset 8, then finish and store the pair computed
C at L(lo) into offset 8.
206 L(top): ldrd r4, r5, [up, #8]
207 ldrd r6, r7, [vp, #8]
208 POSTOP( r8)
209 POSTOP( r9)
210 strd r8, r9, [rp, #8]
C Step 2: compute, load the pair at offset 16, store at offset 16.  No
C writeback here, so the pointers stay put for the rest of this half.
211 L(mid): LOGOP( r8, r4, r6)
212 LOGOP( r9, r5, r7)
213 ldrd r4, r5, [up, #16]
214 ldrd r6, r7, [vp, #16]
215 POSTOP( r8)
216 POSTOP( r9)
217 strd r8, r9, [rp, #16]
218 LOGOP( r8, r4, r6)
219 LOGOP( r9, r5, r7)
C Two groups of four are accounted for per full iteration.  A negative
C result means only one group remained, so leave through L(dne), which
C stores the pair just computed.
220 sub n, n, #2
221 tst n, n
222 bmi L(dne)
C Step 3: load the pair at offset 24, store the pending pair at offset 24.
223 ldrd r4, r5, [up, #24]
224 ldrd r6, r7, [vp, #24]
225 POSTOP( r8)
226 POSTOP( r9)
227 strd r8, r9, [rp, #24]
228 LOGOP( r8, r4, r6)
229 LOGOP( r9, r5, r7)
C Step 4: offset 32 with writeback, advancing up, vp and rp by 32.
230 ldrd r4, r5, [up, #32]!
231 ldrd r6, r7, [vp, #32]!
232 POSTOP( r8)
233 POSTOP( r9)
234 strd r8, r9, [rp, #32]!
C Compute the pair just loaded; loop while groups remain.
235 L(lo): LOGOP( r8, r4, r6)
236 LOGOP( r9, r5, r7)
237 tst n, n
238 bne L(top)
C Loop exit: finish and store the pair computed at L(lo), at offset 8.
241 L(end): POSTOP( r8)
242 POSTOP( r9)
243 strd r8, r9, [rp, #8]
C Restore r4-r9 and return through r14 (the link register).
244 L(wd1): pop { r4-r9 }
245 bx r14
C Early exit from the middle of the 4x2 loop.  up, vp and rp have not been
C advanced in that half of the iteration, so the pending pair is stored at
C offset 24.  This path carries its own copy of the restore and return.
246 ifelse(UNROLL,4x2,`
247 L(dne): POSTOP( r8)
248 POSTOP( r9)
249 strd r8, r9, [rp, #24]
250 pop { r4-r9 }
251 bx r14
253 EPILOGUE()