beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / arm / v7a / cora15 / neon / rsh1aors_n.asm
blob2c11d6debd4b3cf4273ad50eea82ae9e5afc0f96
1 dnl ARM Neon mpn_rsh1add_n, mpn_rsh1sub_n.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C StrongARM -
37 C XScale -
38 C Cortex-A7 ?
39 C Cortex-A8 ?
40 C Cortex-A9 4-5
41 C Cortex-A15 2.5
43 C TODO
44 C * Try to make this smaller; its size (384 bytes) is excessive.
45 C * Try to reach 2.25 c/l on A15, to match the addlsh_1 family.
46 C * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps
47 C insufficiently for A7 and A8.
C Symbolic names for the four registers the routine reads on entry:
C   rp  (r0)  destination pointer, all stores go through it
C   up  (r1)  first source pointer, advanced by the ldr/ldmia loads
C   vp  (r2)  second source pointer, advanced in step with up
C   n   (r3)  limb count, counted down to zero by the code below
49 define(`rp', `r0')
50 define(`up', `r1')
51 define(`vp', `r2')
52 define(`n', `r3')
C Select the add or the subtract flavour of the routine, depending on which
C OPERATION_ symbol is defined when this file is processed:
C   ADDSUBS  flag-setting add/subtract used for the lowest limb
C   ADCSBCS  flag-setting add/subtract with carry used for all later limbs
C   IFADD    expands to its argument only in the add flavour
C   IFSUB    expands to its argument only in the subtract flavour
C   func     name of the entry point being generated
C Add flavour (OPERATION_rsh1add_n):
54 ifdef(`OPERATION_rsh1add_n', `
55 define(`ADDSUBS', `adds $1, $2, $3')
56 define(`ADCSBCS', `adcs $1, $2, $3')
57 define(`IFADD', `$1')
58 define(`IFSUB', `')
59 define(`func', mpn_rsh1add_n)')
C Subtract flavour (OPERATION_rsh1sub_n):
60 ifdef(`OPERATION_rsh1sub_n', `
61 define(`ADDSUBS', `subs $1, $2, $3')
62 define(`ADCSBCS', `sbcs $1, $2, $3')
63 define(`IFADD', `')
64 define(`IFSUB', `$1')
65 define(`func', mpn_rsh1sub_n)')
C MULFUNC_PROLOGUE names both entry points this source can produce.
C NOTE(review): MULFUNC_PROLOGUE and ASM_START are defined outside this file
C (presumably via config.m4); their exact expansion is not visible here.
67 MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
69 ASM_START()
C ---------------------------------------------------------------------------
C func (rp, up, vp, n) -- func expands to mpn_rsh1add_n or mpn_rsh1sub_n.
C
C Runs the ADDSUBS/ADCSBCS chain over n pairs of 32-bit limbs loaded from up
C and vp (lowest limb first), shifts the resulting value right by one bit and
C stores n limbs through rp.  The bit that enters at the top of the highest
C limb is the carry-out in the add flavour and the borrow-out in the subtract
C flavour.  Returns in r0 bit 0 of the lowest sum limb, i.e. the bit that
C was shifted out.
C
C Register use:
C   r0-r3          rp, up, vp, n (see the defines earlier in this file);
C                  r1 is reused as scratch for the final carry once all
C                  loads through up are done
C   r8,r9,r10,r12  limbs loaded from up, overwritten with the sum limbs
C   r4,r5,r6,r7    limbs loaded from vp
C   d4             d4[0] keeps the lowest sum limb for the return value
C   d0,d1          raw sum limb pairs (low pair / high pair of a group of 4)
C   d2,d3          pairs already shifted right by one, still waiting for the
C                  bit that belongs in their top position
C   r4-r10 are pushed on entry and popped before each return; r12 and d0-d4
C   are used without being saved.
C
C The carry flag is live from the first ADDSUBS until the final adc.  The
C instructions in between (NEON moves, shifts and stores, ldmia, bics with a
C small immediate, sub followed by tst) are all forms that leave C as it was.
C Check flag effects before reordering or replacing any of them.
C
C NOTE(review): n = 0 is not special-cased; it would take the L(b00) path
C and load four limbs.  Presumably callers guarantee n >= 1 -- confirm.
C ---------------------------------------------------------------------------
70 PROLOGUE(func)
71 push {r4-r10}
C Dispatch on n mod 4.  Each entry sequence below handles the first
C (n mod 4) limbs, or the first four when n mod 4 is zero, and then joins
C the main loop, which consumes four limbs per iteration.
73 ands r4, n, #3
74 beq L(b00)
75 cmp r4, #2
76 blo L(b01)
77 beq L(b10)
C n mod 4 = 3: sum three limbs into r9 (lowest), r10, r12.
79 L(b11): ldmia up!, {r9,r10,r12}
80 ldmia vp!, {r5,r6,r7}
81 ADDSUBS( r9, r9, r5)
C d4 = r9:r9; its low word is kept for the return value at L(rtn).
82 vmov d4, r9, r9
83 ADCSBCS( r10, r10, r6)
84 ADCSBCS( r12, r12, r7)
85 vshr.u64 d3, d4, #1
86 vmov d1, r10, r12
C Insert d1 shifted left by 31, so bit 31 of d3 becomes bit 0 of r10.  The
C low word of d3 is then the finished first result limb.
87 vsli.u64 d3, d1, #31
C d2 = limbs 2 and 3 shifted right by one, top bit still to be filled in.
88 vshr.u64 d2, d1, #1
C Store just that one finished word; rp advances by 4.
89 vst1.32 d3[0], [rp]!
C Round n down to a multiple of 4.  If nothing is left, L(wd2) finishes d2.
90 bics n, n, #3
91 beq L(wd2)
C More limbs follow: load the next four and enter the loop with d2 pending.
92 L(gt3): ldmia up!, {r8,r9,r10,r12}
93 ldmia vp!, {r4,r5,r6,r7}
94 b L(mi0)
C n mod 4 = 2: sum two limbs into r10 (lowest) and r12.
96 L(b10): ldmia up!, {r10,r12}
97 ldmia vp!, {r6,r7}
98 ADDSUBS( r10, r10, r6)
99 ADCSBCS( r12, r12, r7)
C d4 = r12:r10, so d4[0] is the lowest sum limb.
100 vmov d4, r10, r12
101 bics n, n, #2
C d2 = the pair shifted right by one, top bit still to be filled in.
102 vshr.u64 d2, d4, #1
C The beq still tests the Z flag from the bics above (vshr leaves flags alone).
103 beq L(wd2)
104 L(gt2): ldmia up!, {r8,r9,r10,r12}
105 ldmia vp!, {r4,r5,r6,r7}
106 b L(mi0)
C n mod 4 = 1: sum a single limb into r12.
108 L(b01): ldr r12, [up], #4
109 ldr r7, [vp], #4
110 ADDSUBS( r12, r12, r7)
111 vmov d4, r12, r12
112 bics n, n, #1
113 bne L(gt1)
C n was exactly 1: finish entirely in core registers.
114 mov r5, r12, lsr #1
C n is zero here, so r1 = C (add flavour) or 1 + C (subtract flavour).  In
C the subtract flavour bit 0 of 1 + C is the inverse of C, i.e. the borrow.
115 IFADD(` adc r1, n, #0')
116 IFSUB(` adc r1, n, #1')
C Copy bit 0 of r1 into bit 31 of the shifted limb.
117 bfi r5, r1, #31, #1
118 str r5, [rp]
C Return the bit shifted out of the lowest (and only) sum limb.
119 and r0, r12, #1
120 pop {r4-r10}
121 bx r14
C n > 1 with one leading limb: load four more limbs and continue the chain.
122 L(gt1): ldmia up!, {r8,r9,r10,r12}
123 ldmia vp!, {r4,r5,r6,r7}
C d2 low word = first sum limb shifted right by one (bit 31 replaced below).
124 vshr.u64 d2, d4, #1
125 ADCSBCS( r8, r8, r4)
126 ADCSBCS( r9, r9, r5)
127 vmov d0, r8, r9
128 ADCSBCS( r10, r10, r6)
129 ADCSBCS( r12, r12, r7)
C Bit 31 of d2 becomes bit 0 of r8, completing the first result limb.
130 vsli.u64 d2, d0, #31
C d3 = the r9:r8 pair shifted right by one, top bit pending.
131 vshr.u64 d3, d0, #1
C Store the single finished word; rp advances by 4.
132 vst1.32 d2[0], [rp]!
133 b L(mi1)
C n mod 4 = 0: sum the first four limbs into r8 (lowest), r9, r10, r12.
135 L(b00): ldmia up!, {r8,r9,r10,r12}
136 ldmia vp!, {r4,r5,r6,r7}
137 ADDSUBS( r8, r8, r4)
138 ADCSBCS( r9, r9, r5)
C d4 = r9:r8, so d4[0] is the lowest sum limb.
139 vmov d4, r8, r9
140 ADCSBCS( r10, r10, r6)
141 ADCSBCS( r12, r12, r7)
C d3 = low pair shifted right by one, top bit pending.
142 vshr.u64 d3, d4, #1
143 b L(mi1)
C Main loop, four limbs per iteration.  On arrival at L(top), d3 holds a
C shifted pair whose top bit is missing and d1 holds the following raw pair.
145 ALIGN(16)
146 L(top): ldmia up!, {r8,r9,r10,r12}
147 ldmia vp!, {r4,r5,r6,r7}
C Bit 0 of d1 goes into bit 63 of d3, which completes d3.
148 vsli.u64 d3, d1, #63
C d2 = d1 shifted right by one, top bit pending.
149 vshr.u64 d2, d1, #1
C Store the finished pair (two words); rp advances by 8.
150 vst1.32 d3, [rp]!
C L(mi0): entry point when d2 is the pending pair and four new limbs are
C already loaded in r8,r9,r10,r12 and r4,r5,r6,r7.
151 L(mi0): ADCSBCS( r8, r8, r4)
152 ADCSBCS( r9, r9, r5)
153 vmov d0, r8, r9
154 ADCSBCS( r10, r10, r6)
155 ADCSBCS( r12, r12, r7)
C Bit 0 of d0 completes d2; d3 becomes the next pending pair.
156 vsli.u64 d2, d0, #63
157 vshr.u64 d3, d0, #1
158 vst1.32 d2, [rp]!
C L(mi1): entry point when d3 is the pending pair and r10,r12 hold the
C high two sum limbs of the current group.
159 L(mi1): vmov d1, r10, r12
C Count down with sub + tst rather than a flag-setting subtract, so that the
C carry flag from the ADCSBCS chain survives into the next iteration.
160 sub n, n, #4
161 tst n, n
162 bne L(top)
C Loop done: complete and store d3; the final pair is left pending in d2.
164 L(end): vsli.u64 d3, d1, #63
165 vshr.u64 d2, d1, #1
166 vst1.32 d3, [rp]!
C L(wd2): d2 holds the last two result limbs with the top bit still missing.
167 L(wd2): vmov r4, r5, d2
C n is zero here, so r1 = C (add flavour) or 1 + C (subtract flavour).  In
C the subtract flavour bit 0 of 1 + C is the inverse of C, i.e. the borrow.
168 IFADD(` adc r1, n, #0')
169 IFSUB(` adc r1, n, #1')
C Copy bit 0 of r1 into bit 31 of the highest result limb and store the pair.
170 bfi r5, r1, #31, #1
171 stm rp, {r4,r5}
C Return bit 0 of the lowest sum limb, which every entry path saved in d4[0].
173 L(rtn): vmov.32 r0, d4[0]
174 and r0, r0, #1
175 pop {r4-r10}
176 bx r14
177 EPILOGUE()