dnl  Source: luatex.git (beta-0.89.2),
dnl  source/libs/gmp/gmp-src/mpn/arm/v7a/cora15/aors_n.asm,
dnl  blob dc3f83992e45231aef0e8a7e0a7cfc1fa3cb6523.

dnl  ARM mpn_add_n/mpn_sub_n optimised for A15.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

dnl  Shared m4 definitions.  The macros used further down that this file does
dnl  not define itself are presumably provided from here -- confirm in config.m4.
include(`../config.m4')

C              cycles/limb     best
C StrongARM:       -
C XScale           ?
C Cortex-A7        ?
C Cortex-A8        ?
C Cortex-A9       3.55         2.5
C Cortex-A15      1.27         this

C This was a major improvement compared to the code we had before, but it might
C not be the best 8-way code possible.  We've tried some permutations of auto-
C increments and separate pointer updates, but they all ran at the same speed
C on A15.

C Architecture requirements:
C v5     -
C v5t    -
C v5te   ldrd strd
C v6     -
C v6t2   -
C v7a    -

C Register roles used by the code below.
C   rp  base address for result stores
C   up  base address for first-operand loads
C   vp  base address for second-operand loads
C   n   count of 32-bit limbs on entry; the function body shifts it right by
C       two, after which it counts groups of four limbs
define(`rp', `r0')
define(`up', `r1')
define(`vp', `r2')
define(`n',  `r3')

C Addition variant, active when OPERATION_add_n is defined.
C   ADDSUBC  adcs: add with carry-in from C, writing the carry-out back to C.
C   IFADD    expands to its argument; not referenced by the code in this file.
C   SETCY    cmp x, #1 sets C exactly when x is nonzero, turning a carry-in
C            word into the C flag without modifying x.
C   RETVAL   adc r0, n, #0 yields C as 0 or 1; used on the exits where n is 0.
C   RETVAL2  adc r0, n, #1 yields C as 0 or 1; used on the exit where n is -1
C            (-1 + 1 + C).
C   func, func_nc  names of the two entry points emitted below.
ifdef(`OPERATION_add_n', `
  define(`ADDSUBC',     adcs)
  define(`IFADD',       `$1')
  define(`SETCY',       `cmp    $1, #1')
  define(`RETVAL',      `adc    r0, n, #0')
  define(`RETVAL2',     `adc    r0, n, #1')
  define(`func',        mpn_add_n)
  define(`func_nc',     mpn_add_nc)')
C Subtraction variant, active when OPERATION_sub_n is defined.
C   ADDSUBC  sbcs: subtract with borrow, where C clear means a borrow is
C            pending and C set means none.
C   IFADD    expands to nothing; not referenced by the code in this file.
C   SETCY    rsbs x, x, #0 computes 0 - x, which leaves C set only when x is
C            zero, so a nonzero borrow-in word becomes a pending borrow.  Note
C            that x is overwritten.
C   RETVAL   sbc r0, r0, r0 gives 0 when C is set and -1 when it is clear; the
C            and with 1 turns that into a 0 or 1 borrow-out.
C   RETVAL2  same sequence as RETVAL, since it does not read n.
C   func, func_nc  names of the two entry points emitted below.
ifdef(`OPERATION_sub_n', `
  define(`ADDSUBC',     sbcs)
  define(`IFADD',       `')
  define(`SETCY',       `rsbs   $1, $1, #0')
  define(`RETVAL',      `sbc    r0, r0, r0
        and     r0, r0, #1')
  define(`RETVAL2',     `RETVAL')
  define(`func',        mpn_sub_n)
  define(`func_nc',     mpn_sub_nc)')

C Lists the four entry points this one source can be built into.  Both macros
C are defined outside this file; presumably the first is consumed by the build
C machinery and the second emits any assembler preamble -- confirm there.
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)

ASM_START()
C func_nc: entry point taking an explicit carry-in (or borrow-in) word.
C
C The word is read from the first stack slot as it stands on entry, before
C anything is pushed -- presumably the fifth argument under the ARM procedure
C call standard; confirm against the C prototype.  It is handed over in r12
C and control joins the common body at the ent label inside func, which
C treats any nonzero value as a carry or borrow of one.
PROLOGUE(func_nc)
        ldr     r12, [sp]
        b       L(ent)
EPILOGUE()
C func: same operation as func_nc with a zero carry-in.
C
C In:   rp (r0)   where result limbs are stored
C       up (r1)   first source operand
C       vp (r2)   second source operand
C       n  (r3)   number of 32-bit limbs.  The code assumes n >= 1; n = 0 is
C                 not special-cased and would take the b00 path, which loads
C                 and stores limbs unconditionally.
C       r12       carry-in word (zeroed here; supplied by func_nc otherwise)
C Out:  r0        carry-out (add) or borrow-out (sub) as 0 or 1
C Uses: r4-r9 as limb scratch (pushed on entry, popped before each return),
C       r12 and the condition flags.
C
C Invariant: from SETCY until RETVAL/RETVAL2 the C flag holds the running
C carry or borrow.  Loop control between those points therefore uses only
C sub without the S suffix and tst with a plain register operand, neither of
C which writes C.
C
C Each of the four entry paths below biases up, vp and rp so that the fixed
C offsets in the shared loop body address the right limbs.
PROLOGUE(func)
        mov     r12, #0                 C no carry-in
L(ent): push    { r4-r9 }               C six registers, 24 bytes

        ands    r6, n, #3               C r6 = n mod 4, Z set when it is zero
        mov     n, n, lsr #2            C n = number of 4-limb groups; flags kept
        beq     L(b00)                  C n mod 4 = 0
        cmp     r6, #2
        bcc     L(b01)                  C n mod 4 = 1
        beq     L(b10)                  C n mod 4 = 2

C n mod 4 = 3: handle limb 0 alone, preload limbs 1-2, join at the lo label.
L(b11): ldr     r5, [up], #4
        ldr     r7, [vp], #4
        SETCY(  r12)
        ADDSUBC r9, r5, r7
        ldrd    r4, r5, [up, #0]
        ldrd    r6, r7, [vp, #0]
        str     r9, [rp], #-4           C store limb 0, then bias rp by -4
        b       L(lo)

C n mod 4 = 0: preload limbs 0-1 and join at the mid label.  The -8 bias on
C up/vp and the -16 bias on rp line up with the #16 offsets used there.
L(b00): ldrd    r4, r5, [up], #-8
        ldrd    r6, r7, [vp], #-8
        SETCY(  r12)
        sub     rp, rp, #16
        b       L(mid)

C n mod 4 = 1: handle limb 0 alone.  If it was the only limb, return through
C the wd1 label; otherwise preload limbs 1-2 and join at the mid label.
L(b01): ldr     r5, [up], #-4
        ldr     r7, [vp], #-4
        SETCY(  r12)
        ADDSUBC r9, r5, r7
        str     r9, [rp], #-12          C store limb 0, then bias rp by -12
        tst     n, n                    C Z when no 4-limb group follows; C kept
        beq     L(wd1)
L(gt1): ldrd    r4, r5, [up, #8]
        ldrd    r6, r7, [vp, #8]
        b       L(mid)

C n mod 4 = 2: preload limbs 0-1 and join at the lo label.
L(b10): ldrd    r4, r5, [up]
        ldrd    r6, r7, [vp]
        SETCY(  r12)
        sub     rp, rp, #8
        b       L(lo)

C Main loop: eight limbs, i.e. two 4-limb groups, per pass.  Loads for the
C next limb pair are issued before the previous result pair is stored.  up,
C vp and rp each advance by 32 bytes once per pass through the writeback
C forms just before the lo label.
        ALIGN(16)
L(top): ldrd    r4, r5, [up, #8]
        ldrd    r6, r7, [vp, #8]
        strd    r8, r9, [rp, #8]
L(mid): ADDSUBC r8, r4, r6
        ADDSUBC r9, r5, r7
        ldrd    r4, r5, [up, #16]
        ldrd    r6, r7, [vp, #16]
        strd    r8, r9, [rp, #16]
        ADDSUBC r8, r4, r6
        ADDSUBC r9, r5, r7
        sub     n, n, #2                C a full pass consumes two groups; C kept
        tst     n, n                    C N set when n has dropped from 1 to -1
        bmi     L(dne)                  C odd group count: finish in mid-pass
        ldrd    r4, r5, [up, #24]
        ldrd    r6, r7, [vp, #24]
        strd    r8, r9, [rp, #24]
        ADDSUBC r8, r4, r6
        ADDSUBC r9, r5, r7
        ldrd    r4, r5, [up, #32]!
        ldrd    r6, r7, [vp, #32]!
        strd    r8, r9, [rp, #32]!
L(lo):  ADDSUBC r8, r4, r6
        ADDSUBC r9, r5, r7
        tst     n, n                    C C kept
        bne     L(top)                  C more groups remain

C Exit with n = 0: store the last pair, then return the carry via RETVAL.
L(end): strd    r8, r9, [rp, #8]
L(wd1): RETVAL
        pop     { r4-r9 }
        bx      r14
C Exit with n = -1: store the last pair, then return the carry via RETVAL2.
L(dne): strd    r8, r9, [rp, #24]
        RETVAL2
        pop     { r4-r9 }
        bx      r14
EPILOGUE()