dnl  ARM Neon mpn_copyd optimised for A15.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C StrongARM	 -
C XScale	 -
C Cortex-A7	 ?
C Cortex-A8	 ?
C Cortex-A9	 1.75	slower than core register code
C Cortex-A15	 0.52

define(`rp', `r0')
define(`up', `r1')
define(`n',  `r2')
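
C For reference, a minimal C sketch of what this routine computes (an added
C reader's aid, not part of the original source; assumes GMP's usual types,
C with 32-bit limbs on this target).  The copy runs from the high end down,
C which is what makes copyd usable for overlapping operands with rp >= up:
C
C	void mpn_copyd (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C	{
C	  while (n-- > 0)
C	    rp[n] = up[n];
C	}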

ASM_START()
PROLOGUE(mpn_copyd)
	add	rp, rp, n, lsl #2
	add	up, up, n, lsl #2

	cmp	n, #7
	ble	L(bc)
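
C On entry both pointers were advanced by n limbs (n*4 bytes, the lsl #2)
C so they point just past the last limb; the copy then walks downwards.
C Sizes of at most 7 limbs skip the block loop and go straight to L(bc).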

C Copy until rp is 128-bit aligned
	tst	rp, #4
	beq	L(al1)
	sub	up, up, #4
	vld1.32	{d22[0]}, [up]
	sub	n, n, #1
	sub	rp, rp, #4
	vst1.32	{d22[0]}, [rp]
L(al1):	tst	rp, #8
	beq	L(al2)
	sub	up, up, #8
	vld1.32	{d22}, [up]
	sub	n, n, #2
	sub	rp, rp, #8
	vst1.32	{d22}, [rp:64]
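
C rp is now 16-byte aligned.  The next block primes the software pipeline
C by pre-loading one 4-limb chunk into d26-d27; n is reduced by 12 to
C account for those 4 limbs plus the 8 consumed by one loop round, so the
C blt catches sizes with no full round left.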
L(al2):	sub	up, up, #16
	vld1.32	{d26-d27}, [up]
	subs	n, n, #12
	sub	rp, rp, #16	C offset rp for loop
	blt	L(end)

	sub	up, up, #16	C offset up for loop
	mov	r12, #-16
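
C The loop moves 8 limbs per round and is software-pipelined: each vst
C flushes the chunk fetched by the vld of the previous round, so loads run
C one chunk ahead of stores.  r12 = -16 is the post-index step, walking
C both pointers down 16 bytes per access.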

	ALIGN(16)
L(top):	vld1.32	{d22-d23}, [up], r12
	vst1.32	{d26-d27}, [rp:128], r12
	vld1.32	{d26-d27}, [up], r12
	vst1.32	{d22-d23}, [rp:128], r12
	subs	n, n, #8
	bge	L(top)

	add	up, up, #16	C undo up offset
				C rp offset undoing folded
L(end):	vst1.32	{d26-d27}, [rp:128]

C Copy last 0-7 limbs.  Note that rp is aligned after loop, but not when we
C arrive here via L(bc)
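C The tail decomposes the remaining count by its low bits, copying a 4-limb,
C then a 2-limb, then a 1-limb chunk as bits 2..0 of n demand.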
L(bc):	tst	n, #4
	beq	L(tl1)
	sub	up, up, #16
	vld1.32	{d22-d23}, [up]
	sub	rp, rp, #16
	vst1.32	{d22-d23}, [rp]
L(tl1):	tst	n, #2
	beq	L(tl2)
	sub	up, up, #8
	vld1.32	{d22}, [up]
	sub	rp, rp, #8
	vst1.32	{d22}, [rp]
L(tl2):	tst	n, #1
	beq	L(tl3)
	sub	up, up, #4
	vld1.32	{d22[0]}, [up]
	sub	rp, rp, #4
	vst1.32	{d22[0]}, [rp]
L(tl3):	bx	lr
EPILOGUE()