beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / arm / v7a / cora15 / submul_1.asm
blobed7bfe820b68e2af3eb3bcdc97092aad728e295e
1 dnl ARM mpn_submul_1 optimised for A15.
3 dnl Copyright 2012, 2013 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb best
34 C StrongARM: -
35 C XScale ?
36 C Cortex-A7 ?
37 C Cortex-A8 ?
38 C Cortex-A9 5.75 3.75
39 C Cortex-A15 2.32 this
41 C This code uses umlal and umaal for adding in the rp[] data, keeping the
42 C recurrence path separate from any multiply instructions. It performs well on
43 C A15, but does not quite reach the multiply bandwidth the way the
44 C corresponding addmul_1 code does.
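46 C We don't use r12 due to ldrd and strd limitations: in ARM state they operate
C on an even/odd register pair, so r12 could only be paired with r13 (sp).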
48 C This loop complements U on the fly,
49 C U' = B^n - 1 - U
50 C and then uses that
51 C R - U*v = R + U'*v + v - B^n v
53 C Architecture requirements:
54 C v5 -
55 C v5t -
56 C v5te ldrd strd
57 C v6 umaal
58 C v6t2 -
59 C v7a -
C Register aliases used throughout the routine below.
61 define(`rp', `r0') C pointer whose words are both loaded and stored back
62 define(`up', `r1') C pointer whose words are only loaded
63 define(`n', `r2') C count: tested mod 4 on entry, reduced by 4 per loop pass
64 define(`v0', `r3') C multiplier operand of every umlal/umaal
66 define(`w0', `r10') define(`w1', `r11') C words loaded from rp[]
67 define(`u0', `r8') define(`u1', `r9') C words loaded from up[], complemented with mvn
69 ASM_START()
C mpn_submul_1 (rp = r0, up = r1, n = r2, v0 = r3); result returned in r0.
C
C Each word loaded from up[] is complemented (mvn) and multiplied by v0 with
C umlal/umaal, which also add in the word loaded from rp[]. The high word of
C one product and the low word of the next are summed with adcs, and the sums
C are stored back through rp. One extra v0 is injected on every entry path as
C the initial high-word accumulator of the first umaal (the +v term of the
C identity given in the comments near the top of the file).
C The value returned is v0 - (r5 + carry), r5 being the high product word of
C the last limb.
C
C Register use: r4-r7 receive the high product words and the adcs sums;
C r8-r11 (u0, u1, w0, w1) hold the loaded operands. r4-r11 are saved on
C entry and restored on exit.
C The loop stores 4 words per pass and has two entry points: top (reached from
C b10 and b01) and mid (reached from b11 and b00). Each entry path pre-biases
C up and rp to suit the fixed offsets used at its entry point.
C NOTE(review): no test for n == 0 is visible in this routine; presumably
C callers guarantee n >= 1 -- confirm.
70 PROLOGUE(mpn_submul_1)
71 sub sp, sp, #32 C reserve 8 words of stack
72 strd r10, r11, [sp, #24] C save r4-r11 pairwise, lowest register at
73 strd r8, r9, [sp, #16] C the lowest address, which is the layout the
74 strd r6, r7, [sp, #8] C pop at the end of the routine expects
75 strd r4, r5, [sp, #0]
76 C push { r4-r11 }
78 ands r6, n, #3 C r6 = n mod 4; Z set when it is 0
79 sub n, n, #3 C bias n by -3; flags untouched (no s suffix)
80 beq L(b00) C n mod 4 == 0
81 cmp r6, #2
82 bcc L(b01) C n mod 4 == 1
83 beq L(b10) C n mod 4 == 2
85 L(b11): mov r6, #0 C n mod 4 == 3 (fall through); joins the loop at mid
86 ldr u1, [up], #-4 C u1 = up[0]; up -= 4 to suit the offsets used at mid
87 ldr w1, [rp], #-16 C w1 = rp[0]; rp -= 16 to suit the offsets used at mid
88 mvn u1, u1 C complement the word from up[]
89 adds r7, v0, #0 C r7 = v0 (the +v term); adding 0 also clears carry
90 b L(mid)
92 L(b00): ldrd u0, u1, [up] C n mod 4 == 0: load up[0], up[1]; up not advanced
93 ldrd w0, w1, [rp], #-12 C load rp[0], rp[1]; rp -= 12
94 mvn u0, u0
95 mvn u1, u1
96 mov r6, v0 C seed the high word with v0 (the +v term)
97 umaal w0, r6, u0, v0 C r6:w0 = u0*v0 + w0 + r6
98 cmn r13, #0 C carry clear
99 mov r7, #0
100 str w0, [rp, #12] C store word 0 (rp was moved back by 12 above)
101 b L(mid)
103 L(b10): ldrd u0, u1, [up], #8 C n mod 4 == 2: load up[0], up[1]; up += 8
104 ldrd w0, w1, [rp] C load rp[0], rp[1]
105 mvn u0, u0
106 mvn u1, u1
107 mov r4, v0 C seed the high word with v0 (the +v term)
108 umaal w0, r4, u0, v0 C r4:w0 = u0*v0 + w0 + r4
109 mov r5, #0
110 str w0, [rp], #-4 C store word 0; rp -= 4
111 umlal w1, r5, u1, v0 C r5:w1 += u1*v0 (r5 was zeroed above)
112 adds n, n, #0 C set N from the biased n; adding 0 also clears carry
113 bmi L(end) C biased n negative: only the pending word remains
114 b L(top)
116 L(b01): ldr u1, [up], #4 C n mod 4 == 1: u1 = up[0]; up += 4
117 ldr w1, [rp], #-8 C w1 = rp[0]; rp -= 8
118 mvn u1, u1
119 mov r5, v0 C seed the high word with v0 (the +v term)
120 mov r4, #0
121 umaal w1, r5, u1, v0 C r5:w1 = u1*v0 + w1 + r5
122 tst n, n C set N from the biased n; carry is left as the cmp set it (clear, since bcc was taken)
123 bmi L(end) C biased n negative: only the pending word remains
125 C ALIGN(16)
126 L(top): ldrd u0, u1, [up, #0] C next two words from up[]
127 adcs r4, r4, w1 C complete the pending word: r4 + w1 + carry
128 mvn u0, u0
129 ldrd w0, w1, [rp, #12] C next two words from rp[]
130 mvn u1, u1
131 mov r6, #0
132 umlal w0, r6, u0, v0 C 1 2
133 adcs r5, r5, w0 C previous high word + this low word + carry
134 mov r7, #0
135 strd r4, r5, [rp, #8] C store two finished words
136 L(mid): umaal w1, r7, u1, v0 C 2 3
137 ldrd u0, u1, [up, #8] C two more words from up[]
138 add up, up, #16 C up advances 4 words per pass
139 adcs r6, r6, w1 C previous high word + this low word + carry
140 mvn u0, u0
141 ldrd w0, w1, [rp, #20] C two more words from rp[]
142 mvn u1, u1
143 mov r4, #0
144 umlal w0, r4, u0, v0 C 3 4
145 adcs r7, r7, w0 C previous high word + this low word + carry
146 mov r5, #0
147 strd r6, r7, [rp, #16]! C store two finished words; writeback makes rp += 16
148 sub n, n, #4 C 4 limbs per pass; flags untouched (no s suffix)
149 umlal w1, r5, u1, v0 C 0 1
150 tst n, n C set N for the branch; the carry from adcs is kept
151 bpl L(top) C loop while the biased n is non-negative
153 L(end): adcs r4, r4, w1 C complete the last pending word
154 str r4, [rp, #8]
155 adc r0, r5, #0 C r0 = last high word + carry
156 sub r0, v0, r0 C return value: v0 - (r5 + carry)
157 pop { r4-r11 } C restore the registers saved on entry
158 bx r14 C return to caller
159 EPILOGUE()