beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / arm / v6 / addmul_3.asm
blobd3d183343d4042349767cc80ddf9b043cc90be9c
1 dnl ARM mpn_addmul_3.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C StrongARM: -
37 C XScale -
38 C ARM11 4.33
39 C Cortex-A7 3.23
40 C Cortex-A8 3.19
41 C Cortex-A9 2.125
42 C Cortex-A15 2
44 C TODO
45 C * Use a fast path for n <= KARATSUBA_MUL_THRESHOLD using a jump table,
46 C avoiding the current multiply.
47 C * Start the first multiply or multiplies early.
C Incoming arguments.
define(`rp', `r0')
define(`up', `r1')
define(`n',  `r2')
define(`vp', `r3')

C The three multiplier limbs, loaded from vp once in the prologue.
define(`v0', `r4')
define(`v1', `r5')
define(`v2', `r6')

C Multiplicand limbs in flight.  u0 shares r3 with vp; vp is only read by
C the ldm in the prologue, before the first u0 load.
define(`u0', `r3')
define(`u1', `r14')

C Rotating window of three result limbs.
define(`w0', `r7')
define(`w1', `r8')
define(`w2', `r9')

C One running carry limb per multiplier limb.
define(`cy0', `r10')
define(`cy1', `r11')
define(`cy2', `r12')
ASM_START()

C mpn_addmul_3 (rp, up, n, vp)
C
C Adds the product {up,n} * {vp,3} into {rp,n}.  The low n+2 limbs of the
C result are stored at rp[0] ... rp[n+1] and the most significant limb is
C returned in r0.
C
C In:    r0 = rp, r1 = up, r2 = n, r3 = vp
C Out:   r0 = most significant limb of the result
C Stack: r4-r11 and lr are pushed on entry and restored on exit.
C        r12 (cy2) is used without being saved.
C
C Every multiply step is a umaal, which computes
C        hi:lo = a * b + lo + hi
C so "umaal w, cy, u, v" adds u*v and the incoming carry cy to the result
C limb w and leaves the new carry in cy.
C
C The main loop is unrolled six ways and rotates the roles of w0,w1,w2 and
C of u0,u1.  It is entered at one of six points chosen from n mod 6, and
C every path leaves through the two-limb tail at L(end).  As a consequence
C n = 1 and n = 2 are dispatched like 7 and 8 and would read and write past
C the operands, so the code relies on n >= 3.
C NOTE(review): confirm n >= 3 against the contract the callers assume.

PROLOGUE(mpn_addmul_3)
	push	{ r4-r11, r14 }

	ldr	w0, =0xaaaaaaab		C 3^{-1} mod 2^32
	ldm	vp, { v0,v1,v2 }	C last use of vp, r3 becomes u0
	mov	cy0, #0
	mov	cy1, #0
	mov	cy2, #0

C Tricky n mod 6
C
C The product n * 3^{-1} mod 2^32 carries n mod 3 in bits 31:30 (as long as
C n/3 stays below 2^30) and a parity bit in bit 0.  After the mask, the
C rotate right by 28 moves bits 31:30 to bits 3:2 and bit 0 to bit 4, which
C gives a byte offset of 0, 4, 8, 16, 20 or 24 into the seven-entry table
C below.  Offset 12 is never produced; its slot holds udf (PIC) or 0.
C
C In ARM state pc reads as the address of the current instruction plus 8,
C so the nop after the dispatching instruction makes the table start at
C exactly the address that pc supplies.
	mul	w0, w0, n		C n * 3^{-1} mod 2^32
	and	w0, w0, #0xc0000001	C pseudo-CRT mod 3,2
	sub	n, n, #3		C bias for the subs/bge test at L(lo2)
ifdef(`PIC',`
	add	pc, pc, w0, ror $28
	nop
	b	L(b0)
	b	L(b2)
	b	L(b4)
	.word	0xe7f000f0	C udf
	b	L(b3)
	b	L(b5)
	b	L(b1)
',`
	ldr	pc, [pc, w0, ror $28]
	nop
	.word	L(b0), L(b2), L(b4), 0, L(b3), L(b5), L(b1)
')

C Entry stubs, one per residue of n mod 6.  Each stub biases rp and up so
C that the fixed offsets used at its loop entry point address rp[0] and
C up[0], then preloads the two result limbs and the one multiplicand limb
C which that entry point expects to find in registers.

L(b5):	add	up, up, #-8
	ldr	w1, [rp, #0]
	ldr	w2, [rp, #4]
	ldr	u1, [up, #8]
	b	L(lo5)

L(b4):	add	rp, rp, #-4
	add	up, up, #-12
	ldr	w2, [rp, #4]
	ldr	w0, [rp, #8]
	ldr	u0, [up, #12]
	b	L(lo4)

L(b3):	add	rp, rp, #-8
	add	up, up, #-16
	ldr	w0, [rp, #8]
	ldr	w1, [rp, #12]
	ldr	u1, [up, #16]
	b	L(lo3)

L(b1):	add	rp, rp, #8
	ldr	w2, [rp, #-8]
	ldr	w0, [rp, #-4]
	ldr	u1, [up, #0]
	b	L(lo1)

L(b0):	add	rp, rp, #4
	add	up, up, #-4
	ldr	w0, [rp, #-4]
	ldr	w1, [rp, #0]
	ldr	u0, [up, #4]
	b	L(lo0)

C The n mod 6 = 2 stub has no branch, it falls through into L(top).
L(b2):	add	rp, rp, #12
	add	up, up, #4
	ldr	w1, [rp, #-12]
	ldr	w2, [rp, #-8]
	ldr	u0, [up, #-4]

C Main loop, six limbs of up per pass.  Each stage loads the next result
C limb and the next multiplicand limb, accumulates the current multiplicand
C limb times v0, v1, v2 into three consecutive result limbs with carries
C cy0, cy1, cy2, and stores the lowest of those limbs, which is then final.
	ALIGN(16)
L(top):	ldr	w0, [rp, #-4]
	umaal	w1, cy0, u0, v0
	ldr	u1, [up, #0]
	umaal	w2, cy1, u0, v1
	str	w1, [rp, #-12]
	umaal	w0, cy2, u0, v2
L(lo1):	ldr	w1, [rp, #0]
	umaal	w2, cy0, u1, v0
	ldr	u0, [up, #4]
	umaal	w0, cy1, u1, v1
	str	w2, [rp, #-8]
	umaal	w1, cy2, u1, v2
L(lo0):	ldr	w2, [rp, #4]
	umaal	w0, cy0, u0, v0
	ldr	u1, [up, #8]
	umaal	w1, cy1, u0, v1
	str	w0, [rp, #-4]
	umaal	w2, cy2, u0, v2
L(lo5):	ldr	w0, [rp, #8]
	umaal	w1, cy0, u1, v0
	ldr	u0, [up, #12]
	umaal	w2, cy1, u1, v1
	str	w1, [rp, #0]
	umaal	w0, cy2, u1, v2
L(lo4):	ldr	w1, [rp, #12]
	umaal	w2, cy0, u0, v0
	ldr	u1, [up, #16]
	umaal	w0, cy1, u0, v1
	str	w2, [rp, #4]
	umaal	w1, cy2, u0, v2
L(lo3):	ldr	w2, [rp, #16]
	umaal	w0, cy0, u1, v0
	ldr	u0, [up, #20]
	umaal	w1, cy1, u1, v1
	str	w0, [rp, #8]
	umaal	w2, cy2, u1, v2
L(lo2):	subs	n, n, #6
	add	up, up, #24
	add	rp, rp, #24
	bge	L(top)

C Tail: the last two multiplicand limbs.  u0 is already in a register and
C u1 is loaded here.  No further rp limbs are read; the positions above the
C loaded window start from zero or from a carry register instead.
L(end):	umaal	w1, cy0, u0, v0
	ldr	u1, [up, #0]
	umaal	w2, cy1, u0, v1
	str	w1, [rp, #-12]
	mov	w0, #0			C nothing from rp to add at this limb
	umaal	w0, cy2, u0, v2
	umaal	w2, cy0, u1, v0
	umaal	w0, cy1, u1, v1
	str	w2, [rp, #-8]
	umaal	cy1, cy2, u1, v2	C cy2:cy1 = u1*v2 + cy1 + cy2
	adds	w0, w0, cy0		C fold the three carries upwards
	str	w0, [rp, #-4]
	adcs	w1, cy1, #0
	str	w1, [rp, #0]
	adc	r0, cy2, #0		C return value: most significant limb

	pop	{ r4-r11, pc }
EPILOGUE()