1 dnl ARM v6 mpn_sqr_basecase.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright
2012, 2013, 2015 Free Software Foundation
, Inc.
7 dnl
This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
10 dnl it under the terms of
either:
12 dnl
* the GNU Lesser General
Public License as published by the Free
13 dnl Software Foundation
; either version 3 of the License, or (at your
14 dnl option
) any later version.
18 dnl
* the GNU General
Public License as published by the Free Software
19 dnl Foundation
; either version 2 of the License, or (at your option) any
22 dnl
or both
in parallel
, as here.
24 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
25 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
26 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
29 dnl You should have received copies of the GNU General
Public License
and the
30 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
31 dnl see
https://www.gnu.
org/licenses
/.
33 include(`..
/config.m4
')
38 C m_2(0m4) m_2(2m4) m_2(1m4) m_2(3m4)
43 C ____________ ____________
46 C am_2(3m4) am_2(1m4) am_2(0m4) am_2(2m4)
48 C \____________/ \____________/
58 C * Align more labels.
59 C * Further tweak counter and updates in outer loops. (This could save
61 C * Avoid sub-with-lsl in outer loops. We could keep n up-shifted, then
62 C initialise loop counter i with a right shift.
63 C * Try to use fewer registers. Perhaps coalesce r9 branch target and n_saved.
64 C (This could save 2-3 cycles for n > 4.)
65 C * Optimise sqr_diag_addlsh1 loop. The current code uses old-style carry
67 C * Stop loops earlier suppressing writes of upper-most rp[] values.
68 C * The addmul_2 loops here run well on all cores, but mul_2 runs poorly
69 C particularly on Cortex-A8.
C n_saved is an m4 alias for register r14.
C NOTE(review): r14 appears in the push lists visible in this file and
C n_saved is later used as a shifted offset when rewinding up and rp;
C presumably it holds a saved copy of n -- confirm against the full file.
79 define(`n_saved', r14
)
86 PROLOGUE
(mpn_sqr_basecase
)
90 add pc
, pc
, r12
, lsl #
2
102 L
(1m4
): push {r4-r11, r14}
106 add r10
, pc
, #L
(am2_2m4
)-.
-8
111 umull r4
, cya
, v1
, v0
116 L
(3m4
): push {r4-r11, r14}
120 add r10
, pc
, #L
(am2_0m4
)-.
-8
125 umull r4
, cya
, v1
, v0
130 L
(2m4
): push {r4-r11, r14}
134 add r10
, pc
, #L
(am2_3m4
)-.
-8
138 umull r5
, cya
, v1
, v0
143 L
(0m4
): push {r4-r11, r14}
147 add r10
, pc
, #L
(am2_1m4
)-.
-8
152 umull r5
, cya
, v1
, v0
156 L
(top
): ldr u0
, [up
, #
4]
157 umaal r4
, cya
, u1
, v0
160 umaal r5
, cyb
, u1
, v1
161 L
(ko2
): ldr u1
, [up
, #
8]
162 umaal r5
, cya
, u0
, v0
165 umaal r4
, cyb
, u0
, v1
166 L
(ko1
): ldr u0
, [up
, #
12]
167 umaal r4
, cya
, u1
, v0
170 umaal r5
, cyb
, u1
, v1
171 L
(ko0
): ldr u1
, [up
, #
16]!
172 umaal r5
, cya
, u0
, v0
175 umaal r4
, cyb
, u0
, v1
179 umaal r4
, cya
, u1
, v0
181 umaal r5
, cyb
, u1
, v1
183 umaal r5
, cya
, u0
, v0
184 umaal cya
, cyb
, u0
, v1
203 umaal r4
, cya
, v1
, v0
208 L
(ua2
): ldr r5
, [rp
, #
4]
209 umaal r4
, cya
, u1
, v0
211 umaal r5
, cyb
, u1
, v1
214 umaal r5
, cya
, u0
, v0
216 umaal r4
, cyb
, u0
, v1
219 umaal r4
, cya
, u1
, v0
221 umaal r5
, cyb
, u1
, v1
224 umaal r5
, cya
, u0
, v0
226 umaal r4
, cyb
, u0
, v1
231 umaal r4
, cya
, u1
, v0
232 umaal cya
, cyb
, u1
, v1
237 sub rp
, rp
, n
, lsl #
2
238 sub up
, up
, n
, lsl #
2
247 umaal r4
, cya
, v1
, v0
253 L
(ua0
): ldr r5
, [rp
, #
4]
254 umaal r4
, cya
, u1
, v0
256 umaal r5
, cyb
, u1
, v1
259 umaal r5
, cya
, u0
, v0
261 umaal r4
, cyb
, u0
, v1
263 L
(lo0
): ldr r5
, [rp
, #
12]
264 umaal r4
, cya
, u1
, v0
266 umaal r5
, cyb
, u1
, v1
269 umaal r5
, cya
, u0
, v0
271 umaal r4
, cyb
, u0
, v1
276 umaal r4
, cya
, u1
, v0
277 umaal cya
, cyb
, u1
, v1
282 sub rp
, rp
, n
, lsl #
2
283 sub up
, up
, n
, lsl #
2
295 umaal r5
, cya
, v1
, v0
302 L
(ua1
): ldr r5
, [rp
, #
4]
303 umaal r4
, cya
, u1
, v0
305 umaal r5
, cyb
, u1
, v1
307 L
(lo1
): ldr r4
, [rp
, #
8]
308 umaal r5
, cya
, u0
, v0
310 umaal r4
, cyb
, u0
, v1
313 umaal r4
, cya
, u1
, v0
315 umaal r5
, cyb
, u1
, v1
318 umaal r5
, cya
, u0
, v0
320 umaal r4
, cyb
, u0
, v1
325 umaal r4
, cya
, u1
, v0
326 umaal cya
, cyb
, u1
, v1
331 sub rp
, rp
, n
, lsl #
2
332 sub up
, up
, n
, lsl #
2
343 umaal r5
, cya
, v1
, v0
349 L
(ua3
): ldr r5
, [rp
, #
4]
350 umaal r4
, cya
, u1
, v0
352 umaal r5
, cyb
, u1
, v1
355 umaal r5
, cya
, u0
, v0
357 umaal r4
, cyb
, u0
, v1
360 umaal r4
, cya
, u1
, v0
362 umaal r5
, cyb
, u1
, v1
364 L
(lo3
): ldr r4
, [rp
, #
16]!
365 umaal r5
, cya
, u0
, v0
367 umaal r4
, cyb
, u0
, v1
372 umaal r4
, cya
, u1
, v0
373 umaal cya
, cyb
, u1
, v1
378 sub rp
, rp
, n
, lsl #
2
379 sub up
, up
, n
, lsl #
2
384 L
(cor3
):ldm up
, {v0,v1,u0}
388 umaal r5
, cya
, v1
, v0
392 umaal r5
, cya
, u0
, v0
394 umaal r4
, cyb
, u0
, v1
396 umaal r4
, cya
, u1
, v0
397 umaal cya
, cyb
, u1
, v1
403 adds rp
, rp
, #
36 C clear cy
405 umaal cya
, cyb
, u1
, u0
406 b L
(sqr_diag_addlsh1
)
413 umaal r4
, cya
, v1
, v0
415 umaal r5
, cya
, u0
, v0
416 strd r4
, r5
, [rp
, #
-4]
417 umaal cya
, cyb
, u0
, v1
419 C b L
(sqr_diag_addlsh1
)
431 sub up
, up
, n_saved
, lsl #
2
432 sub rp
, rp
, n_saved
, lsl #
3
437 C cmn r0
, #
0 C clear cy
(already clear
)
440 L
(tsd
): adds w0
, w0
, rbx
443 L
(lm
): ldr w0
, [rp
, #
4]
463 C Straight line code for n
<= 4
465 L
(1): ldr r3
, [up
, #
0]
473 umull r3
, r4
, r12
, r12
474 umull r5
, r12
, r5
, r12
481 stm rp
, {r1,r2,r3,r4}
490 umull r10
, r11
, r7
, r8
492 umlal r11
, r12
, r7
, r9
494 umlal r12
, r7
, r8
, r9
505 stm rp
, {r1,r2,r3,r4,r5,r6}
509 L
(4): push {r4-r11, r14}
510 ldm up
, {r9,r10,r11,r12}
512 umull r3
, r4
, r10
, r10
513 umull r5
, r6
, r11
, r11
514 umull r7
, r8
, r12
, r12
515 stm rp
, {r1,r2,r3,r4,r5,r6,r7}
516 umull r1
, r2
, r9
, r10
518 umlal r2
, r3
, r9
, r11
520 umlal r3
, r4
, r9
, r12
522 umlal r3
, r5
, r10
, r11
523 umaal r4
, r5
, r10
, r12
525 umlal r5
, r6
, r11
, r12
534 ldm rp
, {r8,r9,r10,r11,r12,r14}
542 stm rp
, {r1,r2,r3,r4,r5,r6,r7}