dnl  ARM v6 mpn_sqr_basecase.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C Code structure:
C
C            m_2(0m4)   m_2(2m4)         m_2(1m4)   m_2(3m4)
C               |          |                |          |
C               |          |                |          |
C               |          |                |          |
C              \|/        \|/              \|/        \|/
C               ____________                ____________
C              /            \              /            \
C            \|/             \           \|/             \
C         am_2(3m4)      am_2(1m4)    am_2(0m4)      am_2(2m4)
C               \           /|\             \           /|\
C                \____________/              \____________/
C                        \                          /
C                         \                        /
C                          \                      /
C                         cor3                cor2
C                              \              /
C                               \            /
C                             sqr_diag_addlsh1

C TODO
C  * Align more labels.
C  * Further tweak counter and updates in outer loops.  (This could save
C    perhaps 5n cycles.)
C  * Avoid sub-with-lsl in outer loops.  We could keep n up-shifted, then
C    initialise the loop counter i with a right shift.
C  * Try to use fewer registers.  Perhaps coalesce the r9 branch target and
C    n_saved.  (This could save 2-3 cycles for n > 4.)
C  * Optimise the sqr_diag_addlsh1 loop.  The current code uses old-style
C    carry propagation.
C  * Stop loops earlier, suppressing writes of the upper-most rp[] values.
C  * The addmul_2 loops here run well on all cores, but mul_2 runs poorly,
C    particularly on Cortex-A8.

define(`rp', r0)
define(`up', r1)
define(`n',  r2)

define(`v0', r3)
define(`v1', r6)
define(`i',  r8)
define(`n_saved', r14)
define(`cya', r11)
define(`cyb', r12)
define(`u0', r7)
define(`u1', r9)
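
C Register roles, per the defines above: v0,v1 hold the two multiplier limbs
C of the current (add)mul_2 pass, u0,u1 stream in the remaining operand limbs,
C cya,cyb carry the two umaal accumulation chains, i counts the inner loop,
C and n_saved keeps the original n for the final sqr_diag_addlsh1 pass.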

ASM_START()
PROLOGUE(mpn_sqr_basecase)
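C Dispatch on the operand size: r12 = n mod 4, plus 4 when n > 4, indexes the
C branch table below, so n <= 4 goes to the straight-line code at the end of
C the file and larger n enters the L(?m4) feed-in matching n mod 4.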
	and	r12, n, #3
	cmp	n, #4
	addgt	r12, r12, #4
	add	pc, pc, r12, lsl #2
	nop
	b	L(4)
	b	L(1)
	b	L(2)
	b	L(3)
	b	L(0m4)
	b	L(1m4)
	b	L(2m4)
	b	L(3m4)
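
C The four blocks below are the mul_2 feed-ins, one per residue of n mod 4.
C Each one computes the first cross product, primes the cya/cyb carry chains,
C loads r10 with the addmul_2 entry point to branch to after the first pass,
C and jumps into the matching entry point of the mul_2 loop at L(top).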
L(1m4):	push	{r4-r11, r14}
	mov	n_saved, n
	sub	i, n, #4
	sub	n, n, #2
	add	r10, pc, #L(am2_2m4)-.-8
	ldm	up, {v0,v1,u0}
	sub	up, up, #4
	mov	cyb, #0
	mov	r5, #0
	umull	r4, cya, v1, v0
	str	r4, [rp], #-12
	mov	r4, #0
	b	L(ko0)

L(3m4):	push	{r4-r11, r14}
	mov	n_saved, n
	sub	i, n, #4
	sub	n, n, #2
	add	r10, pc, #L(am2_0m4)-.-8
	ldm	up, {v0,v1,u0}
	add	up, up, #4
	mov	cyb, #0
	mov	r5, #0
	umull	r4, cya, v1, v0
	str	r4, [rp], #-4
	mov	r4, #0
	b	L(ko2)

L(2m4):	push	{r4-r11, r14}
	mov	n_saved, n
	sub	i, n, #4
	sub	n, n, #2
	add	r10, pc, #L(am2_3m4)-.-8
	ldm	up, {v0,v1,u1}
	mov	cyb, #0
	mov	r4, #0
	umull	r5, cya, v1, v0
	str	r5, [rp], #-8
	mov	r5, #0
	b	L(ko1)

L(0m4):	push	{r4-r11, r14}
	mov	n_saved, n
	sub	i, n, #4
	sub	n, n, #2
	add	r10, pc, #L(am2_1m4)-.-8
	ldm	up, {v0,v1,u1}
	mov	cyb, #0
	mov	r4, #0
	add	up, up, #8
	umull	r5, cya, v1, v0
	str	r5, [rp, #0]
	mov	r5, #0
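
C Main mul_2 loop: computes the first two product rows, rp[] = up[] * {v0,v1},
C four limbs per iteration, with umaal folding the cya/cyb carry chains into
C the running sums.  L(ko0)/L(ko1)/L(ko2) are the mid-loop entry points used
C by the feed-in blocks above.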
L(top):	ldr	u0, [up, #4]
	umaal	r4, cya, u1, v0
	str	r4, [rp, #4]
	mov	r4, #0
	umaal	r5, cyb, u1, v1
L(ko2):	ldr	u1, [up, #8]
	umaal	r5, cya, u0, v0
	str	r5, [rp, #8]
	mov	r5, #0
	umaal	r4, cyb, u0, v1
L(ko1):	ldr	u0, [up, #12]
	umaal	r4, cya, u1, v0
	str	r4, [rp, #12]
	mov	r4, #0
	umaal	r5, cyb, u1, v1
L(ko0):	ldr	u1, [up, #16]!
	umaal	r5, cya, u0, v0
	str	r5, [rp, #16]!
	mov	r5, #0
	umaal	r4, cyb, u0, v1
	subs	i, i, #4
	bhi	L(top)

	umaal	r4, cya, u1, v0
	ldr	u0, [up, #4]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #4]
	umaal	r5, cya, u0, v0
	umaal	cya, cyb, u0, v1
	str	r5, [rp, #8]
	str	cya, [rp, #12]
	str	cyb, [rp, #16]

	add	up, up, #4
	sub	n, n, #1
	add	rp, rp, #8
	bx	r10
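
C Outer addmul_2 loops.  Each pass loads the next two multiplier limbs into
C v0,v1, adds up[] * {v0,v1} into the existing rp[] contents, and retires two
C limbs of the remaining work; the first pass is entered through the
C L(am2_?m4) label selected via r10 above.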
L(evnloop):
	subs	i, n, #6
	sub	n, n, #2
	blt	L(cor2)
	ldm	up, {v0,v1,u1}
	add	up, up, #8
	mov	cya, #0
	mov	cyb, #0
	ldr	r4, [rp, #-4]
	umaal	r4, cya, v1, v0
	str	r4, [rp, #-4]
	ldr	r4, [rp, #0]

	ALIGN(16)
L(ua2):	ldr	r5, [rp, #4]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #4]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #0]
	ldr	r4, [rp, #8]
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #8]
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #4]
	ldr	r5, [rp, #12]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #12]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #8]
	ldr	r4, [rp, #16]!
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #16]!
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #-4]
	subs	i, i, #4
	bhs	L(ua2)

	umaal	r4, cya, u1, v0
	umaal	cya, cyb, u1, v1
	str	r4, [rp, #0]
	str	cya, [rp, #4]
	str	cyb, [rp, #8]
L(am2_0m4):
	sub	rp, rp, n, lsl #2
	sub	up, up, n, lsl #2
	add	rp, rp, #8

	sub	i, n, #4
	sub	n, n, #2
	ldm	up, {v0,v1,u1}
	mov	cya, #0
	mov	cyb, #0
	ldr	r4, [rp, #4]
	umaal	r4, cya, v1, v0
	str	r4, [rp, #4]
	ldr	r4, [rp, #8]
	b	L(lo0)

	ALIGN(16)
L(ua0):	ldr	r5, [rp, #4]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #4]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #0]
	ldr	r4, [rp, #8]
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #8]
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #4]
L(lo0):	ldr	r5, [rp, #12]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #12]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #8]
	ldr	r4, [rp, #16]!
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #16]!
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #-4]
	subs	i, i, #4
	bhs	L(ua0)

	umaal	r4, cya, u1, v0
	umaal	cya, cyb, u1, v1
	str	r4, [rp, #0]
	str	cya, [rp, #4]
	str	cyb, [rp, #8]
L(am2_2m4):
	sub	rp, rp, n, lsl #2
	sub	up, up, n, lsl #2
	add	rp, rp, #16
	b	L(evnloop)
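
C Same addmul_2 structure for the other two residue classes; L(lo1) and L(lo3)
C are the mid-loop entry points of the L(ua1) and L(ua3) loops.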
L(oddloop):
	sub	i, n, #5
	sub	n, n, #2
	ldm	up, {v0,v1,u0}
	mov	cya, #0
	mov	cyb, #0
	ldr	r5, [rp, #0]
	umaal	r5, cya, v1, v0
	str	r5, [rp, #0]
	ldr	r5, [rp, #4]
	add	up, up, #4
	b	L(lo1)

	ALIGN(16)
L(ua1):	ldr	r5, [rp, #4]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #4]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #0]
L(lo1):	ldr	r4, [rp, #8]
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #8]
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #4]
	ldr	r5, [rp, #12]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #12]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #8]
	ldr	r4, [rp, #16]!
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #16]!
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #-4]
	subs	i, i, #4
	bhs	L(ua1)

	umaal	r4, cya, u1, v0
	umaal	cya, cyb, u1, v1
	str	r4, [rp, #0]
	str	cya, [rp, #4]
	str	cyb, [rp, #8]
L(am2_3m4):
	sub	rp, rp, n, lsl #2
	sub	up, up, n, lsl #2
	add	rp, rp, #4

	subs	i, n, #3
	beq	L(cor3)
	sub	n, n, #2
	ldm	up, {v0,v1,u0}
	mov	cya, #0
	mov	cyb, #0
	ldr	r5, [rp, #8]
	sub	up, up, #4
	umaal	r5, cya, v1, v0
	str	r5, [rp, #8]
	ldr	r5, [rp, #12]
	b	L(lo3)

	ALIGN(16)
L(ua3):	ldr	r5, [rp, #4]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #4]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #0]
	ldr	r4, [rp, #8]
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #8]
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #4]
	ldr	r5, [rp, #12]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #12]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #8]
L(lo3):	ldr	r4, [rp, #16]!
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #16]!
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #-4]
	subs	i, i, #4
	bhs	L(ua3)

	umaal	r4, cya, u1, v0
	umaal	cya, cyb, u1, v1
	str	r4, [rp, #0]
	str	cya, [rp, #4]
	str	cyb, [rp, #8]
L(am2_1m4):
	sub	rp, rp, n, lsl #2
	sub	up, up, n, lsl #2
	add	rp, rp, #12
	b	L(oddloop)
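
C L(cor3): wind-down when three multiplier limbs remain.  The last cross
C products are formed directly, after which control falls into the diagonal
C pass.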
L(cor3):ldm	up, {v0,v1,u0}
	ldr	r5, [rp, #8]
	mov	cya, #0
	mov	cyb, #0
	umaal	r5, cya, v1, v0
	str	r5, [rp, #8]
	ldr	r5, [rp, #12]
	ldr	r4, [rp, #16]
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #12]
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #12]
	umaal	r4, cya, u1, v0
	umaal	cya, cyb, u1, v1
	str	r4, [rp, #16]
	str	cya, [rp, #20]
	str	cyb, [rp, #24]
	add	up, up, #16
	mov	cya, cyb
	adds	rp, rp, #36	C clear cy
	mov	cyb, #0
	umaal	cya, cyb, u1, u0
	b	L(sqr_diag_addlsh1)
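
C L(cor2): wind-down when two multiplier limbs remain; the top words left in
C cya/cyb by the preceding pass serve as the starting values for the final
C cross products.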
L(cor2):
	ldm	up!, {v0,v1,u0}
	mov	r4, cya
	mov	r5, cyb
	mov	cya, #0
	umaal	r4, cya, v1, v0
	mov	cyb, #0
	umaal	r5, cya, u0, v0
	strd	r4, r5, [rp, #-4]
	umaal	cya, cyb, u0, v1
	add	rp, rp, #16
C	b	L(sqr_diag_addlsh1)

define(`w0', r6)
define(`w1', r7)
define(`w2', r8)
define(`rbx', r9)
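
C Final pass: rewind up and rp to their starts, then for each limb double the
C accumulated cross products (the addlsh1 part, done with an adcs chain) and
C add in the diagonal square up[i]^2 computed with umull.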
L(sqr_diag_addlsh1):
	str	cya, [rp, #-12]
	str	cyb, [rp, #-8]
	sub	n, n_saved, #1
	sub	up, up, n_saved, lsl #2
	sub	rp, rp, n_saved, lsl #3
	ldr	r3, [up], #4
	umull	w1, r5, r3, r3
	mov	w2, #0
	mov	r10, #0
C	cmn	r0, #0		C clear cy (already clear)
	b	L(lm)

L(tsd):	adds	w0, w0, rbx
	adcs	w1, w1, r4
	str	w0, [rp, #0]
L(lm):	ldr	w0, [rp, #4]
	str	w1, [rp, #4]
	ldr	w1, [rp, #8]!
	add	rbx, r5, w2
	adcs	w0, w0, w0
	ldr	r3, [up], #4
	adcs	w1, w1, w1
	adc	w2, r10, r10
	umull	r4, r5, r3, r3
	subs	n, n, #1
	bne	L(tsd)

	adds	w0, w0, rbx
	adcs	w1, w1, r4
	adc	w2, r5, w2
	stm	rp, {w0,w1,w2}

	pop	{r4-r11, pc}

C Straight line code for n <= 4
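C These compute the square directly: the limb squares plus, for n > 1, the
C doubled cross products, without the loop machinery above.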

L(1):	ldr	r3, [up, #0]
	umull	r1, r2, r3, r3
	stm	rp, {r1,r2}
	bx	r14

L(2):	push	{r4-r5}
	ldm	up, {r5,r12}
	umull	r1, r2, r5, r5
	umull	r3, r4, r12, r12
	umull	r5, r12, r5, r12
	adds	r5, r5, r5
	adcs	r12, r12, r12
	adc	r4, r4, #0
	adds	r2, r2, r5
	adcs	r3, r3, r12
	adc	r4, r4, #0
	stm	rp, {r1,r2,r3,r4}
	pop	{r4-r5}
	bx	r14

L(3):	push	{r4-r11}
	ldm	up, {r7,r8,r9}
	umull	r1, r2, r7, r7
	umull	r3, r4, r8, r8
	umull	r5, r6, r9, r9
	umull	r10, r11, r7, r8
	mov	r12, #0
	umlal	r11, r12, r7, r9
	mov	r7, #0
	umlal	r12, r7, r8, r9
	adds	r10, r10, r10
	adcs	r11, r11, r11
	adcs	r12, r12, r12
	adcs	r7, r7, r7
	adc	r6, r6, #0
	adds	r2, r2, r10
	adcs	r3, r3, r11
	adcs	r4, r4, r12
	adcs	r5, r5, r7
	adc	r6, r6, #0
	stm	rp, {r1,r2,r3,r4,r5,r6}
	pop	{r4-r11}
	bx	r14

L(4):	push	{r4-r11, r14}
	ldm	up, {r9,r10,r11,r12}
	umull	r1, r2, r9, r9
	umull	r3, r4, r10, r10
	umull	r5, r6, r11, r11
	umull	r7, r8, r12, r12
	stm	rp, {r1,r2,r3,r4,r5,r6,r7}
	umull	r1, r2, r9, r10
	mov	r3, #0
	umlal	r2, r3, r9, r11
	mov	r4, #0
	umlal	r3, r4, r9, r12
	mov	r5, #0
	umlal	r3, r5, r10, r11
	umaal	r4, r5, r10, r12
	mov	r6, #0
	umlal	r5, r6, r11, r12
	adds	r1, r1, r1
	adcs	r2, r2, r2
	adcs	r3, r3, r3
	adcs	r4, r4, r4
	adcs	r5, r5, r5
	adcs	r6, r6, r6
	add	rp, rp, #4
	adc	r7, r8, #0
	ldm	rp, {r8,r9,r10,r11,r12,r14}
	adds	r1, r1, r8
	adcs	r2, r2, r9
	adcs	r3, r3, r10
	adcs	r4, r4, r11
	adcs	r5, r5, r12
	adcs	r6, r6, r14
	adc	r7, r7, #0
	stm	rp, {r1,r2,r3,r4,r5,r6,r7}
	pop	{r4-r11, pc}
EPILOGUE()