1 dnl S/390-32 mpn_sqr_basecase.
3 dnl Copyright 2011 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
20 dnl or both in parallel, as here.
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
dnl Load config.m4 from the parent directory.  The quoted path must stay on
dnl one line: m4 keeps a line break inside quotes as part of the file name.
31 include(`../config.m4')
42 C * Stop iterating addmul_1 loop at latest for n = 2, implement longer tail.
43 C This will ask for basecase handling of n = 3.
44 C * Update counters and pointers more straightforwardly, possibly lowering
46 C * Should we use this allocation-free style for more sqr_basecase asm
47 C implementations? The only disadvantage is that it requires R != U.
48 C * Replace loops by faster code. The mul_1 and addmul_1 loops could be sped
49 C up by about 10%. The sqr_diag_addlsh1 loop could probably be sped up even
dnl Register aliases.  The quoted expansion text must stay on one line: m4
dnl keeps a line break inside quotes as part of the expansion, which would
dnl split any instruction line that uses the alias.
dnl NOTE(review): presumably these registers hold preserved copies of rp, up
dnl and n; the instructions that load them are not visible here -- confirm.
58 define(`rp_saved', `%r9')
59 define(`up_saved', `%r13')
60 define(`n_saved', `%r14')
63 PROLOGUE(mpn_sqr_basecase)
77 stm %r6, %r8, 24(%r15)
81 mlr %r4, %r5 C u0 * u0
83 mlr %r0, %r1 C u1 * u1
87 ml %r6, 4(up) C u0 * u1
103 C mul_1 =======================================================================
105 stm %r6, %r14, 24(%r15)
113 lhi %r12, 8 C init index register
117 cr %r15, %r15 C clear carry flag
119 L(tm): l %r1, 0(%r12,up)
122 lr %r10, %r0 C copy high part to carry limb
130 C addmul_1 loop ===============================================================
136 la rp, 8(rp) C rp += 2
137 la up, 4(up) C up += 1
140 lhi %r12, 8 C init index register
146 L(tam): l %r1, 0(%r12,up)
160 brct n, L(outer_loop)
165 lr %r7, %r0 C Same as: l %r7, 12(,rp)
172 C sqr_diag_addlsh1 ============================================================
C From here on up and rp expand to up_saved and rp_saved, so the code below
C addresses memory through those registers.  The expansion text is kept on
C one line so no line break ends up inside an instruction operand.
174 define(`up', `up_saved')
175 define(`rp', `rp_saved')
181 C clr %r15, %r15 C clear carry (already clear per above)
183 L(top): l %r11, 4(up)
190 alcr %r10, zero C propagate carry to high product limb
195 lr %r0, %r10 C copy carry limb
201 lm %r6, %r14, 24(%r15)