beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / s390_32 / esame / sqr_basecase.asm
blobdcc13112bf3b28a95abcb7e4afeede02bef01dcd
1 dnl S/390-32 mpn_sqr_basecase.
3 dnl Copyright 2011 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C z900 ?
35 C z990 23
36 C z9 ?
37 C z10 ?
38 C z196 ?
40 C TODO
41 C * Clean up.
42 C * Stop iterating addmul_1 loop at latest for n = 2, implement longer tail.
43 C This will ask for basecase handling of n = 3.
44 C * Update counters and pointers more straightforwardly, possibly lowering
45 C register usage.
46 C * Should we use this allocation-free style for more sqr_basecase asm
47 C implementations? The only disadvantage is that it requires R != U.
48 C * Replace loops by faster code. The mul_1 and addmul_1 loops could be sped
49 C up by about 10%. The sqr_diag_addlsh1 loop could probably be sped up even
50 C more.
52 C INPUT PARAMETERS
53 define(`rp', `%r2') C result pointer: every product limb is stored through it
54 define(`up', `%r3') C source pointer: every operand limb is loaded through it
55 define(`n', `%r4') C limb count on entry, then reused as a loop counter
C Working registers.  zero is used by the n = 2 and general paths, the
C *_saved aliases only by the general path.  All four are saved with stm and
C restored with lm before returning.
57 define(`zero', `%r8') C constant 0, lets alcr add in only the pending carry
58 define(`rp_saved', `%r9') C copy of rp at entry, walked by the final pass
59 define(`up_saved', `%r13') C copy of up at entry, walked by the final pass
60 define(`n_saved', `%r14') C copy of n - 2 (%r14 is also the br return register)
62 ASM_START()
63 PROLOGUE(mpn_sqr_basecase)
C Squares the n-limb operand at up and stores the 2n-limb result at rp,
C working on 32-bit limbs (u[i] below means the limb at byte offset 4*i).
C Three paths are selected on n:
C   n = 1   one multiply
C   n = 2   three multiplies, straight-line code
C   n > 2   general path in three phases:
C           1. mul_1: rp[1..n] = u[0] * u[1..n-1]
C           2. addmul_1 passes plus a one-product tail: accumulate the
C              remaining off-diagonal products u[i] * u[j], i < j
C           3. sqr_diag_addlsh1: double the accumulated off-diagonal sum
C              and add the squares u[i]^2
C mlr/ml multiply the odd register of an even/odd pair by the operand and
C leave the 64-bit product in the pair (even = high limb, odd = low limb).
C The carry consumed by alcr lives in the condition code, so the loops
C depend on l, st, stm, la, lr, mlr and brct leaving it unchanged between
C the add instructions.  Do not reorder instructions without checking this.
C The result is written while the source is still being read (see the TODO
C note on R != U), so rp and up must not be the same area.
C NOTE(review): n = 0 would fall into the n = 1 path; presumably callers
C guarantee n >= 1 - confirm against the mpn interface.
64 ahi n, -2 C n -= 2; condition code reflects the sign of the result
65 jhe L(ge2) C taken when n - 2 >= 0
67 C n = 1
68 l %r5, 0(up) C %r5 = u[0]
69 mlr %r4, %r5 C %r4:%r5 = u[0] * u[0]
70 st %r5, 0(rp) C rp[0] = low limb
71 st %r4, 4(rp) C rp[1] = high limb
72 br %r14
74 L(ge2): jne L(gen) C condition code is still from ahi: nonzero means n > 2
76 C n = 2
77 stm %r6, %r8, 24(%r15) C save %r6-%r8 at 24(%r15); restored by lm below
78 lhi zero, 0
80 l %r5, 0(up)
81 mlr %r4, %r5 C u0 * u0
82 l %r1, 4(up)
83 mlr %r0, %r1 C u1 * u1
84 st %r5, 0(rp) C rp[0] = low limb of u0 * u0, final as is
86 l %r7, 0(up)
87 ml %r6, 4(up) C u0 * u1
88 alr %r7, %r7 C double the cross product %r6:%r7 ...
89 alcr %r6, %r6
90 alcr %r0, zero C ... carry out of the doubling goes into the top limb
92 alr %r4, %r7 C add doubled cross product to high(u0*u0) : low(u1*u1)
93 alcr %r1, %r6
94 alcr %r0, zero C propagate the last carry into the top limb
95 st %r4, 4(rp)
96 st %r1, 8(rp)
97 st %r0, 12(rp)
99 lm %r6, %r8, 24(%r15)
100 br %r14
102 L(gen):
103 C mul_1 =======================================================================
C On entry n holds (original n) - 2, which is >= 1 here.
C Writes rp[1..] = u[0] * u[1..]: the u[1] product is formed before the
C loop, the loop handles the remaining n limbs, and the final high limb is
C stored after the loop.  %r10 carries the high limb of the previous
C product, %r12 is the byte index shared by up and rp.
105 stm %r6, %r14, 24(%r15) C save %r6-%r14, including the return address %r14
106 lhi zero, 0
107 lr up_saved, up C keep the entry pointers and count for phase 3
108 lr rp_saved, rp
109 lr n_saved, n C n_saved = original n - 2
111 l %r6, 0(up) C %r6 = u[0], the multiplier for this pass
112 l %r11, 4(up)
113 lhi %r12, 8 C init index register
114 mlr %r10, %r6 C %r10:%r11 = u[1] * u[0]
115 lr %r5, n C inner loop count
116 st %r11, 4(rp) C rp[1] = low limb; the high limb stays in %r10
117 cr %r15, %r15 C clear carry flag
119 L(tm): l %r1, 0(%r12,up)
120 mlr %r0, %r6 C %r0:%r1 = u[i] * u[0]
121 alcr %r1, %r10 C low limb + previous high limb + pending carry
122 lr %r10, %r0 C copy high part to carry limb
123 st %r1, 0(%r12,rp)
124 la %r12, 4(%r12) C index += 4 bytes; la leaves the carry untouched
125 brct %r5, L(tm)
127 alcr %r0, zero C fold the last carry into the final high limb
128 st %r0, 0(%r12,rp) C top limb of this pass (also left in %r0)
130 C addmul_1 loop ===============================================================
C Each outer pass advances up by one limb and rp by two limbs, then adds
C u[k] * (the limbs above u[k]) into rp.  n is both the outer trip count
C and the inner loop count for the current pass, so passes get shorter.
132 ahi n, -1 C n = original n - 3
133 je L(outer_end) C original n = 3: only the tail product remains
134 L(outer_loop):
136 la rp, 8(rp) C rp += 2
137 la up, 4(up) C up += 1
138 l %r6, 0(up) C %r6 = multiplier for this pass
139 l %r11, 4(up)
140 lhi %r12, 8 C init index register
141 mlr %r10, %r6 C %r10:%r11 = first product of this pass
142 lr %r5, n C inner loop count
143 al %r11, 4(rp) C add existing rp limb; this sets the carry for L(tam)
144 st %r11, 4(rp)
146 L(tam): l %r1, 0(%r12,up)
147 l %r7, 0(%r12,rp) C existing result limb to accumulate into
148 mlr %r0, %r6
149 alcr %r1, %r7 C low limb + existing limb + carry from previous add
150 alcr %r0, zero C move that carry-out into the high limb
151 alr %r1, %r10 C add previous high limb; carry is consumed next round
152 lr %r10, %r0 C high limb becomes the carry limb for the next round
153 st %r1, 0(%r12,rp)
154 la %r12, 4(%r12)
155 brct %r5, L(tam)
157 alcr %r0, zero C fold the last carry into the final high limb
158 st %r0, 0(%r12,rp) C top limb of this pass (also left in %r0)
160 brct n, L(outer_loop)
161 L(outer_end):
C Tail: one remaining product of the two topmost source limbs, added onto
C the top limb written by the preceding pass (still held in %r0).
163 l %r6, 4(up)
164 l %r1, 8(up)
165 lr %r7, %r0 C Same as: l %r7, 12(,rp)
166 mlr %r0, %r6 C %r0:%r1 = product of the two top limbs
167 alr %r1, %r7
168 alcr %r0, zero C cannot carry out: a 64-bit product plus one limb fits
169 st %r1, 12(rp)
170 st %r0, 16(rp)
172 C sqr_dia_addlsh1 ============================================================
C Final pass over the whole result: rp[0] gets the low limb of u[0]^2, then
C each round doubles two limbs of the off-diagonal sum and adds the high
C limb of the previous square and the low limb of the current square.
C Runs (original n) - 1 rounds, then stores the top limb.
174 define(`up', `up_saved') C from here on up and rp name the saved entry values
175 define(`rp', `rp_saved')
176 la n, 1(n_saved) C n = n_saved + 1 = original n - 1; la keeps the carry
178 l %r1, 0(up)
179 mlr %r0, %r1 C %r0:%r1 = u[0] * u[0]
180 st %r1, 0(rp) C rp[0] is final; %r0 is the carry limb into the loop
181 C clr %r15, %r15 C clear carry (already clear per above)
C No instruction between the tail alcr above and L(top) sets the condition
C code, and that alcr cannot carry out, so the carry is clear on loop entry.
183 L(top): l %r11, 4(up) C next source limb u[i]
184 la up, 4(up)
185 l %r6, 4(rp) C two limbs of the off-diagonal sum
186 l %r7, 8(rp)
187 mlr %r10, %r11 C %r10:%r11 = u[i] * u[i]
188 alcr %r6, %r6 C double, taking in the carry left by the previous round
189 alcr %r7, %r7
190 alcr %r10, zero C propagate carry to high product limb
191 alr %r6, %r0 C add high limb of the previous square
192 alcr %r7, %r11 C add low limb of this square; carry goes to next round
193 stm %r6, %r7, 4(rp) C store both finished limbs at 4(rp) and 8(rp)
194 la rp, 8(rp)
195 lr %r0, %r10 C copy carry limb
196 brct n, L(top)
198 alcr %r0, zero C fold the last carry into the top limb
199 st %r0, 4(rp) C topmost limb of the result
201 lm %r6, %r14, 24(%r15) C restore %r6-%r14, including the return address
202 br %r14
203 EPILOGUE()