dnl  x86 generic mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C            cycles/crossproduct  cycles/triangleproduct
C P5
C P6
C K6
C K7
C P4

C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);

C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
C lot of function call overheads are avoided, especially when the size is
C small.
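
C Roughly, the cross products src[i]*src[j] (i<j) are formed first, then
C doubled, and then the squares src[i]^2 added along the diagonal.  As an
C illustrative sketch only, in terms of the public mpn calls rather than
C the exact generic code, and ignoring the size==1 and size==2 special
C cases handled below:
C
C	dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
C	for (i = 1; i < size-1; i++)
C	  dst[size+i] = mpn_addmul_1 (dst+2*i+1, src+i+1, size-1-i, src[i]);
C	dst[2*size-1] = mpn_lshift (dst+1, dst+1, 2*size-2, 1);
C	/* then add src[i]^2 into dst[2*i],dst[2*i+1] for i=0..size-1,
C	   dst[0] simply receiving the low limb of src[0]^2 */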

C The mul1 loop is not unrolled the way mul_1.asm is; it doesn't seem worth
C the code size to do so here.

C Enhancements:
C
C The addmul loop here is also not unrolled the way aorsmul_1.asm and
C mul_basecase.asm are.  Perhaps it should be.  It'd add to the complexity,
C but if unrolling is worth doing in those other places then it should be
C worthwhile here too.

C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6)
C might be worth considering.  That'd add quite a bit to the code size, but
C only as much as is used would be dragged into L1 cache.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

        TEXT
        ALIGN(8)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

        movl    PARAM_SIZE, %edx

        movl    PARAM_SRC, %eax

        cmpl    $2, %edx
        movl    PARAM_DST, %ecx

        je      L(two_limbs)
        ja      L(three_or_more)


C -----------------------------------------------------------------------------
C one limb only
C eax   src
C ebx
C ecx   dst
C edx

        movl    (%eax), %eax
        mull    %eax
        movl    %eax, (%ecx)
        movl    %edx, 4(%ecx)
        ret


C -----------------------------------------------------------------------------
        ALIGN(8)
L(two_limbs):
C eax   src
C ebx
C ecx   dst
C edx
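
C The 2x2 square is src[0]^2 + 2*src[0]*src[1]*B + src[1]^2*B^2 (B=2^32):
C the three products are formed below and the cross product src[0]*src[1]
C is added in twice.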

        pushl   %ebx
        pushl   %ebp

        movl    %eax, %ebx
        movl    (%eax), %eax

        mull    %eax            C src[0]^2

        pushl   %esi
        pushl   %edi

        movl    %edx, %esi      C dst[1]
        movl    %eax, (%ecx)    C dst[0]

        movl    4(%ebx), %eax
        mull    %eax            C src[1]^2

        movl    %eax, %edi      C dst[2]
        movl    %edx, %ebp      C dst[3]

        movl    (%ebx), %eax
        mull    4(%ebx)         C src[0]*src[1]

        addl    %eax, %esi
        adcl    %edx, %edi
        adcl    $0, %ebp

        addl    %esi, %eax
        adcl    %edi, %edx
        movl    %eax, 4(%ecx)
        adcl    $0, %ebp

        movl    %edx, 8(%ecx)
        movl    %ebp, 12(%ecx)

        popl    %edi
        popl    %esi

        popl    %ebp
        popl    %ebx

        ret


C -----------------------------------------------------------------------------
        ALIGN(8)
L(three_or_more):
deflit(`FRAME',0)
C eax   src
C ebx
C ecx   dst
C edx   size

        pushl   %ebx    FRAME_pushl()
        pushl   %edi    FRAME_pushl()

        pushl   %esi    FRAME_pushl()
        pushl   %ebp    FRAME_pushl()

        leal    (%ecx,%edx,4), %edi     C &dst[size], end of this mul1
        leal    (%eax,%edx,4), %esi     C &src[size]

C First multiply src[0]*src[1..size-1] and store at dst[1..size].
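
C In C terms this pass is roughly the following, an illustrative sketch
C only, assuming 32-bit limbs so a 64-bit type can hold a full product:
C
C	uint64_t cy = 0;
C	for (i = 1; i < size; i++)
C	  {
C	    uint64_t p = (uint64_t) src[0] * src[i] + cy;
C	    dst[i] = (uint32_t) p;	/* low limb */
C	    cy = p >> 32;		/* carry limb */
C	  }
C	dst[size] = (uint32_t) cy;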

        movl    (%eax), %ebp    C src[0], multiplier
        movl    %edx, %ecx

        negl    %ecx            C -size
        xorl    %ebx, %ebx      C clear carry limb

        incl    %ecx            C -(size-1)

L(mul1):
C eax   scratch
C ebx   carry
C ecx   counter, limbs, negative
C edx   scratch
C esi   &src[size]
C edi   &dst[size]
C ebp   multiplier

        movl    (%esi,%ecx,4), %eax
        mull    %ebp
        addl    %eax, %ebx
        adcl    $0, %edx
        movl    %ebx, (%edi,%ecx,4)
        movl    %edx, %ebx
        incl    %ecx
        jnz     L(mul1)

        movl    %ebx, (%edi)


C Add in the products src[n]*src[n+1..size-1] at dst[2*n+1...], for
C n=1..size-2.

C The last product, src[size-2]*src[size-1], is the end corner of the
C product triangle and is handled separately at the end to save looping
C overhead.  If size is 3 then it's the only one that needs to be done.

C In the outer loop %esi is a constant, and %edi just advances by 1
C limb each time.  The size of the operation decreases by 1 limb
C each time.
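
C An illustrative C sketch of the whole triangle of addmuls, including the
C corner (assuming 32-bit limbs, not the exact generic code):
C
C	for (n = 1; n <= size-2; n++)
C	  {
C	    uint64_t cy = 0;
C	    for (j = n+1; j < size; j++)
C	      {
C	        uint64_t p = (uint64_t) src[n] * src[j] + cy + dst[n+j];
C	        dst[n+j] = (uint32_t) p;
C	        cy = p >> 32;
C	      }
C	    dst[n+size] = (uint32_t) cy;
C	  }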

C eax
C ebx   carry (needing carry flag added)
C ecx
C edx
C esi   &src[size]
C edi   &dst[size]
C ebp

        movl    PARAM_SIZE, %ecx
        subl    $3, %ecx
        jz      L(corner)

        negl    %ecx

dnl  re-use parameter space
define(VAR_OUTER,`PARAM_DST')

L(outer):
C eax
C ebx
C ecx   outer loop counter, -(size-3) to -1
C edx
C esi   &src[size]
C edi   dst, pointing at stored carry limb of previous loop
C ebp

        movl    %ecx, VAR_OUTER
        addl    $4, %edi                C advance dst end

        movl    -8(%esi,%ecx,4), %ebp   C next multiplier
        subl    $1, %ecx

        xorl    %ebx, %ebx              C initial carry limb

L(inner):
C eax   scratch
C ebx   carry (needing carry flag added)
C ecx   counter, -n-1 to -1
C edx   scratch
C esi   &src[size]
C edi   dst end of this addmul
C ebp   multiplier

        movl    (%esi,%ecx,4), %eax
        mull    %ebp
        addl    %ebx, %eax
        adcl    $0, %edx
        addl    %eax, (%edi,%ecx,4)
        adcl    $0, %edx
        movl    %edx, %ebx
        addl    $1, %ecx
        jl      L(inner)

        movl    %ebx, (%edi)
        movl    VAR_OUTER, %ecx
        incl    %ecx
        jnz     L(outer)


L(corner):
C esi   &src[size]
C edi   &dst[2*size-3]

        movl    -4(%esi), %eax
        mull    -8(%esi)                C src[size-1]*src[size-2]
        addl    %eax, 0(%edi)
        adcl    $0, %edx
        movl    %edx, 4(%edi)           C dst high limb


C -----------------------------------------------------------------------------
C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
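
C Roughly, in C terms (an illustrative sketch, 32-bit limbs assumed):
C
C	uint32_t bit = 0;
C	for (i = 1; i <= 2*size-2; i++)
C	  {
C	    uint32_t next = dst[i] >> 31;
C	    dst[i] = (dst[i] << 1) | bit;
C	    bit = next;
C	  }
C	dst[2*size-1] = bit;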

        movl    PARAM_SIZE, %eax
        negl    %eax
        addl    $1, %eax                C -(size-1) and clear carry

L(lshift):
C eax   counter, negative
C ebx   next limb
C ecx
C edx
C esi
C edi   &dst[2*size-3]
C ebp

        rcll    8(%edi,%eax,8)
        rcll    12(%edi,%eax,8)
        incl    %eax
        jnz     L(lshift)

        adcl    %eax, %eax              C high bit out
        movl    %eax, 8(%edi)           C dst most significant limb


C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't been set yet, and it simply gets the low
C limb of src[0]^2.
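
C An illustrative C sketch of this final pass (32-bit limbs assumed, carry
C handling written with 64-bit intermediates):
C
C	uint64_t cy = 0;
C	dst[0] = 0;	/* not yet written, see above */
C	for (i = 0; i < size; i++)
C	  {
C	    uint64_t p = (uint64_t) src[i] * src[i];
C	    uint64_t s = (uint64_t) dst[2*i] + (uint32_t) p + cy;
C	    dst[2*i] = (uint32_t) s;
C	    s = (s >> 32) + dst[2*i+1] + (p >> 32);
C	    dst[2*i+1] = (uint32_t) s;
C	    cy = s >> 32;
C	  }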

        movl    PARAM_SRC, %esi
        movl    (%esi), %eax            C src[0]
        mull    %eax                    C src[0]^2

        movl    PARAM_SIZE, %ecx
        leal    (%esi,%ecx,4), %esi     C src end

        negl    %ecx                    C -size
        movl    %edx, %ebx              C initial carry

        movl    %eax, 12(%edi,%ecx,8)   C dst[0]
        incl    %ecx                    C -(size-1)

L(diag):
C eax   scratch (low product)
C ebx   carry limb
C ecx   counter, -(size-1) to -1
C edx   scratch (high product)
C esi   &src[size]
C edi   &dst[2*size-3]
C ebp   scratch (fetched dst limbs)

        movl    (%esi,%ecx,4), %eax
        mull    %eax

        addl    %ebx, 8(%edi,%ecx,8)
        movl    %edx, %ebx

        adcl    %eax, 12(%edi,%ecx,8)
        adcl    $0, %ebx

        incl    %ecx
        jnz     L(diag)

        addl    %ebx, 8(%edi)           C dst most significant limb

        popl    %ebp
        popl    %esi

        popl    %edi
        popl    %ebx

        ret

EPILOGUE()