beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium / sqr_basecase.asm
blobb11d767da2fe8df40890ad21238252aa1b1c762e
1 dnl Intel P5 mpn_sqr_basecase -- square an mpn number.
3 dnl Copyright 1999-2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
35 C product at around 20x20 limbs.
38 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
40 C Calculate src,size squared, storing the result in dst,2*size.
42 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
43 C lot of function call overheads are avoided, especially when the size is
44 C small.
46 defframe(PARAM_SIZE,12)
47 defframe(PARAM_SRC, 8)
48 defframe(PARAM_DST, 4)
50 TEXT
51 ALIGN(8)
52 PROLOGUE(mpn_sqr_basecase)
53 deflit(`FRAME',0)
55 movl PARAM_SIZE, %edx
56 movl PARAM_SRC, %eax
58 cmpl $2, %edx
59 movl PARAM_DST, %ecx
61 je L(two_limbs)
63 movl (%eax), %eax
64 ja L(three_or_more)
66 C -----------------------------------------------------------------------------
67 C one limb only
68 C eax src
69 C ebx
70 C ecx dst
71 C edx
73 mull %eax
75 movl %eax, (%ecx)
76 movl %edx, 4(%ecx)
78 ret
80 C -----------------------------------------------------------------------------
81 ALIGN(8)
82 L(two_limbs):
83 C eax src
84 C ebx
85 C ecx dst
86 C edx size
88 pushl %ebp
89 pushl %edi
91 pushl %esi
92 pushl %ebx
94 movl %eax, %ebx
95 movl (%eax), %eax
97 mull %eax C src[0]^2
99 movl %eax, (%ecx) C dst[0]
100 movl %edx, %esi C dst[1]
102 movl 4(%ebx), %eax
104 mull %eax C src[1]^2
106 movl %eax, %edi C dst[2]
107 movl %edx, %ebp C dst[3]
109 movl (%ebx), %eax
111 mull 4(%ebx) C src[0]*src[1]
113 addl %eax, %esi
114 popl %ebx
116 adcl %edx, %edi
118 adcl $0, %ebp
119 addl %esi, %eax
121 adcl %edi, %edx
122 movl %eax, 4(%ecx)
124 adcl $0, %ebp
125 popl %esi
127 movl %edx, 8(%ecx)
128 movl %ebp, 12(%ecx)
130 popl %edi
131 popl %ebp
136 C -----------------------------------------------------------------------------
137 ALIGN(8)
138 L(three_or_more):
139 C eax src low limb
140 C ebx
141 C ecx dst
142 C edx size
144 cmpl $4, %edx
145 pushl %ebx
146 deflit(`FRAME',4)
148 movl PARAM_SRC, %ebx
149 jae L(four_or_more)
152 C -----------------------------------------------------------------------------
153 C three limbs
154 C eax src low limb
155 C ebx src
156 C ecx dst
157 C edx size
159 pushl %ebp
160 pushl %edi
162 mull %eax C src[0] ^ 2
164 movl %eax, (%ecx)
165 movl %edx, 4(%ecx)
167 movl 4(%ebx), %eax
168 xorl %ebp, %ebp
170 mull %eax C src[1] ^ 2
172 movl %eax, 8(%ecx)
173 movl %edx, 12(%ecx)
175 movl 8(%ebx), %eax
176 pushl %esi C risk of cache bank clash
178 mull %eax C src[2] ^ 2
180 movl %eax, 16(%ecx)
181 movl %edx, 20(%ecx)
183 movl (%ebx), %eax
185 mull 4(%ebx) C src[0] * src[1]
187 movl %eax, %esi
188 movl %edx, %edi
190 movl (%ebx), %eax
192 mull 8(%ebx) C src[0] * src[2]
194 addl %eax, %edi
195 movl %edx, %ebp
197 adcl $0, %ebp
198 movl 4(%ebx), %eax
200 mull 8(%ebx) C src[1] * src[2]
202 xorl %ebx, %ebx
203 addl %eax, %ebp
205 C eax
206 C ebx zero, will be dst[5]
207 C ecx dst
208 C edx dst[4]
209 C esi dst[1]
210 C edi dst[2]
211 C ebp dst[3]
213 adcl $0, %edx
214 addl %esi, %esi
216 adcl %edi, %edi
218 adcl %ebp, %ebp
220 adcl %edx, %edx
221 movl 4(%ecx), %eax
223 adcl $0, %ebx
224 addl %esi, %eax
226 movl %eax, 4(%ecx)
227 movl 8(%ecx), %eax
229 adcl %edi, %eax
230 movl 12(%ecx), %esi
232 adcl %ebp, %esi
233 movl 16(%ecx), %edi
235 movl %eax, 8(%ecx)
236 movl %esi, 12(%ecx)
238 adcl %edx, %edi
239 popl %esi
241 movl 20(%ecx), %eax
242 movl %edi, 16(%ecx)
244 popl %edi
245 popl %ebp
247 adcl %ebx, %eax C no carry out of this
248 popl %ebx
250 movl %eax, 20(%ecx)
255 C -----------------------------------------------------------------------------
256 ALIGN(8)
257 L(four_or_more):
258 C eax src low limb
259 C ebx src
260 C ecx dst
261 C edx size
262 C esi
263 C edi
264 C ebp
266 C First multiply src[0]*src[1..size-1] and store at dst[1..size].
268 deflit(`FRAME',4)
270 pushl %edi
271 FRAME_pushl()
272 pushl %esi
273 FRAME_pushl()
275 pushl %ebp
276 FRAME_pushl()
277 leal (%ecx,%edx,4), %edi C dst end of this mul1
279 leal (%ebx,%edx,4), %esi C src end
280 movl %ebx, %ebp C src
282 negl %edx C -size
283 xorl %ebx, %ebx C clear carry limb and carry flag
285 leal 1(%edx), %ecx C -(size-1)
287 L(mul1):
288 C eax scratch
289 C ebx carry
290 C ecx counter, negative
291 C edx scratch
292 C esi &src[size]
293 C edi &dst[size]
294 C ebp src
296 adcl $0, %ebx
297 movl (%esi,%ecx,4), %eax
299 mull (%ebp)
301 addl %eax, %ebx
303 movl %ebx, (%edi,%ecx,4)
304 incl %ecx
306 movl %edx, %ebx
307 jnz L(mul1)
310 C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
311 C n=1..size-2.
313 C The last two products, which are the end corner of the product
314 C triangle, are handled separately to save looping overhead. These
315 C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
316 C If size is 4 then it's only these that need to be done.
318 C In the outer loop %esi is a constant, and %edi just advances by 1
319 C limb each time. The size of the operation decreases by 1 limb
320 C each time.
322 C eax
323 C ebx carry (needing carry flag added)
324 C ecx
325 C edx
326 C esi &src[size]
327 C edi &dst[size]
328 C ebp
330 adcl $0, %ebx
331 movl PARAM_SIZE, %edx
333 movl %ebx, (%edi)
334 subl $4, %edx
336 negl %edx
337 jz L(corner)
340 L(outer):
341 C ebx previous carry limb to store
342 C edx outer loop counter (negative)
343 C esi &src[size]
344 C edi dst, pointing at stored carry limb of previous loop
346 pushl %edx C new outer loop counter
347 leal -2(%edx), %ecx
349 movl %ebx, (%edi)
350 addl $4, %edi
352 addl $4, %ebp
353 xorl %ebx, %ebx C initial carry limb, clear carry flag
355 L(inner):
356 C eax scratch
357 C ebx carry (needing carry flag added)
358 C ecx counter, negative
359 C edx scratch
360 C esi &src[size]
361 C edi dst end of this addmul
362 C ebp &src[j]
364 adcl $0, %ebx
365 movl (%esi,%ecx,4), %eax
367 mull (%ebp)
369 addl %ebx, %eax
370 movl (%edi,%ecx,4), %ebx
372 adcl $0, %edx
373 addl %eax, %ebx
375 movl %ebx, (%edi,%ecx,4)
376 incl %ecx
378 movl %edx, %ebx
379 jnz L(inner)
382 adcl $0, %ebx
383 popl %edx C outer loop counter
385 incl %edx
386 jnz L(outer)
389 movl %ebx, (%edi)
391 L(corner):
392 C esi &src[size]
393 C edi &dst[2*size-4]
395 movl -8(%esi), %eax
396 movl -4(%edi), %ebx C risk of data cache bank clash here
398 mull -12(%esi) C src[size-2]*src[size-3]
400 addl %eax, %ebx
401 movl %edx, %ecx
403 adcl $0, %ecx
404 movl -4(%esi), %eax
406 mull -12(%esi) C src[size-1]*src[size-3]
408 addl %ecx, %eax
409 movl (%edi), %ecx
411 adcl $0, %edx
412 movl %ebx, -4(%edi)
414 addl %eax, %ecx
415 movl %edx, %ebx
417 adcl $0, %ebx
418 movl -4(%esi), %eax
420 mull -8(%esi) C src[size-1]*src[size-2]
422 movl %ecx, (%edi)
423 addl %eax, %ebx
425 adcl $0, %edx
426 movl PARAM_SIZE, %eax
428 negl %eax
429 movl %ebx, 4(%edi)
431 addl $1, %eax C -(size-1) and clear carry
432 movl %edx, 8(%edi)
435 C -----------------------------------------------------------------------------
436 C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
438 L(lshift):
439 C eax counter, negative
440 C ebx next limb
441 C ecx
442 C edx
443 C esi
444 C edi &dst[2*size-4]
445 C ebp
447 movl 12(%edi,%eax,8), %ebx
449 rcll %ebx
450 movl 16(%edi,%eax,8), %ecx
452 rcll %ecx
453 movl %ebx, 12(%edi,%eax,8)
455 movl %ecx, 16(%edi,%eax,8)
456 incl %eax
458 jnz L(lshift)
461 adcl %eax, %eax C high bit out
462 movl PARAM_SRC, %esi
464 movl PARAM_SIZE, %ecx C risk of cache bank clash
465 movl %eax, 12(%edi) C dst most significant limb
468 C -----------------------------------------------------------------------------
469 C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
470 C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
471 C low limb of src[0]^2.
473 movl (%esi), %eax C src[0]
474 leal (%esi,%ecx,4), %esi C src end
476 negl %ecx
478 mull %eax
480 movl %eax, 16(%edi,%ecx,8) C dst[0]
481 movl %edx, %ebx
483 addl $1, %ecx C size-1 and clear carry
485 L(diag):
486 C eax scratch (low product)
487 C ebx carry limb
488 C ecx counter, negative
489 C edx scratch (high product)
490 C esi &src[size]
491 C edi &dst[2*size-4]
492 C ebp scratch (fetched dst limbs)
494 movl (%esi,%ecx,4), %eax
495 adcl $0, %ebx
497 mull %eax
499 movl 16-4(%edi,%ecx,8), %ebp
501 addl %ebp, %ebx
502 movl 16(%edi,%ecx,8), %ebp
504 adcl %eax, %ebp
505 movl %ebx, 16-4(%edi,%ecx,8)
507 movl %ebp, 16(%edi,%ecx,8)
508 incl %ecx
510 movl %edx, %ebx
511 jnz L(diag)
514 adcl $0, %edx
515 movl 16-4(%edi), %eax C dst most significant limb
517 addl %eax, %edx
518 popl %ebp
520 movl %edx, 16-4(%edi)
521 popl %esi C risk of cache bank clash
523 popl %edi
524 popl %ebx
528 EPILOGUE()