1 dnl Intel P5 mpn_sqr_basecase
-- square an mpn number.
3 dnl Copyright
1999-2002 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
31 include(`..
/config.m4
')
34 C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
35 C product at around 20x20 limbs.
38 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
40 C Calculate src,size squared, storing the result in dst,2*size.
42 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
43 C lot of function call overheads are avoided, especially when the size is
46 defframe(PARAM_SIZE,12)
47 defframe(PARAM_SRC, 8)
48 defframe(PARAM_DST, 4)
52 PROLOGUE(mpn_sqr_basecase)
66 C
-----------------------------------------------------------------------------
80 C
-----------------------------------------------------------------------------
99 movl
%eax, (%ecx) C dst
[0]
100 movl
%edx, %esi C dst
[1]
106 movl
%eax, %edi C dst
[2]
107 movl
%edx, %ebp C dst
[3]
111 mull
4(%ebx) C src
[0]*src
[1]
136 C
-----------------------------------------------------------------------------
152 C -----------------------------------------------------------------------------
162 mull %eax C src[0] ^ 2
170 mull %eax C src[1] ^ 2
176 pushl %esi C risk of cache bank clash
178 mull %eax C src[2] ^ 2
185 mull 4(%ebx) C src[0] * src[1]
192 mull 8(%ebx) C src[0] * src[2]
200 mull 8(%ebx) C src[1] * src[2]
206 C ebx zero, will be dst[5]
247 adcl %ebx, %eax C no carry out of this
255 C -----------------------------------------------------------------------------
266 C First multiply src[0]*src[1..size-1] and store at dst[1..size].
277 leal
(%ecx,%edx,4), %edi C dst
end of
this mul1
279 leal
(%ebx,%edx,4), %esi C src
end
280 movl
%ebx, %ebp C src
283 xorl
%ebx, %ebx C clear carry limb
and carry flag
285 leal
1(%edx), %ecx C
-(size-1)
290 C
ecx counter
, negative
297 movl
(%esi,%ecx,4), %eax
303 movl
%ebx, (%edi,%ecx,4)
310 C
Add products src
[n
]*src
[n
+1..
size-1] at dst
[2*n
-1...
], for
313 C The last two products
, which are the
end corner of the product
314 C triangle
, are handled separately to save looping overhead. These
315 C are src
[size-3]*src
[size-2,size-1] and src
[size-2]*src
[size-1].
316 C If
size is
4 then it
's only these that need to be done.
318 C In the outer loop %esi is a constant, and %edi just advances by 1
319 C limb each time. The size of the operation decreases by 1 limb
323 C ebx carry (needing carry flag added)
331 movl PARAM_SIZE, %edx
341 C ebx previous carry limb to store
342 C edx outer loop counter (negative)
344 C edi dst, pointing at stored carry limb of previous loop
346 pushl %edx C new outer loop counter
353 xorl %ebx, %ebx C initial carry limb, clear carry flag
357 C ebx carry (needing carry flag added)
358 C ecx counter, negative
361 C edi dst end of this addmul
365 movl (%esi,%ecx,4), %eax
370 movl (%edi,%ecx,4), %ebx
375 movl %ebx, (%edi,%ecx,4)
383 popl %edx C outer loop counter
396 movl -4(%edi), %ebx C risk of data cache bank clash here
398 mull -12(%esi) C src[size-2]*src[size-3]
406 mull -12(%esi) C src[size-1]*src[size-3]
420 mull -8(%esi) C src[size-1]*src[size-2]
426 movl PARAM_SIZE, %eax
431 addl $1, %eax C -(size-1) and clear carry
435 C -----------------------------------------------------------------------------
436 C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
439 C eax counter, negative
447 movl 12(%edi,%eax,8), %ebx
450 movl 16(%edi,%eax,8), %ecx
453 movl %ebx, 12(%edi,%eax,8)
455 movl %ecx, 16(%edi,%eax,8)
461 adcl %eax, %eax C high bit out
464 movl PARAM_SIZE, %ecx C risk of cache bank clash
465 movl %eax, 12(%edi) C dst most significant limb
468 C -----------------------------------------------------------------------------
469 C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
470 C src[size-1]^2. dst[0] hasn't yet been set at all yet
, and just gets the
471 C
low limb of src
[0]^
2.
473 movl
(%esi), %eax C src
[0]
474 leal
(%esi,%ecx,4), %esi C src
end
480 movl
%eax, 16(%edi,%ecx,8) C dst
[0]
483 addl
$1, %ecx C
size-1 and clear carry
486 C
eax scratch
(low product
)
488 C
ecx counter
, negative
489 C
edx scratch
(high product
)
492 C
ebp scratch
(fetched dst limbs
)
494 movl
(%esi,%ecx,4), %eax
499 movl
16-4(%edi,%ecx,8), %ebp
502 movl
16(%edi,%ecx,8), %ebp
505 movl
%ebx, 16-4(%edi,%ecx,8)
507 movl
%ebp, 16(%edi,%ecx,8)
515 movl
16-4(%edi), %eax C dst most significant limb
520 movl
%edx, 16-4(%edi)
521 popl
%esi C risk of cache bank clash