1 dnl x86 generic mpn_sqr_basecase
-- square an mpn number.
3 dnl Copyright
1999, 2000, 2002, 2003 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
32 include(`..
/config.m4
')
35 C cycles/crossproduct cycles/triangleproduct
43 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
45 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
46 C lot of function call overheads are avoided, especially when the size is
49 C The mul1 loop is not unrolled like mul_1.asm, it doesn't seem worth the
50 C code
size to do so here.
54 C The addmul
loop here is also
not unrolled like aorsmul_1.asm
and
55 C mul_basecase.asm are. Perhaps it should be done. It
'd add to the
56 C complexity, but if it's worth doing
in the other places then it should be
59 C A fully
-unrolled style like other sqr_basecase.asm versions
(k6
, k7
, p6
)
60 C might be worth considering. That
'd add quite a bit to the code size, but
61 C only as much as is used would be dragged into L1 cache.
63 defframe(PARAM_SIZE,12)
64 defframe(PARAM_SRC, 8)
65 defframe(PARAM_DST, 4)
69 PROLOGUE(mpn_sqr_basecase)
83 C
-----------------------------------------------------------------------------
97 C
-----------------------------------------------------------------------------
116 movl
%edx, %esi C dst
[1]
117 movl
%eax, (%ecx) C dst
[0]
122 movl
%eax, %edi C dst
[2]
123 movl
%edx, %ebp C dst
[3]
126 mull
4(%ebx) C src
[0]*src
[1]
152 C
-----------------------------------------------------------------------------
161 pushl %ebx FRAME_pushl()
162 pushl %edi FRAME_pushl()
164 pushl %esi FRAME_pushl()
165 pushl %ebp FRAME_pushl()
167 leal (%ecx,%edx,4), %edi C &dst[size], end of this mul1
168 leal (%eax,%edx,4), %esi C &src[size]
170 C First multiply src[0]*src[1..size-1] and store at dst[1..size].
172 movl (%eax), %ebp C src[0], multiplier
176 xorl %ebx, %ebx C clear carry limb
178 incl %ecx C -(size-1)
183 C ecx counter, limbs, negative
189 movl (%esi,%ecx,4), %eax
193 movl %ebx, (%edi,%ecx,4)
201 C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
204 C The last products src[size-2]*src[size-1], which is the end corner
205 C of the product triangle, is handled separately at the end to save
206 C looping overhead. If size is 3 then it's only
this that needs to
209 C
In the outer
loop %esi is a constant
, and %edi just advances by
1
210 C limb each time. The
size of the operation decreases by
1 limb
214 C
ebx carry
(needing carry flag added
)
221 movl PARAM_SIZE
, %ecx
227 dnl re
-use parameter space
228 define
(VAR_OUTER
,`PARAM_DST
')
234 C edx outer loop counter, -(size-3) to -1
236 C edi dst, pointing at stored carry limb of previous loop
240 addl $4, %edi C advance dst end
242 movl -8(%esi,%ecx,4), %ebp C next multiplier
245 xorl %ebx, %ebx C initial carry limb
249 C ebx carry (needing carry flag added)
250 C ecx counter, -n-1 to -1
253 C edi dst end of this addmul
256 movl (%esi,%ecx,4), %eax
260 addl %eax, (%edi,%ecx,4)
278 mull -8(%esi) C src[size-1]*src[size-2]
281 movl %edx, 4(%edi) C dst high limb
284 C -----------------------------------------------------------------------------
285 C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
287 movl PARAM_SIZE, %eax
289 addl $1, %eax C -(size-1) and clear carry
292 C eax counter, negative
306 adcl %eax, %eax C high bit out
307 movl %eax, 8(%edi) C dst most significant limb
310 C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
311 C src[size-1]^2. dst[0] hasn't yet been set at all yet
, and just gets the
312 C
low limb of src
[0]^
2.
315 movl
(%esi), %eax C src
[0]
318 movl PARAM_SIZE
, %ecx
319 leal
(%esi,%ecx,4), %esi C src
end
322 movl
%edx, %ebx C initial carry
324 movl
%eax, 12(%edi,%ecx,8) C dst
[0]
325 incl
%ecx C
-(size-1)
328 C
eax scratch
(low product
)
330 C
ecx counter
, -(size-1) to
-1
331 C
edx scratch
(high product
)
334 C
ebp scratch
(fetched dst limbs
)
336 movl
(%esi,%ecx,4), %eax
339 addl
%ebx, 8(%edi,%ecx,8)
342 adcl
%eax, 12(%edi,%ecx,8)
349 addl
%ebx, 8(%edi) C dst most significant limb