1 dnl x86 mpn_mul_basecase
-- Multiply two limb vectors
and store the result
2 dnl
in a third limb vector.
4 dnl Copyright
1996-2002 Free Software Foundation
, Inc.
6 dnl
This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
9 dnl it under the terms of
either:
11 dnl
* the GNU Lesser General
Public License as published by the Free
12 dnl Software Foundation
; either version 3 of the License, or (at your
13 dnl option
) any later version.
17 dnl
* the GNU General
Public License as published by the Free Software
18 dnl Foundation
; either version 2 of the License, or (at your option) any
21 dnl
or both
in parallel
, as here.
23 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
24 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
25 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
28 dnl You should have received copies of the GNU General
Public License
and the
29 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
30 dnl see
https://www.gnu.
org/licenses
/.
32 include(`..
/config.m4
')
43 C void mpn_mul_basecase (mp_ptr wp,
44 C mp_srcptr xp, mp_size_t xsize,
45 C mp_srcptr yp, mp_size_t ysize);
47 C This was written in haste since the Pentium-optimized code that was used
48 C for all x86 machines was slow for the Pentium II. This code would benefit
51 C To shave off some percentage of the run-time, one should make 4 variants
52 C of the Louter loop, for the four different outcomes of un mod 4. That
53 C would avoid Loop0 altogether. Code expansion would be > 4-fold for that
54 C part of the function, but since it is not very large, that would be
57 C The mul loop (at L(oopM)) might need some tweaking. Its current speed is
C Symbolic stack-frame names used by the code below.  The PARAM_ names are
C given positive offsets (20, 16, 12) and the VAR_ names negative offsets
C (-4, -8).  VAR_STACK_SPACE (8) is the amount subtracted from %esp on
C entry, which matches the two 4-byte VAR_ slots.  VAR_MULTIPLIER receives
C the current y limb and VAR_COUNTER the outer-loop index (see the stores
C further down).
C NOTE(review): defframe and deflit are defined outside this file;
C presumably the offsets are relative to the stack pointer at function
C entry and are adjusted via FRAME -- confirm against the macro definitions.
60 defframe
(PARAM_YSIZE
,20)
61 defframe
(PARAM_YP
, 16)
62 defframe
(PARAM_XSIZE
,12)
66 defframe
(VAR_MULTIPLIER
, -4)
67 defframe
(VAR_COUNTER
, -8)
68 deflit
(VAR_STACK_SPACE
, 8)
73 PROLOGUE
(mpn_mul_basecase
)
76 subl $VAR_STACK_SPACE,%esp
80 deflit(`FRAME',eval
(VAR_STACK_SPACE
+12))
86 movl
(%esi),%eax C load xp
[0]
87 mull
(%ebp) C multiply by yp
[0]
88 movl
%eax,(%edi) C store to wp
[0]
89 movl PARAM_XSIZE
,%ecx C xsize
90 decl
%ecx C If xsize
= 1, ysize
= 1 too
101 movl
(%esi),%eax C load next limb at xp
[j
]
112 movl
%ebx,(%edi) C most significant limb of product
113 addl
$4,%edi C increment wp
114 movl PARAM_XSIZE
,%eax
119 movl PARAM_YSIZE
,%eax C ysize
122 movl
%eax,VAR_COUNTER C set index i to ysize
125 movl PARAM_YP
,%ebp C yp
126 addl
$4,%ebp C make
ebp point to next y limb
128 movl
(%ebp),%eax C copy y limb ...
129 movl
%eax,VAR_MULTIPLIER C ... to stack slot
130 movl PARAM_XSIZE
,%ecx
144 adcl
%edx,%ebx C propagate carry
into cylimb
151 movl PARAM_XSIZE
,%ecx
166 adcl
%eax,%ebp C new lo
+ cylimb
173 adcl
%eax,%ebx C new lo
+ cylimb
180 adcl
%eax,%ebp C new lo
+ cylimb
185 adcl
$0,%ebx C propagate carry
into cylimb
196 C we incremented wp
and xp
in the
loop above
; compensate
197 movl PARAM_XSIZE
,%eax
202 movl VAR_COUNTER
,%eax
204 movl
%eax,VAR_COUNTER
216 movl
%edx,4(%edi) C store to wp
[1]