beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / mul_basecase.asm
blob8339732a80c0d6629758b8aec2804ddd7cc8a59b
1 dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
2 dnl in a third limb vector.
4 dnl Copyright 1996-2002 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
35 C cycles/crossproduct
36 C P5 15
37 C P6 7.5
38 C K6 12.5
39 C K7 5.5
40 C P4 24
43 C void mpn_mul_basecase (mp_ptr wp,
44 C mp_srcptr xp, mp_size_t xsize,
45 C mp_srcptr yp, mp_size_t ysize);
47 C This was written in haste, since the Pentium-optimized code that was used
48 C for all x86 machines was slow on the Pentium II. This code would benefit
49 C from some cleanup.
51 C To shave off some percentage of the run-time, one should make 4 variants
52 C of the L(outer) loop, one for each possible value of xsize mod 4. That
53 C would avoid L(oop0) altogether. Code expansion would be > 4-fold for that
54 C part of the function, but since it is not very large, that would be
55 C acceptable.
57 C The mul loop (at L(oopM)) might need some tweaking. Its current speed is
58 C unknown.
C Stack frame references used by mpn_mul_basecase.  Each name is used below
C as a memory operand; the defframe/deflit macros themselves come from the
C included m4 definitions, not from this file.

C Incoming arguments, in ascending stack order (wp is the first argument).
defframe(PARAM_WP,    4)
defframe(PARAM_XP,    8)
defframe(PARAM_XSIZE,12)
defframe(PARAM_YP,   16)
defframe(PARAM_YSIZE,20)

C Local variables.  The function prologue reserves VAR_STACK_SPACE bytes
C for them with a single subl from the stack pointer.
deflit(VAR_STACK_SPACE,  8)
defframe(VAR_COUNTER,   -8)
defframe(VAR_MULTIPLIER,-4)
C -----------------------------------------------------------------------------
C void mpn_mul_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xsize,
C                        mp_srcptr yp, mp_size_t ysize);
C
C Schoolbook multiplication.  The first pass stores xp[] * yp[0] into wp[];
C every further limb yp[i] is then multiplied by xp[] and accumulated into
C wp[], one limb higher than the previous pass.
C
C Arguments are read from the stack through the PARAM_* frame references.
C The code reads xp[0] and yp[0] unconditionally and treats xsize == 1 as
C implying ysize == 1, so it relies on xsize >= ysize >= 1.
C
C Register use:
C   esi        walks xp
C   edi        walks wp
C   ebp        yp pointer; reused as a second carry limb inside L(oopX)
C   ebx        carry limb ("cylimb") handed from one product to the next
C   ecx        inner loop counter
C   eax, edx   mull operand and double-limb result
C
C esi, ebp and edi are saved on entry, ebx only on the multi-limb path; all
C are restored before returning.  The PARAM_YP stack slot is overwritten as
C yp advances, and VAR_MULTIPLIER / VAR_COUNTER live in the area reserved by
C the initial subl.
C -----------------------------------------------------------------------------

	TEXT
	ALIGN(8)

PROLOGUE(mpn_mul_basecase)
deflit(`FRAME',0)

	subl	$VAR_STACK_SPACE,%esp	C reserve the two local slots
	pushl	%esi
	pushl	%ebp
	pushl	%edi
deflit(`FRAME',eval(VAR_STACK_SPACE+12))

	movl	PARAM_XP,%esi
	movl	PARAM_WP,%edi
	movl	PARAM_YP,%ebp

	movl	(%esi),%eax		C load xp[0]
	mull	(%ebp)			C multiply by yp[0]
	movl	%eax,(%edi)		C store to wp[0]
	movl	PARAM_XSIZE,%ecx	C xsize
	decl	%ecx			C If xsize = 1, ysize = 1 too
	jz	L(done)			C single-limb product, edx still holds the high limb

	pushl	%ebx
FRAME_pushl()
	movl	%edx,%ebx		C high limb of xp[0]*yp[0] is the first carry

	leal	4(%esi),%esi		C xp[1]
	leal	4(%edi),%edi		C wp[1]

C First pass: wp[j] = low (xp[j] * yp[0]) + carry, for the remaining
C xsize-1 limbs.  ebp still points at yp[0].
L(oopM):
	movl	(%esi),%eax		C load next limb at xp[j]
	leal	4(%esi),%esi
	mull	(%ebp)
	addl	%ebx,%eax		C add carry from the previous limb
	movl	%edx,%ebx
	adcl	$0,%ebx			C new carry = high limb + carry out
	movl	%eax,(%edi)
	leal	4(%edi),%edi
	decl	%ecx
	jnz	L(oopM)

	movl	%ebx,(%edi)		C most significant limb of product
	addl	$4,%edi			C increment wp
	movl	PARAM_XSIZE,%eax
	shll	$2,%eax			C xsize in bytes
	subl	%eax,%edi		C edi = &wp[1], start of the next row
	subl	%eax,%esi		C esi = xp again

	movl	PARAM_YSIZE,%eax	C ysize
	decl	%eax
	jz	L(skip)			C ysize == 1: nothing left to accumulate
	movl	%eax,VAR_COUNTER	C ysize-1 passes remain

C One pass per remaining y limb: wp[] += xp[] * yp[i].
L(outer):
	movl	PARAM_YP,%ebp		C yp (ebp is clobbered by L(oopX))
	addl	$4,%ebp			C make ebp point to next y limb
	movl	%ebp,PARAM_YP		C remember the advanced pointer
	movl	(%ebp),%eax		C copy y limb ...
	movl	%eax,VAR_MULTIPLIER	C ... to stack slot
	movl	PARAM_XSIZE,%ecx

	xorl	%ebx,%ebx		C no carry into the first limb
	andl	$3,%ecx			C xsize mod 4 limbs go through L(oop0)
	jz	L(end0)

C Single-limb steps for the xsize mod 4 leading limbs.
L(oop0):
	movl	(%esi),%eax
	mull	VAR_MULTIPLIER
	leal	4(%esi),%esi
	addl	%ebx,%eax		C low limb + carry in
	movl	$0,%ebx			C movl, not xorl: the carry flag must survive
	adcl	%ebx,%edx		C fold that carry into the high limb
	addl	%eax,(%edi)		C accumulate into wp
	adcl	%edx,%ebx		C propagate carry into cylimb

	leal	4(%edi),%edi
	decl	%ecx
	jnz	L(oop0)

L(end0):
	movl	PARAM_XSIZE,%ecx
	shrl	$2,%ecx			C number of 4-limb groups
	jz	L(endX)

C Four limbs per iteration.  ebx and ebp alternate as the carry limb so
C that each product can be added while the previous sum is being stored.
	ALIGN(8)
L(oopX):
	movl	(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%eax,%ebx		C ebx = low limb 0 + carry in
	movl	$0,%ebp			C flag-preserving zero
	adcl	%edx,%ebp		C ebp = high limb 0 + carry

	movl	4(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%ebx,(%edi)		C wp[0] += sum for limb 0
	adcl	%eax,%ebp		C new lo + cylimb
	movl	$0,%ebx
	adcl	%edx,%ebx

	movl	8(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%ebp,4(%edi)		C wp[1] += sum for limb 1
	adcl	%eax,%ebx		C new lo + cylimb
	movl	$0,%ebp
	adcl	%edx,%ebp

	movl	12(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%ebx,8(%edi)		C wp[2] += sum for limb 2
	adcl	%eax,%ebp		C new lo + cylimb
	movl	$0,%ebx
	adcl	%edx,%ebx

	addl	%ebp,12(%edi)		C wp[3] += sum for limb 3
	adcl	$0,%ebx			C propagate carry into cylimb

	leal	16(%esi),%esi
	leal	16(%edi),%edi
	decl	%ecx
	jnz	L(oopX)

L(endX):
	movl	%ebx,(%edi)		C final carry becomes the new top limb
	addl	$4,%edi

	C we incremented wp and xp in the loop above; compensate
	movl	PARAM_XSIZE,%eax
	shll	$2,%eax
	subl	%eax,%edi		C net effect: wp advanced by one limb
	subl	%eax,%esi		C xp back at its start

	movl	VAR_COUNTER,%eax
	decl	%eax
	movl	%eax,VAR_COUNTER
	jnz	L(outer)

C Multi-limb exit: ebx was pushed on this path, so it is popped first.
L(skip):
	popl	%ebx
	popl	%edi
	popl	%ebp
	popl	%esi
	addl	$VAR_STACK_SPACE,%esp
	ret				C must not fall through into L(done)

C xsize == 1 exit: ebx was never pushed here, edi still points at wp[0]
C and edx holds the high limb of xp[0]*yp[0].
L(done):
	movl	%edx,4(%edi)		C store to wp[1]
	popl	%edi
	popl	%ebp
	popl	%esi
	addl	$VAR_STACK_SPACE,%esp
	ret

EPILOGUE()