beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium4 / sse2 / mul_basecase.asm
blob6e3775ae096f655a036b67cd4fad7e1035d18e18
1 dnl mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
3 dnl Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C TODO:
34 C * Improve ad-hoc outer loop code and register handling. Some feed-in
35 C scheduling could improve things by several cycles per outer iteration.
36 C * In code for un <= 3, try keeping accumulation operands in registers,
37 C without storing intermediates to rp.
38 C * We might want to keep 32 in a free mm register, since the register form is
39 C 3 bytes and the immediate form is 4 bytes. About 70 bytes to save.
40 C * Look into different loop alignment, we now expand the code about 50 bytes
41 C with possibly needless alignment.
42 C * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry.
43 C * Use OSP, should solve feed-in latency problems.
44 C * Save a few tens of bytes by doing cross-jumping for Loel0, etc.
45 C * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers
46 C so that they can share feed-in code, and changing the branch targets from
47 C L<n> to Lm<nn>.
49 C cycles/limb
50 C P6 model 9 (Banias) ?
51 C P6 model 13 (Dothan) 5.24
52 C P6 model 14 (Yonah) ?
53 C P4 model 0-1 (Willamette): 5
54 C P4 model 2 (Northwood): 4.60 at 32 limbs
55 C P4 model 3-4 (Prescott): 4.94 at 32 limbs
57 C INPUT PARAMETERS
58 C rp sp + 4
59 C up sp + 8
60 C un sp + 12
61 C vp sp + 16
62 C vn sp + 20
C mpn_mul_basecase: entry, argument loading and dispatch on un.
C
C The five arguments rp, up, un, vp, vn are read from the stack.  The
C offsets used below are 8 larger than the "sp + n" values listed under
C INPUT PARAMETERS because esi and ebx are pushed first.
C Register roles from here on:
C   edx = rp   eax = up   ecx = un   esi = vp   ebx = vn   mm7 = vp[0]
C Dispatch: un > 3 goes to L(big), un = 3 to L(un3), un = 2 to L(un2),
C and everything else falls into L(un1).
64 TEXT
65 ALIGN(16)
66 PROLOGUE(mpn_mul_basecase)
67 push %esi
68 push %ebx
69 mov 12(%esp), %edx C rp
70 mov 16(%esp), %eax C up
71 mov 20(%esp), %ecx C un
72 mov 24(%esp), %esi C vp
73 mov 28(%esp), %ebx C vn
74 movd (%esi), %mm7 C mm7 = vp[0], multiplier limb of the first pass
75 L(ent): cmp $3, %ecx
76 ja L(big)
C un <= 3: start the first product early, mm6 = up[0] * vp[0].  The jz
C that follows still tests the flags of the cmp against 3 above; only the
C two MMX instructions sit in between.
77 movd (%eax), %mm6
78 pmuludq %mm7, %mm6
79 jz L(un3)
80 cmp $2, %ecx
81 jz L(un2)
C Remaining case, handled as un = 1: the 64-bit product in mm6 is the
C whole result.  Its low half goes to rp[0] and its high half to rp[1].
83 L(un1): movd %mm6, (%edx) C un=1
84 psrlq $32, %mm6 C un=1
85 movd %mm6, 4(%edx) C un=1
86 jmp L(rtr) C un=1
C un = 2.  On entry mm6 = up[0] * vp[0] and mm7 = vp[0].
C Pass 1 writes up[0..1] * vp[0] to rp[0..2].  mm6 is the running sum: its
C low half is stored and it is then shifted right by 32 so that the upper
C half carries into the next limb.
88 L(un2): movd 4(%eax), %mm1 C un=2
89 pmuludq %mm7, %mm1 C un=2
90 movd %mm6, (%edx) C un=2
91 psrlq $32, %mm6 C un=2
92 paddq %mm1, %mm6 C un=2
93 movd %mm6, 4(%edx) C un=2
94 psrlq $32, %mm6 C un=2
95 movd %mm6, 8(%edx) C un=2
C One vp limb has been consumed; leave if that was the only one.
96 dec %ebx C un=2
97 jz L(rtr) C un=2
C Pass 2: add up[0..1] * vp[1] into rp[1..2] and write the top limb to
C rp[3].  mm4 and mm5 are loaded with the rp[1] and rp[2] values stored by
C pass 1 and are summed with the new products.
98 movd 4(%esi), %mm7 C un=2
99 movd (%eax), %mm6 C un=2
100 pmuludq %mm7, %mm6 C un=2
101 movd 4(%eax), %mm1 C un=2
102 movd 4(%edx), %mm4 C un=2
103 pmuludq %mm7, %mm1 C un=2
104 movd 8(%edx), %mm5 C un=2
105 paddq %mm4, %mm6 C un=2
106 paddq %mm1, %mm5 C un=2
107 movd %mm6, 4(%edx) C un=2
108 psrlq $32, %mm6 C un=2
109 paddq %mm5, %mm6 C un=2
110 movd %mm6, 8(%edx) C un=2
111 psrlq $32, %mm6 C un=2
112 movd %mm6, 12(%edx) C un=2
C Execution continues into the L(rtr) exit sequence that follows.
C Common exit for the un <= 3 paths (reached by jump from L(un1), L(un2)
C and L(un3), and by fall-through from the end of L(un2)).
C emms clears the MMX state, and ebx and esi are popped in the reverse of
C the order in which they were pushed at entry.
C The ret is required: without it execution would run on into the L(un3)
C code placed directly after this sequence instead of returning to the
C caller.
113 L(rtr): emms
114 pop %ebx
115 pop %esi
116 ret
C un = 3.  On entry mm6 = up[0] * vp[0] and mm7 = vp[0].
C Up to three straight-line passes, one per vp limb; ebx (vn) is
C decremented after each of the first two and the code leaves through
C L(rtr) as soon as it reaches zero.
C In every pass mm6 is the running sum: its low half is stored to rp and
C it is then shifted right by 32 so the upper half carries onward.
C
C Pass 1: rp[0..3] = up[0..2] * vp[0].
118 L(un3): movd 4(%eax), %mm1 C un=3
119 pmuludq %mm7, %mm1 C un=3
120 movd 8(%eax), %mm2 C un=3
121 pmuludq %mm7, %mm2 C un=3
122 movd %mm6, (%edx) C un=3
123 psrlq $32, %mm6 C un=3
124 paddq %mm1, %mm6 C un=3
125 movd %mm6, 4(%edx) C un=3
126 psrlq $32, %mm6 C un=3
127 paddq %mm2, %mm6 C un=3
128 movd %mm6, 8(%edx) C un=3
129 psrlq $32, %mm6 C un=3
130 movd %mm6, 12(%edx) C un=3
131 dec %ebx C un=3
132 jz L(rtr) C un=3
C Pass 2: add up[0..2] * vp[1] into rp[1..3], top limb written to rp[4].
C mm4 and mm5 are used to fetch the rp limbs stored by the previous pass.
133 movd 4(%esi), %mm7 C un=3
134 movd (%eax), %mm6 C un=3
135 pmuludq %mm7, %mm6 C un=3
136 movd 4(%eax), %mm1 C un=3
137 movd 4(%edx), %mm4 C un=3
138 pmuludq %mm7, %mm1 C un=3
139 movd 8(%eax), %mm2 C un=3
140 movd 8(%edx), %mm5 C un=3
141 pmuludq %mm7, %mm2 C un=3
142 paddq %mm4, %mm6 C un=3
143 paddq %mm1, %mm5 C un=3
144 movd 12(%edx), %mm4 C un=3
145 movd %mm6, 4(%edx) C un=3
146 psrlq $32, %mm6 C un=3
147 paddq %mm5, %mm6 C un=3
148 paddq %mm2, %mm4 C un=3
149 movd %mm6, 8(%edx) C un=3
150 psrlq $32, %mm6 C un=3
151 paddq %mm4, %mm6 C un=3
152 movd %mm6, 12(%edx) C un=3
153 psrlq $32, %mm6 C un=3
154 movd %mm6, 16(%edx) C un=3
155 dec %ebx C un=3
156 jz L(rtr) C un=3
C Pass 3: add up[0..2] * vp[2] into rp[2..4], top limb written to rp[5].
157 movd 8(%esi), %mm7 C un=3
158 movd (%eax), %mm6 C un=3
159 pmuludq %mm7, %mm6 C un=3
160 movd 4(%eax), %mm1 C un=3
161 movd 8(%edx), %mm4 C un=3
162 pmuludq %mm7, %mm1 C un=3
163 movd 8(%eax), %mm2 C un=3
164 movd 12(%edx), %mm5 C un=3
165 pmuludq %mm7, %mm2 C un=3
166 paddq %mm4, %mm6 C un=3
167 paddq %mm1, %mm5 C un=3
168 movd 16(%edx), %mm4 C un=3
169 movd %mm6, 8(%edx) C un=3
170 psrlq $32, %mm6 C un=3
171 paddq %mm5, %mm6 C un=3
172 paddq %mm2, %mm4 C un=3
173 movd %mm6, 12(%edx) C un=3
174 psrlq $32, %mm6 C un=3
175 paddq %mm4, %mm6 C un=3
176 movd %mm6, 16(%edx) C un=3
177 psrlq $32, %mm6 C un=3
178 movd %mm6, 20(%edx) C un=3
179 jmp L(rtr)
C General case, un > 3.
C edi is saved because the per-case code uses it as the rp base of the
C current pass.  With this third push the stack arguments sit at
C 16(%esp) = rp, 20(%esp) = up and 24(%esp) = un.
C mm6, the running carry, is cleared and esi is stepped past vp[0], which
C is already in mm7.  ecx is then reduced to un mod 4 and selects one of
C the four unrolled code paths L(0), L(1), L(2), L(3).
182 L(big): push %edi
183 pxor %mm6, %mm6
184 lea 4(%esi), %esi
185 and $3, %ecx
186 jz L(0)
C ecx is 1, 2 or 3 here: below 2 selects L(1), equal selects L(2).
187 cmp $2, %ecx
188 jc L(1)
189 jz L(2)
190 jmp L(3) C FIXME: one case should fall through
C Code path for un mod 4 = 0.
C
C First pass, L(0) up to L(x0): stores up[] * vp[0] to the un+1 limbs
C rp[0] through rp[un].  The loop L(lpm0) handles four limbs per iteration
C and is entered in the middle, at L(m00).
C Outer loop, L(olp0): for each remaining vp limb the rp base in edi is
C advanced by one limb and up[] times that limb is added into rp by the
C loop L(lam0), entered at L(am00).
C
C mm6 is the running sum: its low half is stored to rp and it is then
C shifted right by 32 so the upper half carries into the next limb.
C ecx is a negative limb count, (un mod 4) - un, stepped by 4 up to zero.
C It is written back over the un stack slot so that every outer-loop pass
C can reload it.
193 L(0): movd (%eax), %mm3 C m 0
194 sub 24(%esp), %ecx C inner loop count m 0
195 mov %ecx, 24(%esp) C update loop count for later m 0
196 pmuludq %mm7, %mm3 C m 0
197 movd 4(%eax), %mm0 C m 0
198 pmuludq %mm7, %mm0 C m 0
199 movd 8(%eax), %mm1 C m 0
200 jmp L(m00) C m 0
201 ALIGN(16) C m 0
C First-pass inner loop: four product limbs stored per iteration.
202 L(lpm0):
203 pmuludq %mm7, %mm4 C m 0
204 paddq %mm0, %mm6 C m 0
205 movd (%eax), %mm3 C m 0
206 movd %mm6, -12(%edx) C m 0
207 psrlq $32, %mm6 C m 0
208 pmuludq %mm7, %mm3 C m 0
209 paddq %mm1, %mm6 C m 0
210 movd 4(%eax), %mm0 C m 0
211 movd %mm6, -8(%edx) C m 0
212 psrlq $32, %mm6 C m 0
213 pmuludq %mm7, %mm0 C m 0
214 paddq %mm4, %mm6 C m 0
215 movd 8(%eax), %mm1 C m 0
216 movd %mm6, -4(%edx) C m 0
217 psrlq $32, %mm6 C m 0
218 L(m00): pmuludq %mm7, %mm1 C m 0
219 paddq %mm3, %mm6 C m 0
220 movd 12(%eax), %mm4 C m 0
221 movd %mm6, (%edx) C m 0
222 psrlq $32, %mm6 C m 0
223 lea 16(%eax), %eax C m 0
224 lea 16(%edx), %edx C m 0
225 add $4, %ecx C m 0
226 ja L(lpm0) C m 0
C Loop finished: complete the products still in flight, set edi to the
C original rp and join the shared final stores at L(x0).
227 pmuludq %mm7, %mm4 C m 0
228 paddq %mm0, %mm6 C m 0
229 movd %mm6, -12(%edx) C m 0
230 psrlq $32, %mm6 C m 0
231 paddq %mm1, %mm6 C m 0
232 mov 16(%esp), %edi C rp 0
233 jmp L(x0)
C Outer loop: one multiply-accumulate pass per remaining vp limb.
C Reloads up and the loop count from the stack and clears the carry.
235 L(olp0):
236 lea 4(%edi), %edi C am 0
237 movd (%esi), %mm7 C am 0
238 lea 4(%esi), %esi C am 0
239 mov %edi, %edx C rp am 0
240 mov 20(%esp), %eax C up am 0
241 movd (%eax), %mm3 C am 0
242 mov 24(%esp), %ecx C inner loop count am 0
243 pxor %mm6, %mm6 C am 0
244 pmuludq %mm7, %mm3 C am 0
245 movd 4(%eax), %mm0 C am 0
246 movd (%edx), %mm5 C am 0
247 pmuludq %mm7, %mm0 C am 0
248 movd 8(%eax), %mm1 C am 0
249 paddq %mm3, %mm5 C am 0
250 movd 4(%edx), %mm4 C am 0
251 jmp L(am00) C am 0
252 ALIGN(16) C am 0
C Multiply-accumulate inner loop: mm4 and mm5 alternately fetch an rp limb
C and are summed with a product before being added into mm6.
253 L(lam0):
254 pmuludq %mm7, %mm2 C am 0
255 paddq %mm4, %mm6 C am 0
256 movd (%eax), %mm3 C am 0
257 paddq %mm1, %mm5 C am 0
258 movd -4(%edx), %mm4 C am 0
259 movd %mm6, -12(%edx) C am 0
260 psrlq $32, %mm6 C am 0
261 pmuludq %mm7, %mm3 C am 0
262 paddq %mm5, %mm6 C am 0
263 movd 4(%eax), %mm0 C am 0
264 paddq %mm2, %mm4 C am 0
265 movd (%edx), %mm5 C am 0
266 movd %mm6, -8(%edx) C am 0
267 psrlq $32, %mm6 C am 0
268 pmuludq %mm7, %mm0 C am 0
269 paddq %mm4, %mm6 C am 0
270 movd 8(%eax), %mm1 C am 0
271 paddq %mm3, %mm5 C am 0
272 movd 4(%edx), %mm4 C am 0
273 movd %mm6, -4(%edx) C am 0
274 psrlq $32, %mm6 C am 0
275 L(am00):
276 pmuludq %mm7, %mm1 C am 0
277 paddq %mm5, %mm6 C am 0
278 movd 12(%eax), %mm2 C am 0
279 paddq %mm0, %mm4 C am 0
280 movd 8(%edx), %mm5 C am 0
281 movd %mm6, (%edx) C am 0
282 psrlq $32, %mm6 C am 0
283 lea 16(%eax), %eax C am 0
284 lea 16(%edx), %edx C am 0
285 add $4, %ecx C am 0
286 jnz L(lam0) C am 0
C Loop finished: complete the limbs still in flight.
287 pmuludq %mm7, %mm2 C am 0
288 paddq %mm4, %mm6 C am 0
289 paddq %mm1, %mm5 C am 0
290 movd -4(%edx), %mm4 C am 0
291 movd %mm6, -12(%edx) C am 0
292 psrlq $32, %mm6 C am 0
293 paddq %mm5, %mm6 C am 0
294 paddq %mm2, %mm4 C am 0
C Shared tail of the first pass and of every outer-loop pass: store the
C last three limbs, then loop while vp limbs remain in ebx.
295 L(x0): movd %mm6, -8(%edx) C am 0
296 psrlq $32, %mm6 C am 0
297 paddq %mm4, %mm6 C am 0
298 movd %mm6, -4(%edx) C am 0
299 psrlq $32, %mm6 C am 0
300 movd %mm6, (%edx) C am 0
301 dec %ebx C am 0
302 jnz L(olp0) C am 0
C All vp limbs consumed: clear the MMX state, restore edi, ebx and esi in
C the reverse of their push order, and return.
303 L(oel0):
304 emms C 0
305 pop %edi C 0
306 pop %ebx C 0
307 pop %esi C 0
308 ret C 0
C Code path for un mod 4 = 1.
C
C Same structure as the other three cases, with the pipeline rotated by
C one limb.  First pass, L(1) up to L(x1): stores up[] * vp[0] to the un+1
C limbs rp[0] through rp[un], using loop L(lpm1) entered at L(m01).
C Outer loop, L(olp1): for each remaining vp limb the rp base in edi is
C advanced by one limb and up[] times that limb is added into rp by the
C loop L(lam1), entered at L(am01).
C
C mm6 is the running sum: its low half is stored to rp and it is then
C shifted right by 32.  ecx is the negative count (un mod 4) - un, stepped
C by 4 up to zero and kept in the un stack slot for the outer loop.
311 L(1): movd (%eax), %mm4 C m 1
312 sub 24(%esp), %ecx C m 1
313 mov %ecx, 24(%esp) C update loop count for later m 1
314 pmuludq %mm7, %mm4 C m 1
315 movd 4(%eax), %mm3 C m 1
316 pmuludq %mm7, %mm3 C m 1
317 movd 8(%eax), %mm0 C m 1
318 jmp L(m01) C m 1
319 ALIGN(16) C m 1
C First-pass inner loop: four product limbs stored per iteration.
320 L(lpm1):
321 pmuludq %mm7, %mm4 C m 1
322 paddq %mm0, %mm6 C m 1
323 movd 4(%eax), %mm3 C m 1
324 movd %mm6, -8(%edx) C m 1
325 psrlq $32, %mm6 C m 1
326 pmuludq %mm7, %mm3 C m 1
327 paddq %mm1, %mm6 C m 1
328 movd 8(%eax), %mm0 C m 1
329 movd %mm6, -4(%edx) C m 1
330 psrlq $32, %mm6 C m 1
331 L(m01): pmuludq %mm7, %mm0 C m 1
332 paddq %mm4, %mm6 C m 1
333 movd 12(%eax), %mm1 C m 1
334 movd %mm6, (%edx) C m 1
335 psrlq $32, %mm6 C m 1
336 pmuludq %mm7, %mm1 C m 1
337 paddq %mm3, %mm6 C m 1
338 movd 16(%eax), %mm4 C m 1
339 movd %mm6, 4(%edx) C m 1
340 psrlq $32, %mm6 C m 1
341 lea 16(%eax), %eax C m 1
342 lea 16(%edx), %edx C m 1
343 add $4, %ecx C m 1
344 ja L(lpm1) C m 1
C Loop finished: complete the products still in flight, set edi to the
C original rp and join the shared final stores at L(x1).
345 pmuludq %mm7, %mm4 C m 1
346 paddq %mm0, %mm6 C m 1
347 movd %mm6, -8(%edx) C m 1
348 psrlq $32, %mm6 C m 1
349 paddq %mm1, %mm6 C m 1
350 mov 16(%esp), %edi C rp 1
351 jmp L(x1)
C Outer loop: one multiply-accumulate pass per remaining vp limb.
C Reloads up and the loop count from the stack and clears the carry.
353 L(olp1):
354 lea 4(%edi), %edi C am 1
355 movd (%esi), %mm7 C am 1
356 lea 4(%esi), %esi C am 1
357 mov %edi, %edx C rp am 1
358 mov 20(%esp), %eax C up am 1
359 movd (%eax), %mm2 C am 1
360 mov 24(%esp), %ecx C inner loop count am 1
361 pxor %mm6, %mm6 C am 1
362 pmuludq %mm7, %mm2 C am 1
363 movd 4(%eax), %mm3 C am 1
364 movd (%edx), %mm4 C am 1
365 pmuludq %mm7, %mm3 C am 1
366 movd 8(%eax), %mm0 C am 1
367 paddq %mm2, %mm4 C am 1
368 movd 4(%edx), %mm5 C am 1
369 jmp L(am01) C am 1
370 ALIGN(16) C am 1
C Multiply-accumulate inner loop: mm4 and mm5 alternately fetch an rp limb
C and are summed with a product before being added into mm6.
371 L(lam1):
372 pmuludq %mm7, %mm2 C am 1
373 paddq %mm4, %mm6 C am 1
374 movd 4(%eax), %mm3 C am 1
375 paddq %mm1, %mm5 C am 1
376 movd (%edx), %mm4 C am 1
377 movd %mm6, -8(%edx) C am 1
378 psrlq $32, %mm6 C am 1
379 pmuludq %mm7, %mm3 C am 1
380 paddq %mm5, %mm6 C am 1
381 movd 8(%eax), %mm0 C am 1
382 paddq %mm2, %mm4 C am 1
383 movd 4(%edx), %mm5 C am 1
384 movd %mm6, -4(%edx) C am 1
385 psrlq $32, %mm6 C am 1
386 L(am01):
387 pmuludq %mm7, %mm0 C am 1
388 paddq %mm4, %mm6 C am 1
389 movd 12(%eax), %mm1 C am 1
390 paddq %mm3, %mm5 C am 1
391 movd 8(%edx), %mm4 C am 1
392 movd %mm6, (%edx) C am 1
393 psrlq $32, %mm6 C am 1
394 pmuludq %mm7, %mm1 C am 1
395 paddq %mm5, %mm6 C am 1
396 movd 16(%eax), %mm2 C am 1
397 paddq %mm0, %mm4 C am 1
398 movd 12(%edx), %mm5 C am 1
399 movd %mm6, 4(%edx) C am 1
400 psrlq $32, %mm6 C am 1
401 lea 16(%eax), %eax C am 1
402 lea 16(%edx), %edx C am 1
403 add $4, %ecx C am 1
404 jnz L(lam1) C am 1
C Loop finished: complete the limbs still in flight.
405 pmuludq %mm7, %mm2 C am 1
406 paddq %mm4, %mm6 C am 1
407 paddq %mm1, %mm5 C am 1
408 movd (%edx), %mm4 C am 1
409 movd %mm6, -8(%edx) C am 1
410 psrlq $32, %mm6 C am 1
411 paddq %mm5, %mm6 C am 1
412 paddq %mm2, %mm4 C am 1
C Shared tail of the first pass and of every outer-loop pass: store the
C last three limbs, then loop while vp limbs remain in ebx.
413 L(x1): movd %mm6, -4(%edx) C am 1
414 psrlq $32, %mm6 C am 1
415 paddq %mm4, %mm6 C am 1
416 movd %mm6, (%edx) C am 1
417 psrlq $32, %mm6 C am 1
418 movd %mm6, 4(%edx) C am 1
419 dec %ebx C am 1
420 jnz L(olp1) C am 1
C All vp limbs consumed: clear the MMX state, restore edi, ebx and esi in
C the reverse of their push order, and return.
421 L(oel1):
422 emms C 1
423 pop %edi C 1
424 pop %ebx C 1
425 pop %esi C 1
426 ret C 1
C Code path for un mod 4 = 2.
C
C Same structure as the other three cases, with the pipeline rotated by
C two limbs.  First pass, L(2) up to L(x2): stores up[] * vp[0] to the
C un+1 limbs rp[0] through rp[un], using loop L(lpm2) entered at L(m10).
C Outer loop, L(olp2): for each remaining vp limb the rp base in edi is
C advanced by one limb and up[] times that limb is added into rp by the
C loop L(lam2), entered at L(am10).
C
C mm6 is the running sum: its low half is stored to rp and it is then
C shifted right by 32.  ecx is the negative count (un mod 4) - un, stepped
C by 4 up to zero and kept in the un stack slot for the outer loop.
429 L(2): movd (%eax), %mm1 C m 2
430 sub 24(%esp), %ecx C m 2
431 mov %ecx, 24(%esp) C update loop count for later m 2
432 pmuludq %mm7, %mm1 C m 2
433 movd 4(%eax), %mm4 C m 2
434 pmuludq %mm7, %mm4 C m 2
435 movd 8(%eax), %mm3 C m 2
436 jmp L(m10) C m 2
437 ALIGN(16) C m 2
C First-pass inner loop: four product limbs stored per iteration.
438 L(lpm2):
439 pmuludq %mm7, %mm4 C m 2
440 paddq %mm0, %mm6 C m 2
441 movd 8(%eax), %mm3 C m 2
442 movd %mm6, -4(%edx) C m 2
443 psrlq $32, %mm6 C m 2
444 L(m10): pmuludq %mm7, %mm3 C m 2
445 paddq %mm1, %mm6 C m 2
446 movd 12(%eax), %mm0 C m 2
447 movd %mm6, (%edx) C m 2
448 psrlq $32, %mm6 C m 2
449 pmuludq %mm7, %mm0 C m 2
450 paddq %mm4, %mm6 C m 2
451 movd 16(%eax), %mm1 C m 2
452 movd %mm6, 4(%edx) C m 2
453 psrlq $32, %mm6 C m 2
454 pmuludq %mm7, %mm1 C m 2
455 paddq %mm3, %mm6 C m 2
456 movd 20(%eax), %mm4 C m 2
457 movd %mm6, 8(%edx) C m 2
458 psrlq $32, %mm6 C m 2
459 lea 16(%eax), %eax C m 2
460 lea 16(%edx), %edx C m 2
461 add $4, %ecx C m 2
462 ja L(lpm2) C m 2
C Loop finished: complete the products still in flight, set edi to the
C original rp and join the shared final stores at L(x2).
463 pmuludq %mm7, %mm4 C m 2
464 paddq %mm0, %mm6 C m 2
465 movd %mm6, -4(%edx) C m 2
466 psrlq $32, %mm6 C m 2
467 paddq %mm1, %mm6 C m 2
468 mov 16(%esp), %edi C rp 2
469 jmp L(x2)
C Outer loop: one multiply-accumulate pass per remaining vp limb.
C Reloads up and the loop count from the stack and clears the carry.
471 L(olp2):
472 lea 4(%edi), %edi C am 2
473 movd (%esi), %mm7 C am 2
474 lea 4(%esi), %esi C am 2
475 mov %edi, %edx C rp am 2
476 mov 20(%esp), %eax C up am 2
477 movd (%eax), %mm1 C am 2
478 mov 24(%esp), %ecx C inner loop count am 2
479 pxor %mm6, %mm6 C am 2
480 pmuludq %mm7, %mm1 C am 2
481 movd 4(%eax), %mm2 C am 2
482 movd (%edx), %mm5 C am 2
483 pmuludq %mm7, %mm2 C am 2
484 movd 8(%eax), %mm3 C am 2
485 paddq %mm1, %mm5 C am 2
486 movd 4(%edx), %mm4 C am 2
487 jmp L(am10) C am 2
488 ALIGN(16) C am 2
C Multiply-accumulate inner loop: mm4 and mm5 alternately fetch an rp limb
C and are summed with a product before being added into mm6.
489 L(lam2):
490 pmuludq %mm7, %mm2 C am 2
491 paddq %mm4, %mm6 C am 2
492 movd 8(%eax), %mm3 C am 2
493 paddq %mm1, %mm5 C am 2
494 movd 4(%edx), %mm4 C am 2
495 movd %mm6, -4(%edx) C am 2
496 psrlq $32, %mm6 C am 2
497 L(am10):
498 pmuludq %mm7, %mm3 C am 2
499 paddq %mm5, %mm6 C am 2
500 movd 12(%eax), %mm0 C am 2
501 paddq %mm2, %mm4 C am 2
502 movd 8(%edx), %mm5 C am 2
503 movd %mm6, (%edx) C am 2
504 psrlq $32, %mm6 C am 2
505 pmuludq %mm7, %mm0 C am 2
506 paddq %mm4, %mm6 C am 2
507 movd 16(%eax), %mm1 C am 2
508 paddq %mm3, %mm5 C am 2
509 movd 12(%edx), %mm4 C am 2
510 movd %mm6, 4(%edx) C am 2
511 psrlq $32, %mm6 C am 2
512 pmuludq %mm7, %mm1 C am 2
513 paddq %mm5, %mm6 C am 2
514 movd 20(%eax), %mm2 C am 2
515 paddq %mm0, %mm4 C am 2
516 movd 16(%edx), %mm5 C am 2
517 movd %mm6, 8(%edx) C am 2
518 psrlq $32, %mm6 C am 2
519 lea 16(%eax), %eax C am 2
520 lea 16(%edx), %edx C am 2
521 add $4, %ecx C am 2
522 jnz L(lam2) C am 2
C Loop finished: complete the limbs still in flight.
523 pmuludq %mm7, %mm2 C am 2
524 paddq %mm4, %mm6 C am 2
525 paddq %mm1, %mm5 C am 2
526 movd 4(%edx), %mm4 C am 2
527 movd %mm6, -4(%edx) C am 2
528 psrlq $32, %mm6 C am 2
529 paddq %mm5, %mm6 C am 2
530 paddq %mm2, %mm4 C am 2
C Shared tail of the first pass and of every outer-loop pass: store the
C last three limbs, then loop while vp limbs remain in ebx.
531 L(x2): movd %mm6, (%edx) C am 2
532 psrlq $32, %mm6 C am 2
533 paddq %mm4, %mm6 C am 2
534 movd %mm6, 4(%edx) C am 2
535 psrlq $32, %mm6 C am 2
536 movd %mm6, 8(%edx) C am 2
537 dec %ebx C am 2
538 jnz L(olp2) C am 2
C All vp limbs consumed: clear the MMX state, restore edi, ebx and esi in
C the reverse of their push order, and return.
539 L(oel2):
540 emms C 2
541 pop %edi C 2
542 pop %ebx C 2
543 pop %esi C 2
544 ret C 2
C Code path for un mod 4 = 3.
C
C Same structure as the other three cases, but both loops are entered at
C their top: the jmp before each ALIGN only skips the alignment padding.
C First pass, L(3) up to L(x3): stores up[] * vp[0] to the un+1 limbs
C rp[0] through rp[un], using loop L(lpm3).
C Outer loop, L(olp3): for each remaining vp limb the rp base in edi is
C advanced by one limb and up[] times that limb is added into rp by the
C loop L(lam3).
C
C mm6 is the running sum: its low half is stored to rp and it is then
C shifted right by 32.  ecx is the negative count (un mod 4) - un, stepped
C by 4 up to zero and kept in the un stack slot for the outer loop.
547 L(3): movd (%eax), %mm0 C m 3
548 sub 24(%esp), %ecx C m 3
549 mov %ecx, 24(%esp) C update loop count for later m 3
550 pmuludq %mm7, %mm0 C m 3
551 movd 4(%eax), %mm1 C m 3
552 pmuludq %mm7, %mm1 C m 3
553 movd 8(%eax), %mm4 C m 3
554 jmp L(lpm3) C m 3
555 ALIGN(16) C m 3
C First-pass inner loop: four product limbs stored per iteration.
556 L(lpm3):
557 pmuludq %mm7, %mm4 C m 3
558 paddq %mm0, %mm6 C m 3
559 movd 12(%eax), %mm3 C m 3
560 movd %mm6, (%edx) C m 3
561 psrlq $32, %mm6 C m 3
562 pmuludq %mm7, %mm3 C m 3
563 paddq %mm1, %mm6 C m 3
564 movd 16(%eax), %mm0 C m 3
565 movd %mm6, 4(%edx) C m 3
566 psrlq $32, %mm6 C m 3
567 pmuludq %mm7, %mm0 C m 3
568 paddq %mm4, %mm6 C m 3
569 movd 20(%eax), %mm1 C m 3
570 movd %mm6, 8(%edx) C m 3
571 psrlq $32, %mm6 C m 3
572 pmuludq %mm7, %mm1 C m 3
573 paddq %mm3, %mm6 C m 3
574 movd 24(%eax), %mm4 C m 3
575 movd %mm6, 12(%edx) C m 3
576 psrlq $32, %mm6 C m 3
577 lea 16(%eax), %eax C m 3
578 lea 16(%edx), %edx C m 3
579 add $4, %ecx C m 3
580 ja L(lpm3) C m 3
C Loop finished: complete the products still in flight, set edi to the
C original rp and join the shared final stores at L(x3).
581 pmuludq %mm7, %mm4 C m 3
582 paddq %mm0, %mm6 C m 3
583 movd %mm6, (%edx) C m 3
584 psrlq $32, %mm6 C m 3
585 paddq %mm1, %mm6 C m 3
586 mov 16(%esp), %edi C rp 3
587 jmp L(x3)
C Outer loop: one multiply-accumulate pass per remaining vp limb.
C Reloads up and the loop count from the stack and clears the carry.
589 L(olp3):
590 lea 4(%edi), %edi C am 3
591 movd (%esi), %mm7 C am 3
592 lea 4(%esi), %esi C am 3
593 mov %edi, %edx C rp am 3
594 mov 20(%esp), %eax C up am 3
595 movd (%eax), %mm0 C am 3
596 mov 24(%esp), %ecx C inner loop count am 3
597 pxor %mm6, %mm6 C am 3
598 pmuludq %mm7, %mm0 C am 3
599 movd 4(%eax), %mm1 C am 3
600 movd (%edx), %mm4 C am 3
601 pmuludq %mm7, %mm1 C am 3
602 movd 8(%eax), %mm2 C am 3
603 paddq %mm0, %mm4 C am 3
604 movd 4(%edx), %mm5 C am 3
605 jmp L(lam3) C am 3
606 ALIGN(16) C am 3
C Multiply-accumulate inner loop: mm4 and mm5 alternately fetch an rp limb
C and are summed with a product before being added into mm6.
607 L(lam3):
608 pmuludq %mm7, %mm2 C am 3
609 paddq %mm4, %mm6 C am 3
610 movd 12(%eax), %mm3 C am 3
611 paddq %mm1, %mm5 C am 3
612 movd 8(%edx), %mm4 C am 3
613 movd %mm6, (%edx) C am 3
614 psrlq $32, %mm6 C am 3
615 pmuludq %mm7, %mm3 C am 3
616 paddq %mm5, %mm6 C am 3
617 movd 16(%eax), %mm0 C am 3
618 paddq %mm2, %mm4 C am 3
619 movd 12(%edx), %mm5 C am 3
620 movd %mm6, 4(%edx) C am 3
621 psrlq $32, %mm6 C am 3
622 pmuludq %mm7, %mm0 C am 3
623 paddq %mm4, %mm6 C am 3
624 movd 20(%eax), %mm1 C am 3
625 paddq %mm3, %mm5 C am 3
626 movd 16(%edx), %mm4 C am 3
627 movd %mm6, 8(%edx) C am 3
628 psrlq $32, %mm6 C am 3
629 pmuludq %mm7, %mm1 C am 3
630 paddq %mm5, %mm6 C am 3
631 movd 24(%eax), %mm2 C am 3
632 paddq %mm0, %mm4 C am 3
633 movd 20(%edx), %mm5 C am 3
634 movd %mm6, 12(%edx) C am 3
635 psrlq $32, %mm6 C am 3
636 lea 16(%eax), %eax C am 3
637 lea 16(%edx), %edx C am 3
638 add $4, %ecx C am 3
639 jnz L(lam3) C am 3
C Loop finished: complete the limbs still in flight.
640 pmuludq %mm7, %mm2 C am 3
641 paddq %mm4, %mm6 C am 3
642 paddq %mm1, %mm5 C am 3
643 movd 8(%edx), %mm4 C am 3
644 movd %mm6, (%edx) C am 3
645 psrlq $32, %mm6 C am 3
646 paddq %mm5, %mm6 C am 3
647 paddq %mm2, %mm4 C am 3
C Shared tail of the first pass and of every outer-loop pass: store the
C last three limbs, then loop while vp limbs remain in ebx.
648 L(x3): movd %mm6, 4(%edx) C am 3
649 psrlq $32, %mm6 C am 3
650 paddq %mm4, %mm6 C am 3
651 movd %mm6, 8(%edx) C am 3
652 psrlq $32, %mm6 C am 3
653 movd %mm6, 12(%edx) C am 3
654 dec %ebx C am 3
655 jnz L(olp3) C am 3
C All vp limbs consumed: clear the MMX state, restore edi, ebx and esi in
C the reverse of their push order, and return.
656 L(oel3):
657 emms C 3
658 pop %edi C 3
659 pop %ebx C 3
660 pop %esi C 3
661 ret C 3
662 EPILOGUE()