beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium4 / sse2 / sqr_basecase.asm
blob2dd57d25d9af113662484df6f99de9cc501a932c
1 dnl mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
3 dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C TODO:
34 C * Improve ad-hoc outer loop code and register handling. Some feed-in
35 C scheduling could improve things by several cycles per outer iteration.
36 C * In the Lam3...Lam1 code, keep accumulation operands in registers, without
37 C storing intermediates to rp.
38 C * We might want to keep 32 in a free mm register, since the register form is
39 C 3 bytes and the immediate form is 4 bytes. About 80 bytes to save.
40 C * Look into different loop alignment, we now expand the code about 50 bytes
41 C with possibly needless alignment.
42 C * Use OSP, should solve feed-in latency problems.
43 C * Address relative slowness for un<=3 for Pentium M. The old code is there
44 C considerably faster. (1:20/14, 2:34/32, 3:66/57)
46 C INPUT PARAMETERS
47 C rp sp + 4
48 C up sp + 8
49 C un sp + 12
C Entry point. Loads the three stack arguments into edx (rp), eax (up) and
C ecx (un), then dispatches on un:
C   un < 2  -> L(un1)      un = 2 -> L(un2)
C   un = 3  -> L(un3)      un = 4 -> L(un4)
C   un > 4  -> L(big)
C Both flag tests after each cmp use the same comparison: jc takes the
C "below" case and jz the "equal" case.
51 TEXT
52 ALIGN(16)
53 PROLOGUE(mpn_sqr_basecase)
54 mov 4(%esp), %edx C rp
55 mov 8(%esp), %eax C up
56 mov 12(%esp), %ecx C un
58 cmp $2, %ecx
59 jc L(un1)
60 jz L(un2)
61 cmp $4, %ecx
62 jc L(un3)
63 jz L(un4)
64 jmp L(big)
C un=1: square the single limb with the integer mul and store the 64-bit
C product at rp[0] (low) and rp[1] (high).  rp is moved from edx to ecx
C first because mul writes its high result into edx.
66 L(un1): mov (%eax), %eax
67 mov %edx, %ecx
68 mul %eax
69 mov %eax, (%ecx)
70 mov %edx, 4(%ecx)
71 ret
C un=2: form the three 32x32->64 products up[0]^2 (mm0), up[0]*up[1] (mm2)
C and up[1]^2 (mm1).  The cross product has to be counted twice; it is
C doubled by splitting it into its low 31 bits (masked with mm7 and shifted
C left by one) and its high 33 bits (shifted right by 31), which are added
C to the weight-32 and weight-64 accumulators respectively.  Four result
C limbs are stored at rp[0..3].  emms is executed before returning.
72 L(un2): movd (%eax), %mm0 C un=2
73 movd (%eax), %mm2 C un=2
74 movd 4(%eax), %mm1 C un=2
75 pmuludq %mm0, %mm0 C 64b weight 0 un=2
76 pmuludq %mm1, %mm2 C 64b weight 32 un=2
77 pmuludq %mm1, %mm1 C 64b weight 64 un=2
78 movd %mm0, (%edx) C un=2
79 psrlq $32, %mm0 C 32b weight 32 un=2
80 pcmpeqd %mm7, %mm7 C un=2
81 psrlq $33, %mm7 C 0x000000007FFFFFFF un=2
82 pand %mm2, %mm7 C 31b weight 32 un=2
83 psrlq $31, %mm2 C 33b weight 64 un=2
84 psllq $1, %mm7 C 31b weight 33 un=2
85 paddq %mm7, %mm0 C un=2
86 movd %mm0, 4(%edx) C un=2
87 psrlq $32, %mm0 C un=2
88 paddq %mm2, %mm1 C un=2
89 paddq %mm0, %mm1 C un=2
90 movd %mm1, 8(%edx) C un=2
91 psrlq $32, %mm1 C un=2
92 movd %mm1, 12(%edx) C un=2
93 emms
94 ret
C un=3: first row of cross products.  mm7 = up[0]; up[0]*up[1] and
C up[0]*up[2] are summed with carry and stored as three limbs at rp[1..3].
C rp (edx) and up (eax) are then advanced by one limb and control joins
C L(am1) for the one remaining cross product.
95 L(un3): movd (%eax), %mm7 C un=3
96 movd 4(%eax), %mm6 C un=3
97 pmuludq %mm7, %mm6 C un=3
98 movd 8(%eax), %mm2 C un=3
99 pmuludq %mm7, %mm2 C un=3
100 movd %mm6, 4(%edx) C un=3
101 psrlq $32, %mm6 C un=3
102 paddq %mm2, %mm6 C un=3
103 movd %mm6, 8(%edx) C un=3
104 psrlq $32, %mm6 C un=3
105 movd %mm6, 12(%edx) C un=3
106 lea 4(%edx), %edx C un=3
107 lea 4(%eax), %eax C un=3
108 jmp L(am1)
C un=4: first row of cross products.  mm7 = up[0]; the products of up[0]
C with up[1], up[2] and up[3] are summed with carry (mm6 is the running
C accumulator) and stored as four limbs at rp[1..4].  rp (edx) and up (eax)
C are then advanced by one limb and control joins L(am2) for the remaining
C rows.
109 L(un4): movd (%eax), %mm7 C un=4
110 movd 4(%eax), %mm6 C un=4
111 pmuludq %mm7, %mm6 C un=4
112 movd 8(%eax), %mm0 C un=4
113 pmuludq %mm7, %mm0 C un=4
114 movd 12(%eax), %mm1 C un=4
115 pmuludq %mm7, %mm1 C un=4
116 movd %mm6, 4(%edx) C un=4
117 psrlq $32, %mm6 C un=4
118 paddq %mm0, %mm6 C un=4
119 movd %mm6, 8(%edx) C un=4
120 psrlq $32, %mm6 C un=4
121 paddq %mm1, %mm6 C un=4
122 movd %mm6, 12(%edx) C un=4
123 psrlq $32, %mm6 C un=4
124 movd %mm6, 16(%edx) C un=4
125 lea 4(%edx), %edx C un=4
126 lea 4(%eax), %eax C un=4
127 jmp L(am2)
C un>4: general case setup.  esi, ebx and edi are pushed here and popped
C again after the outer loop.  Register roles from here on:
C   mm7      current multiplier limb (up[0] for the first row)
C   mm6      running carry/accumulator, cleared here
C   esi/edi  row base pointers into up/rp, both set one limb past the start
C   eax/edx  working pointers (up2/rp2) for the row being processed
C   ebx      un-4, used as the loop count
C The first row is entered according to un mod 4:
C   0 -> L(3m)   3 -> L(2m)   1 -> L(0m)   2 -> falls through to L(1m)
129 L(big): push %esi
130 push %ebx
131 push %edi
132 pxor %mm6, %mm6
133 movd (%eax), %mm7 C
134 lea 4(%eax), %esi C init up, up++
135 lea 4(%eax), %eax C up2++ FIXME: should fix offsets
136 lea 4(%edx), %edi C init rp, rp++
137 lea 4(%edx), %edx C rp2++
138 lea -4(%ecx), %ebx C loop count
139 and $3, %ecx
140 jz L(3m)
141 cmp $2, %ecx
142 ja L(2m)
143 jb L(0m)
C First row, entry used when un mod 4 = 2 (fall-through from L(big)).
C Multiplies successive limbs at eax by mm7 and stores the low 32 bits of
C the running sum at edx, keeping the carry in mm6 (plain multiply, nothing
C is read back from rp).  The loop is unrolled four ways with the products
C rotating through mm4, mm3, mm0, mm1; ecx starts at ebx and counts down by
C 4.  The entry code preloads three operands and jumps into the middle of
C the loop at L(m01).  On exit the last product is still in mm4 and the
C carry in mm6; the wind-down is shared with the am 1 row via L(0).
145 L(1m):
146 movd (%eax), %mm4 C m 1
147 lea (%ebx), %ecx C inner loop count m 1
148 pmuludq %mm7, %mm4 C m 1
149 movd 4(%eax), %mm3 C m 1
150 pmuludq %mm7, %mm3 C m 1
151 movd 8(%eax), %mm0 C m 1
152 jmp L(m01) C m 1
153 ALIGN(16) C m 1
154 L(lpm1):
155 pmuludq %mm7, %mm4 C m 1
156 paddq %mm0, %mm6 C m 1
157 movd 4(%eax), %mm3 C m 1
158 movd %mm6, -8(%edx) C m 1
159 psrlq $32, %mm6 C m 1
160 pmuludq %mm7, %mm3 C m 1
161 paddq %mm1, %mm6 C m 1
162 movd 8(%eax), %mm0 C m 1
163 movd %mm6, -4(%edx) C m 1
164 psrlq $32, %mm6 C m 1
165 L(m01): pmuludq %mm7, %mm0 C m 1
166 paddq %mm4, %mm6 C m 1
167 movd 12(%eax), %mm1 C m 1
168 movd %mm6, (%edx) C m 1
169 psrlq $32, %mm6 C m 1
170 pmuludq %mm7, %mm1 C m 1
171 paddq %mm3, %mm6 C m 1
172 movd 16(%eax), %mm4 C m 1
173 movd %mm6, 4(%edx) C m 1
174 psrlq $32, %mm6 C m 1
175 lea 16(%eax), %eax C m 1
176 lea 16(%edx), %edx C m 1
177 sub $4, %ecx C m 1
178 ja L(lpm1) C m 1
179 pmuludq %mm7, %mm4 C m 1
180 paddq %mm0, %mm6 C m 1
181 movd %mm6, -8(%edx) C m 1
182 psrlq $32, %mm6 C m 1
183 paddq %mm1, %mm6 C m 1
184 jmp L(0)
C First row, entry used when un mod 4 = 3.  Same scheme as the other first
C row variants: limbs at eax are multiplied by mm7, the low 32 bits of the
C running sum are stored at edx and the carry stays in mm6.  Four-way
C unrolled, ecx starts at ebx and counts down by 4; the entry code preloads
C three operands and jumps into the loop at L(m10).  On exit the last
C product is in mm4 and the carry in mm6; the wind-down is shared with the
C am 2 row via L(1).
186 L(2m):
187 movd (%eax), %mm1 C m 2
188 lea (%ebx), %ecx C inner loop count m 2
189 pmuludq %mm7, %mm1 C m 2
190 movd 4(%eax), %mm4 C m 2
191 pmuludq %mm7, %mm4 C m 2
192 movd 8(%eax), %mm3 C m 2
193 jmp L(m10) C m 2
194 ALIGN(16) C m 2
195 L(lpm2):
196 pmuludq %mm7, %mm4 C m 2
197 paddq %mm0, %mm6 C m 2
198 movd 8(%eax), %mm3 C m 2
199 movd %mm6, -4(%edx) C m 2
200 psrlq $32, %mm6 C m 2
201 L(m10): pmuludq %mm7, %mm3 C m 2
202 paddq %mm1, %mm6 C m 2
203 movd 12(%eax), %mm0 C m 2
204 movd %mm6, (%edx) C m 2
205 psrlq $32, %mm6 C m 2
206 pmuludq %mm7, %mm0 C m 2
207 paddq %mm4, %mm6 C m 2
208 movd 16(%eax), %mm1 C m 2
209 movd %mm6, 4(%edx) C m 2
210 psrlq $32, %mm6 C m 2
211 pmuludq %mm7, %mm1 C m 2
212 paddq %mm3, %mm6 C m 2
213 movd 20(%eax), %mm4 C m 2
214 movd %mm6, 8(%edx) C m 2
215 psrlq $32, %mm6 C m 2
216 lea 16(%eax), %eax C m 2
217 lea 16(%edx), %edx C m 2
218 sub $4, %ecx C m 2
219 ja L(lpm2) C m 2
220 pmuludq %mm7, %mm4 C m 2
221 paddq %mm0, %mm6 C m 2
222 movd %mm6, -4(%edx) C m 2
223 psrlq $32, %mm6 C m 2
224 paddq %mm1, %mm6 C m 2
225 jmp L(1)
C First row, entry used when un mod 4 = 0.  Limbs at eax are multiplied by
C mm7, the low 32 bits of the running sum are stored at edx and the carry
C stays in mm6.  Four-way unrolled, ecx starts at ebx and counts down by 4.
C This variant enters at the top of the loop (the jmp only skips the
C alignment padding).  On exit the last product is in mm4 and the carry in
C mm6; the wind-down is shared with the am 3 row via L(2).
227 L(3m):
228 movd (%eax), %mm0 C m 3
229 lea (%ebx), %ecx C inner loop count m 3
230 pmuludq %mm7, %mm0 C m 3
231 movd 4(%eax), %mm1 C m 3
232 pmuludq %mm7, %mm1 C m 3
233 movd 8(%eax), %mm4 C m 3
234 jmp L(lpm3) C m 3
235 ALIGN(16) C m 3
236 L(lpm3):
237 pmuludq %mm7, %mm4 C m 3
238 paddq %mm0, %mm6 C m 3
239 movd 12(%eax), %mm3 C m 3
240 movd %mm6, (%edx) C m 3
241 psrlq $32, %mm6 C m 3
242 pmuludq %mm7, %mm3 C m 3
243 paddq %mm1, %mm6 C m 3
244 movd 16(%eax), %mm0 C m 3
245 movd %mm6, 4(%edx) C m 3
246 psrlq $32, %mm6 C m 3
247 pmuludq %mm7, %mm0 C m 3
248 paddq %mm4, %mm6 C m 3
249 movd 20(%eax), %mm1 C m 3
250 movd %mm6, 8(%edx) C m 3
251 psrlq $32, %mm6 C m 3
252 pmuludq %mm7, %mm1 C m 3
253 paddq %mm3, %mm6 C m 3
254 movd 24(%eax), %mm4 C m 3
255 movd %mm6, 12(%edx) C m 3
256 psrlq $32, %mm6 C m 3
257 lea 16(%eax), %eax C m 3
258 lea 16(%edx), %edx C m 3
259 sub $4, %ecx C m 3
260 ja L(lpm3) C m 3
261 pmuludq %mm7, %mm4 C m 3
262 paddq %mm0, %mm6 C m 3
263 movd %mm6, (%edx) C m 3
264 psrlq $32, %mm6 C m 3
265 paddq %mm1, %mm6 C m 3
266 jmp L(2)
C First row, entry used when un mod 4 = 1.  Limbs at eax are multiplied by
C mm7, the low 32 bits of the running sum are stored at edx and the carry
C stays in mm6.  Four-way unrolled, ecx starts at ebx and counts down by 4;
C the entry code preloads three operands and jumps into the loop at L(m00),
C so only the last quarter of the first pass is executed.  On exit the last
C product is in mm4 and the carry in mm6; the wind-down is shared with the
C am 0 row via L(3).
268 L(0m):
269 movd (%eax), %mm3 C m 0
270 lea (%ebx), %ecx C inner loop count m 0
271 pmuludq %mm7, %mm3 C m 0
272 movd 4(%eax), %mm0 C m 0
273 pmuludq %mm7, %mm0 C m 0
274 movd 8(%eax), %mm1 C m 0
275 jmp L(m00) C m 0
276 ALIGN(16) C m 0
277 L(lpm0):
278 pmuludq %mm7, %mm4 C m 0
279 paddq %mm0, %mm6 C m 0
280 movd (%eax), %mm3 C m 0
281 movd %mm6, -12(%edx) C m 0
282 psrlq $32, %mm6 C m 0
283 pmuludq %mm7, %mm3 C m 0
284 paddq %mm1, %mm6 C m 0
285 movd 4(%eax), %mm0 C m 0
286 movd %mm6, -8(%edx) C m 0
287 psrlq $32, %mm6 C m 0
288 pmuludq %mm7, %mm0 C m 0
289 paddq %mm4, %mm6 C m 0
290 movd 8(%eax), %mm1 C m 0
291 movd %mm6, -4(%edx) C m 0
292 psrlq $32, %mm6 C m 0
293 L(m00): pmuludq %mm7, %mm1 C m 0
294 paddq %mm3, %mm6 C m 0
295 movd 12(%eax), %mm4 C m 0
296 movd %mm6, (%edx) C m 0
297 psrlq $32, %mm6 C m 0
298 lea 16(%eax), %eax C m 0
299 lea 16(%edx), %edx C m 0
300 sub $4, %ecx C m 0
301 ja L(lpm0) C m 0
302 pmuludq %mm7, %mm4 C m 0
303 paddq %mm0, %mm6 C m 0
304 movd %mm6, -12(%edx) C m 0
305 psrlq $32, %mm6 C m 0
306 paddq %mm1, %mm6 C m 0
307 jmp L(3)
C Outer loop head and the "am 3" row.  One pass of the outer loop runs four
C multiply-accumulate rows in sequence (am 3, am 2, am 1, am 0).  Each row
C advances the rp base (edi) by two limbs and the up base (esi) by one limb,
C takes the limb at the old esi as multiplier mm7, clears the carry mm6, and
C adds mm7 times the limbs at eax into the limbs already stored at edx.
C The inner loop is unrolled four ways: products rotate through mm2, mm3,
C mm0, mm1 and the rp limbs being accumulated alternate between mm4 and
C mm5.  ecx starts at ebx and counts down by 4.
C L(2) is the shared wind-down; the first row variant L(3m) also jumps
C there with its last product in mm4 and carry in mm6.
309 L(outer):
310 lea 8(%edi), %edi C rp += 2
311 movd (%esi), %mm7 C am 3
312 mov %edi, %edx C rp2 = rp am 3
313 lea 4(%esi), %esi C up++ am 3
314 lea (%esi), %eax C up2 = up am 3
315 movd (%eax), %mm0 C am 3
316 lea (%ebx), %ecx C inner loop count am 3
317 pxor %mm6, %mm6 C am 3
318 pmuludq %mm7, %mm0 C am 3
319 movd 4(%eax), %mm1 C am 3
320 movd (%edx), %mm4 C am 3
321 pmuludq %mm7, %mm1 C am 3
322 movd 8(%eax), %mm2 C am 3
323 paddq %mm0, %mm4 C am 3
324 movd 4(%edx), %mm5 C am 3
325 jmp L(lam3) C am 3
326 ALIGN(16) C am 3
327 L(lam3):
328 pmuludq %mm7, %mm2 C am 3
329 paddq %mm4, %mm6 C am 3
330 movd 12(%eax), %mm3 C am 3
331 paddq %mm1, %mm5 C am 3
332 movd 8(%edx), %mm4 C am 3
333 movd %mm6, (%edx) C am 3
334 psrlq $32, %mm6 C am 3
335 pmuludq %mm7, %mm3 C am 3
336 paddq %mm5, %mm6 C am 3
337 movd 16(%eax), %mm0 C am 3
338 paddq %mm2, %mm4 C am 3
339 movd 12(%edx), %mm5 C am 3
340 movd %mm6, 4(%edx) C am 3
341 psrlq $32, %mm6 C am 3
342 pmuludq %mm7, %mm0 C am 3
343 paddq %mm4, %mm6 C am 3
344 movd 20(%eax), %mm1 C am 3
345 paddq %mm3, %mm5 C am 3
346 movd 16(%edx), %mm4 C am 3
347 movd %mm6, 8(%edx) C am 3
348 psrlq $32, %mm6 C am 3
349 pmuludq %mm7, %mm1 C am 3
350 paddq %mm5, %mm6 C am 3
351 movd 24(%eax), %mm2 C am 3
352 paddq %mm0, %mm4 C am 3
353 movd 20(%edx), %mm5 C am 3
354 movd %mm6, 12(%edx) C am 3
355 psrlq $32, %mm6 C am 3
356 lea 16(%eax), %eax C am 3
357 lea 16(%edx), %edx C am 3
358 sub $4, %ecx C am 3
359 ja L(lam3) C am 3
360 pmuludq %mm7, %mm2 C am 3
361 paddq %mm4, %mm6 C am 3
362 paddq %mm1, %mm5 C am 3
363 movd 8(%edx), %mm4 C am 3
364 movd %mm6, (%edx) C am 3
365 psrlq $32, %mm6 C am 3
366 paddq %mm5, %mm6 C am 3
367 paddq %mm2, %mm4 C am 3
368 L(2): movd %mm6, 4(%edx) C am 3
369 psrlq $32, %mm6 C am 3
370 paddq %mm4, %mm6 C am 3
371 movd %mm6, 8(%edx) C am 3
372 psrlq $32, %mm6 C am 3
373 movd %mm6, 12(%edx) C am 3
C "am 2" row, reached by falling through from the am 3 wind-down.  Same
C scheme as the other am rows: edi advances two limbs, esi one limb, mm7 is
C the limb at the old esi, mm6 is cleared, and mm7 times the limbs at eax is
C added into the limbs already stored at edx.  Four-way unrolled with ecx
C starting at ebx and counting down by 4; the entry code preloads operands
C and jumps into the loop at L(am10).
C L(1) is the shared wind-down; the first row variant L(2m) also jumps
C there with its last product in mm4 and carry in mm6.
375 lea 8(%edi), %edi C rp += 2
376 movd (%esi), %mm7 C am 2
377 mov %edi, %edx C rp2 = rp am 2
378 lea 4(%esi), %esi C up++ am 2
379 lea (%esi), %eax C up2 = up am 2
380 movd (%eax), %mm1 C am 2
381 lea (%ebx), %ecx C inner loop count am 2
382 pxor %mm6, %mm6 C am 2
383 pmuludq %mm7, %mm1 C am 2
384 movd 4(%eax), %mm2 C am 2
385 movd (%edx), %mm5 C am 2
386 pmuludq %mm7, %mm2 C am 2
387 movd 8(%eax), %mm3 C am 2
388 paddq %mm1, %mm5 C am 2
389 movd 4(%edx), %mm4 C am 2
390 jmp L(am10) C am 2
391 ALIGN(16) C am 2
392 L(lam2):
393 pmuludq %mm7, %mm2 C am 2
394 paddq %mm4, %mm6 C am 2
395 movd 8(%eax), %mm3 C am 2
396 paddq %mm1, %mm5 C am 2
397 movd 4(%edx), %mm4 C am 2
398 movd %mm6, -4(%edx) C am 2
399 psrlq $32, %mm6 C am 2
400 L(am10):
401 pmuludq %mm7, %mm3 C am 2
402 paddq %mm5, %mm6 C am 2
403 movd 12(%eax), %mm0 C am 2
404 paddq %mm2, %mm4 C am 2
405 movd 8(%edx), %mm5 C am 2
406 movd %mm6, (%edx) C am 2
407 psrlq $32, %mm6 C am 2
408 pmuludq %mm7, %mm0 C am 2
409 paddq %mm4, %mm6 C am 2
410 movd 16(%eax), %mm1 C am 2
411 paddq %mm3, %mm5 C am 2
412 movd 12(%edx), %mm4 C am 2
413 movd %mm6, 4(%edx) C am 2
414 psrlq $32, %mm6 C am 2
415 pmuludq %mm7, %mm1 C am 2
416 paddq %mm5, %mm6 C am 2
417 movd 20(%eax), %mm2 C am 2
418 paddq %mm0, %mm4 C am 2
419 movd 16(%edx), %mm5 C am 2
420 movd %mm6, 8(%edx) C am 2
421 psrlq $32, %mm6 C am 2
422 lea 16(%eax), %eax C am 2
423 lea 16(%edx), %edx C am 2
424 sub $4, %ecx C am 2
425 ja L(lam2) C am 2
426 pmuludq %mm7, %mm2 C am 2
427 paddq %mm4, %mm6 C am 2
428 paddq %mm1, %mm5 C am 2
429 movd 4(%edx), %mm4 C am 2
430 movd %mm6, -4(%edx) C am 2
431 psrlq $32, %mm6 C am 2
432 paddq %mm5, %mm6 C am 2
433 paddq %mm2, %mm4 C am 2
434 L(1): movd %mm6, (%edx) C am 2
435 psrlq $32, %mm6 C am 2
436 paddq %mm4, %mm6 C am 2
437 movd %mm6, 4(%edx) C am 2
438 psrlq $32, %mm6 C am 2
439 movd %mm6, 8(%edx) C am 2
C "am 1" row, reached by falling through from the am 2 wind-down.  Same
C scheme as the other am rows: edi advances two limbs, esi one limb, mm7 is
C the limb at the old esi, mm6 is cleared, and mm7 times the limbs at eax is
C added into the limbs already stored at edx.  Four-way unrolled with ecx
C starting at ebx and counting down by 4; the entry code preloads operands
C and jumps into the middle of the loop at L(am01).
C L(0) is the shared wind-down; the first row variant L(1m) also jumps
C there with its last product in mm4 and carry in mm6.
441 lea 8(%edi), %edi C rp += 2
442 movd (%esi), %mm7 C am 1
443 mov %edi, %edx C rp2 = rp am 1
444 lea 4(%esi), %esi C up++ am 1
445 lea (%esi), %eax C up2 = up am 1
446 movd (%eax), %mm2 C am 1
447 lea (%ebx), %ecx C inner loop count am 1
448 pxor %mm6, %mm6 C am 1
449 pmuludq %mm7, %mm2 C am 1
450 movd 4(%eax), %mm3 C am 1
451 movd (%edx), %mm4 C am 1
452 pmuludq %mm7, %mm3 C am 1
453 movd 8(%eax), %mm0 C am 1
454 paddq %mm2, %mm4 C am 1
455 movd 4(%edx), %mm5 C am 1
456 jmp L(am01) C am 1
457 ALIGN(16) C am 1
458 L(lam1):
459 pmuludq %mm7, %mm2 C am 1
460 paddq %mm4, %mm6 C am 1
461 movd 4(%eax), %mm3 C am 1
462 paddq %mm1, %mm5 C am 1
463 movd (%edx), %mm4 C am 1
464 movd %mm6, -8(%edx) C am 1
465 psrlq $32, %mm6 C am 1
466 pmuludq %mm7, %mm3 C am 1
467 paddq %mm5, %mm6 C am 1
468 movd 8(%eax), %mm0 C am 1
469 paddq %mm2, %mm4 C am 1
470 movd 4(%edx), %mm5 C am 1
471 movd %mm6, -4(%edx) C am 1
472 psrlq $32, %mm6 C am 1
473 L(am01):
474 pmuludq %mm7, %mm0 C am 1
475 paddq %mm4, %mm6 C am 1
476 movd 12(%eax), %mm1 C am 1
477 paddq %mm3, %mm5 C am 1
478 movd 8(%edx), %mm4 C am 1
479 movd %mm6, (%edx) C am 1
480 psrlq $32, %mm6 C am 1
481 pmuludq %mm7, %mm1 C am 1
482 paddq %mm5, %mm6 C am 1
483 movd 16(%eax), %mm2 C am 1
484 paddq %mm0, %mm4 C am 1
485 movd 12(%edx), %mm5 C am 1
486 movd %mm6, 4(%edx) C am 1
487 psrlq $32, %mm6 C am 1
488 lea 16(%eax), %eax C am 1
489 lea 16(%edx), %edx C am 1
490 sub $4, %ecx C am 1
491 ja L(lam1) C am 1
492 pmuludq %mm7, %mm2 C am 1
493 paddq %mm4, %mm6 C am 1
494 paddq %mm1, %mm5 C am 1
495 movd (%edx), %mm4 C am 1
496 movd %mm6, -8(%edx) C am 1
497 psrlq $32, %mm6 C am 1
498 paddq %mm5, %mm6 C am 1
499 paddq %mm2, %mm4 C am 1
500 L(0): movd %mm6, -4(%edx) C am 1
501 psrlq $32, %mm6 C am 1
502 paddq %mm4, %mm6 C am 1
503 movd %mm6, (%edx) C am 1
504 psrlq $32, %mm6 C am 1
505 movd %mm6, 4(%edx) C am 1
C "am 0" row, reached by falling through from the am 1 wind-down, followed
C by the outer loop close and register restore.  Same scheme as the other
C am rows: edi advances two limbs, esi one limb, mm7 is the limb at the old
C esi, mm6 is cleared, and mm7 times the limbs at eax is added into the
C limbs already stored at edx.  Four-way unrolled with ecx starting at ebx
C and counting down by 4; the entry code preloads operands and jumps into
C the loop at L(am00).
C L(3) is the shared wind-down; the first row variant L(0m) also jumps
C there with its last product in mm4 and carry in mm6.
C After the wind-down ebx is reduced by 4 and the outer loop repeats while
C the result is above zero.  On exit the row base pointers are copied to the
C working pointers (edx = edi, eax = esi), the three registers pushed at
C L(big) are popped, and execution falls through into L(am3).
507 lea 8(%edi), %edi C rp += 2
508 movd (%esi), %mm7 C am 0
509 mov %edi, %edx C rp2 = rp am 0
510 lea 4(%esi), %esi C up++ am 0
511 lea (%esi), %eax C up2 = up am 0
512 movd (%eax), %mm3 C am 0
513 lea (%ebx), %ecx C inner loop count am 0
514 pxor %mm6, %mm6 C am 0
515 pmuludq %mm7, %mm3 C am 0
516 movd 4(%eax), %mm0 C am 0
517 movd (%edx), %mm5 C am 0
518 pmuludq %mm7, %mm0 C am 0
519 movd 8(%eax), %mm1 C am 0
520 paddq %mm3, %mm5 C am 0
521 movd 4(%edx), %mm4 C am 0
522 jmp L(am00) C am 0
523 ALIGN(16) C am 0
524 L(lam0):
525 pmuludq %mm7, %mm2 C am 0
526 paddq %mm4, %mm6 C am 0
527 movd (%eax), %mm3 C am 0
528 paddq %mm1, %mm5 C am 0
529 movd -4(%edx), %mm4 C am 0
530 movd %mm6, -12(%edx) C am 0
531 psrlq $32, %mm6 C am 0
532 pmuludq %mm7, %mm3 C am 0
533 paddq %mm5, %mm6 C am 0
534 movd 4(%eax), %mm0 C am 0
535 paddq %mm2, %mm4 C am 0
536 movd (%edx), %mm5 C am 0
537 movd %mm6, -8(%edx) C am 0
538 psrlq $32, %mm6 C am 0
539 pmuludq %mm7, %mm0 C am 0
540 paddq %mm4, %mm6 C am 0
541 movd 8(%eax), %mm1 C am 0
542 paddq %mm3, %mm5 C am 0
543 movd 4(%edx), %mm4 C am 0
544 movd %mm6, -4(%edx) C am 0
545 psrlq $32, %mm6 C am 0
546 L(am00):
547 pmuludq %mm7, %mm1 C am 0
548 paddq %mm5, %mm6 C am 0
549 movd 12(%eax), %mm2 C am 0
550 paddq %mm0, %mm4 C am 0
551 movd 8(%edx), %mm5 C am 0
552 movd %mm6, (%edx) C am 0
553 psrlq $32, %mm6 C am 0
554 lea 16(%eax), %eax C am 0
555 lea 16(%edx), %edx C am 0
556 sub $4, %ecx C am 0
557 ja L(lam0) C am 0
558 pmuludq %mm7, %mm2 C am 0
559 paddq %mm4, %mm6 C am 0
560 paddq %mm1, %mm5 C am 0
561 movd -4(%edx), %mm4 C am 0
562 movd %mm6, -12(%edx) C am 0
563 psrlq $32, %mm6 C am 0
564 paddq %mm5, %mm6 C am 0
565 paddq %mm2, %mm4 C am 0
566 L(3): movd %mm6, -8(%edx) C am 0
567 psrlq $32, %mm6 C am 0
568 paddq %mm4, %mm6 C am 0
569 movd %mm6, -4(%edx) C am 0
570 psrlq $32, %mm6 C am 0
571 movd %mm6, (%edx) C am 0
572 sub $4, %ebx C am 0
573 ja L(outer) C am 0
575 mov %edi, %edx
576 mov %esi, %eax
577 pop %edi
578 pop %ebx
579 pop %esi
C Straight-line multiply-accumulate of three limbs.  edx is advanced two
C limbs, mm7 is the limb at eax, and mm7 times the next three limbs is added
C to the three limbs already at edx with carry propagation; the final carry
C is stored as a fourth limb.  eax is then advanced one limb and execution
C falls through into L(am2).  Reached by fall-through from the register
C restore after the outer loop.
581 L(am3): C up[un-1..un-3] x up[un-4]
582 lea 8(%edx), %edx C rp2 += 2
583 movd (%eax), %mm7
584 movd 4(%eax), %mm1
585 movd 8(%eax), %mm2
586 movd 12(%eax), %mm3
587 movd (%edx), %mm4
588 pmuludq %mm7, %mm1
589 movd 4(%edx), %mm5
590 pmuludq %mm7, %mm2
591 movd 8(%edx), %mm6
592 pmuludq %mm7, %mm3
593 paddq %mm1, %mm4
594 paddq %mm2, %mm5
595 paddq %mm3, %mm6
596 movd %mm4, (%edx)
597 psrlq $32, %mm4
598 paddq %mm5, %mm4
599 movd %mm4, 4(%edx)
600 psrlq $32, %mm4
601 paddq %mm6, %mm4
602 movd %mm4, 8(%edx)
603 psrlq $32, %mm4
604 movd %mm4, 12(%edx) C FIXME feed through!
605 lea 4(%eax), %eax
C Straight-line multiply-accumulate of two limbs.  edx is advanced two
C limbs, mm7 is the limb at eax, and mm7 times the next two limbs is added
C to the two limbs already at edx with carry propagation; the final carry is
C stored as a third limb.  eax is then advanced one limb and execution falls
C through into L(am1).  Entered by fall-through from L(am3) or by the jump
C at the end of L(un4).
607 L(am2): C up[un-1..un-2] x up[un-3]
608 lea 8(%edx), %edx C rp2 += 2
609 movd (%eax), %mm7
610 movd 4(%eax), %mm1
611 movd 8(%eax), %mm2
612 movd (%edx), %mm4
613 movd 4(%edx), %mm5
614 pmuludq %mm7, %mm1
615 pmuludq %mm7, %mm2
616 paddq %mm1, %mm4
617 paddq %mm2, %mm5
618 movd %mm4, (%edx)
619 psrlq $32, %mm4
620 paddq %mm5, %mm4
621 movd %mm4, 4(%edx)
622 psrlq $32, %mm4
623 movd %mm4, 8(%edx) C FIXME feed through!
624 lea 4(%eax), %eax
C Last cross product.  edx is advanced two limbs, the limb at eax is
C multiplied by the following limb, the product is added to the limb already
C at edx, and the carry is stored as the next limb.  Entered by fall-through
C from L(am2) or by the jump at the end of L(un3); execution continues
C straight into the diagonal pass that follows.
626 L(am1): C up[un-1] x up[un-2]
627 lea 8(%edx), %edx C rp2 += 2
628 movd (%eax), %mm7
629 movd 4(%eax), %mm2
630 movd (%edx), %mm4
631 pmuludq %mm7, %mm2
632 paddq %mm2, %mm4
633 movd %mm4, (%edx)
634 psrlq $32, %mm4
635 movd %mm4, 4(%edx)
637 C *** diag stuff, use elementary code for now
C Final pass, reached by fall-through from L(am1) (un >= 3 only; un = 1 and
C un = 2 return from their own code).  It doubles every limb already stored
C in rp[1..] by the cross product rows and adds the diagonal squares
C up[i]^2, propagating carry in mm2.
C The arguments are reloaded from the stack; esp is back at its entry value
C here because the L(big) path has popped its three saved registers and the
C other paths push nothing.
C   mm7  0x00000000FFFFFFFF mask used to split a square into low/high limb
C   mm2  running carry
C   ecx  un-2, number of L(diag) iterations (one per middle source limb)
C Each L(diag) iteration consumes one source limb and produces two result
C limbs; the code after the loop handles the last source limb, whose high
C half only receives the carry.
C The function must end with emms followed by ret; without the ret,
C execution would run past EPILOGUE into whatever follows in the text
C section.
639 mov 4(%esp), %edx C rp
640 mov 8(%esp), %eax C up
641 mov 12(%esp), %ecx C un
643 movd (%eax), %mm2
644 pmuludq %mm2, %mm2 C src[0]^2
646 pcmpeqd %mm7, %mm7
647 psrlq $32, %mm7
649 movd 4(%edx), %mm3 C dst[1]
651 movd %mm2, (%edx)
652 psrlq $32, %mm2
654 psllq $1, %mm3 C 2*dst[1]
655 paddq %mm3, %mm2
656 movd %mm2, 4(%edx)
657 psrlq $32, %mm2
659 sub $2, %ecx
661 L(diag):
662 movd 4(%eax), %mm0 C src limb
663 add $4, %eax
664 pmuludq %mm0, %mm0
665 movq %mm7, %mm1
666 pand %mm0, %mm1 C diagonal low
667 psrlq $32, %mm0 C diagonal high
669 movd 8(%edx), %mm3
670 psllq $1, %mm3 C 2*dst[i]
671 paddq %mm3, %mm1
672 paddq %mm1, %mm2
673 movd %mm2, 8(%edx)
674 psrlq $32, %mm2
676 movd 12(%edx), %mm3
677 psllq $1, %mm3 C 2*dst[i+1]
678 paddq %mm3, %mm0
679 paddq %mm0, %mm2
680 movd %mm2, 12(%edx)
681 add $8, %edx
682 psrlq $32, %mm2
684 sub $1, %ecx
685 jnz L(diag)
687 movd 4(%eax), %mm0 C src[size-1]
688 pmuludq %mm0, %mm0
689 pand %mm0, %mm7 C diagonal low
690 psrlq $32, %mm0 C diagonal high
692 movd 8(%edx), %mm3 C dst[2*size-2]
693 psllq $1, %mm3
694 paddq %mm3, %mm7
695 paddq %mm7, %mm2
696 movd %mm2, 8(%edx)
697 psrlq $32, %mm2
699 paddq %mm0, %mm2
700 movd %mm2, 12(%edx) C dst[2*size-1]
702 emms
703 ret
705 EPILOGUE()