1 dnl mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
3 dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl or both in parallel, as here.
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`..
/config.m4
')
34 C * Improve ad-hoc outer loop code and register handling. Some feed-in
35 C scheduling could improve things by several cycles per outer iteration.
36 C * In the L(am3)...L(am1) code, keep accumulation operands in registers,
37 C without storing intermediates to rp.
38 C * We might want to keep 32 in a free mm register, since the register form is
39 C 3 bytes and the immediate form is 4 bytes. About 80 bytes to save.
40 C * Look into different loop alignment; we now expand the code by about 50
41 C bytes with possibly needless alignment.
42 C * Use OSP, should solve feed-in latency problems.
43 C * Address relative slowness for un<=3 on Pentium M. The old code is
44 C considerably faster there. (1:20/14, 2:34/32, 3:66/57)
53 PROLOGUE(mpn_sqr_basecase)
54 mov 4(%esp), %edx C rp
55 mov 8(%esp), %eax C up
56 mov 12(%esp), %ecx C un
66 L(un1): mov (%eax), %eax
72 L(un2): movd (%eax), %mm0 C un=2
73 movd (%eax), %mm2 C un=2
74 movd 4(%eax), %mm1 C un=2
75 pmuludq %mm0, %mm0 C 64b weight 0 un=2
76 pmuludq %mm1, %mm2 C 64b weight 32 un=2
77 pmuludq %mm1, %mm1 C 64b weight 64 un=2
78 movd %mm0, (%edx) C un=2
79 psrlq $32, %mm0 C 32b weight 32 un=2
80 pcmpeqd %mm7, %mm7 C un=2
81 psrlq $33, %mm7 C 0x000000007FFFFFFF un=2
82 pand %mm2, %mm7 C 31b weight 32 un=2
83 psrlq $31, %mm2 C 33b weight 65 un=2
84 psllq $1, %mm7 C 31b weight 33 un=2
85 paddq %mm7, %mm0 C un=2
86 movd %mm0, 4(%edx) C un=2
87 psrlq $32, %mm0 C un=2
88 paddq %mm2, %mm1 C un=2
89 paddq %mm0, %mm1 C un=2
90 movd %mm1, 8(%edx) C un=2
91 psrlq $32, %mm1 C un=2
92 movd %mm1, 12(%edx) C un=2
95 L(un3): movd (%eax), %mm7 C un=3
96 movd 4(%eax), %mm6 C un=3
97 pmuludq %mm7, %mm6 C un=3
98 movd 8(%eax), %mm2 C un=3
99 pmuludq %mm7, %mm2 C un=3
100 movd %mm6, 4(%edx) C un=3
101 psrlq $32, %mm6 C un=3
102 paddq %mm2, %mm6 C un=3
103 movd %mm6, 8(%edx) C un=3
104 psrlq $32, %mm6 C un=3
105 movd %mm6, 12(%edx) C un=3
106 lea 4(%edx), %edx C un=3
107 lea 4(%eax), %eax C un=3
109 L(un4): movd (%eax), %mm7 C un=4
110 movd 4(%eax), %mm6 C un=4
111 pmuludq %mm7, %mm6 C un=4
112 movd 8(%eax), %mm0 C un=4
113 pmuludq %mm7, %mm0 C un=4
114 movd 12(%eax), %mm1 C un=4
115 pmuludq %mm7, %mm1 C un=4
116 movd %mm6, 4(%edx) C un=4
117 psrlq $32, %mm6 C un=4
118 paddq %mm0, %mm6 C un=4
119 movd %mm6, 8(%edx) C un=4
120 psrlq $32, %mm6 C un=4
121 paddq %mm1, %mm6 C un=4
122 movd %mm6, 12(%edx) C un=4
123 psrlq $32, %mm6 C un=4
124 movd %mm6, 16(%edx) C un=4
125 lea 4(%edx), %edx C un=4
126 lea 4(%eax), %eax C un=4
134 lea 4(%eax), %esi C init up, up++
135 lea 4(%eax), %eax C up2++ FIXME: should fix offsets
136 lea 4(%edx), %edi C init rp, rp++
137 lea 4(%edx), %edx C rp2++
138 lea -4(%ecx), %ebx C loop count
146 movd (%eax), %mm4 C m 1
147 lea (%ebx), %ecx C inner loop count m 1
148 pmuludq %mm7, %mm4 C m 1
149 movd 4(%eax), %mm3 C m 1
150 pmuludq %mm7, %mm3 C m 1
151 movd 8(%eax), %mm0 C m 1
155 pmuludq %mm7, %mm4 C m 1
156 paddq %mm0, %mm6 C m 1
157 movd 4(%eax), %mm3 C m 1
158 movd %mm6, -8(%edx) C m 1
159 psrlq $32, %mm6 C m 1
160 pmuludq %mm7, %mm3 C m 1
161 paddq %mm1, %mm6 C m 1
162 movd 8(%eax), %mm0 C m 1
163 movd %mm6, -4(%edx) C m 1
164 psrlq $32, %mm6 C m 1
165 L(m01): pmuludq %mm7, %mm0 C m 1
166 paddq %mm4, %mm6 C m 1
167 movd 12(%eax), %mm1 C m 1
168 movd %mm6, (%edx) C m 1
169 psrlq $32, %mm6 C m 1
170 pmuludq %mm7, %mm1 C m 1
171 paddq %mm3, %mm6 C m 1
172 movd 16(%eax), %mm4 C m 1
173 movd %mm6, 4(%edx) C m 1
174 psrlq $32, %mm6 C m 1
175 lea 16(%eax), %eax C m 1
176 lea 16(%edx), %edx C m 1
179 pmuludq %mm7, %mm4 C m 1
180 paddq %mm0, %mm6 C m 1
181 movd %mm6, -8(%edx) C m 1
182 psrlq $32, %mm6 C m 1
183 paddq %mm1, %mm6 C m 1
187 movd (%eax), %mm1 C m 2
188 lea (%ebx), %ecx C inner loop count m 2
189 pmuludq %mm7, %mm1 C m 2
190 movd 4(%eax), %mm4 C m 2
191 pmuludq %mm7, %mm4 C m 2
192 movd 8(%eax), %mm3 C m 2
196 pmuludq %mm7, %mm4 C m 2
197 paddq %mm0, %mm6 C m 2
198 movd 8(%eax), %mm3 C m 2
199 movd %mm6, -4(%edx) C m 2
200 psrlq $32, %mm6 C m 2
201 L(m10): pmuludq %mm7, %mm3 C m 2
202 paddq %mm1, %mm6 C m 2
203 movd 12(%eax), %mm0 C m 2
204 movd %mm6, (%edx) C m 2
205 psrlq $32, %mm6 C m 2
206 pmuludq %mm7, %mm0 C m 2
207 paddq %mm4, %mm6 C m 2
208 movd 16(%eax), %mm1 C m 2
209 movd %mm6, 4(%edx) C m 2
210 psrlq $32, %mm6 C m 2
211 pmuludq %mm7, %mm1 C m 2
212 paddq %mm3, %mm6 C m 2
213 movd 20(%eax), %mm4 C m 2
214 movd %mm6, 8(%edx) C m 2
215 psrlq $32, %mm6 C m 2
216 lea 16(%eax), %eax C m 2
217 lea 16(%edx), %edx C m 2
220 pmuludq %mm7, %mm4 C m 2
221 paddq %mm0, %mm6 C m 2
222 movd %mm6, -4(%edx) C m 2
223 psrlq $32, %mm6 C m 2
224 paddq %mm1, %mm6 C m 2
228 movd (%eax), %mm0 C m 3
229 lea (%ebx), %ecx C inner loop count m 3
230 pmuludq %mm7, %mm0 C m 3
231 movd 4(%eax), %mm1 C m 3
232 pmuludq %mm7, %mm1 C m 3
233 movd 8(%eax), %mm4 C m 3
237 pmuludq %mm7, %mm4 C m 3
238 paddq %mm0, %mm6 C m 3
239 movd 12(%eax), %mm3 C m 3
240 movd %mm6, (%edx) C m 3
241 psrlq $32, %mm6 C m 3
242 pmuludq %mm7, %mm3 C m 3
243 paddq %mm1, %mm6 C m 3
244 movd 16(%eax), %mm0 C m 3
245 movd %mm6, 4(%edx) C m 3
246 psrlq $32, %mm6 C m 3
247 pmuludq %mm7, %mm0 C m 3
248 paddq %mm4, %mm6 C m 3
249 movd 20(%eax), %mm1 C m 3
250 movd %mm6, 8(%edx) C m 3
251 psrlq $32, %mm6 C m 3
252 pmuludq %mm7, %mm1 C m 3
253 paddq %mm3, %mm6 C m 3
254 movd 24(%eax), %mm4 C m 3
255 movd %mm6, 12(%edx) C m 3
256 psrlq $32, %mm6 C m 3
257 lea 16(%eax), %eax C m 3
258 lea 16(%edx), %edx C m 3
261 pmuludq %mm7, %mm4 C m 3
262 paddq %mm0, %mm6 C m 3
263 movd %mm6, (%edx) C m 3
264 psrlq $32, %mm6 C m 3
265 paddq %mm1, %mm6 C m 3
269 movd (%eax), %mm3 C m 0
270 lea (%ebx), %ecx C inner loop count m 0
271 pmuludq %mm7, %mm3 C m 0
272 movd 4(%eax), %mm0 C m 0
273 pmuludq %mm7, %mm0 C m 0
274 movd 8(%eax), %mm1 C m 0
278 pmuludq %mm7, %mm4 C m 0
279 paddq %mm0, %mm6 C m 0
280 movd (%eax), %mm3 C m 0
281 movd %mm6, -12(%edx) C m 0
282 psrlq $32, %mm6 C m 0
283 pmuludq %mm7, %mm3 C m 0
284 paddq %mm1, %mm6 C m 0
285 movd 4(%eax), %mm0 C m 0
286 movd %mm6, -8(%edx) C m 0
287 psrlq $32, %mm6 C m 0
288 pmuludq %mm7, %mm0 C m 0
289 paddq %mm4, %mm6 C m 0
290 movd 8(%eax), %mm1 C m 0
291 movd %mm6, -4(%edx) C m 0
292 psrlq $32, %mm6 C m 0
293 L(m00): pmuludq %mm7, %mm1 C m 0
294 paddq %mm3, %mm6 C m 0
295 movd 12(%eax), %mm4 C m 0
296 movd %mm6, (%edx) C m 0
297 psrlq $32, %mm6 C m 0
298 lea 16(%eax), %eax C m 0
299 lea 16(%edx), %edx C m 0
302 pmuludq %mm7, %mm4 C m 0
303 paddq %mm0, %mm6 C m 0
304 movd %mm6, -12(%edx) C m 0
305 psrlq $32, %mm6 C m 0
306 paddq %mm1, %mm6 C m 0
310 lea 8(%edi), %edi C rp += 2
311 movd (%esi), %mm7 C am 3
312 mov %edi, %edx C rp2 = rp am 3
313 lea 4(%esi), %esi C up++ am 3
314 lea (%esi), %eax C up2 = up am 3
315 movd (%eax), %mm0 C am 3
316 lea (%ebx), %ecx C inner loop count am 3
317 pxor %mm6, %mm6 C am 3
318 pmuludq %mm7, %mm0 C am 3
319 movd 4(%eax), %mm1 C am 3
320 movd (%edx), %mm4 C am 3
321 pmuludq %mm7, %mm1 C am 3
322 movd 8(%eax), %mm2 C am 3
323 paddq %mm0, %mm4 C am 3
324 movd 4(%edx), %mm5 C am 3
328 pmuludq %mm7, %mm2 C am 3
329 paddq %mm4, %mm6 C am 3
330 movd 12(%eax), %mm3 C am 3
331 paddq %mm1, %mm5 C am 3
332 movd 8(%edx), %mm4 C am 3
333 movd %mm6, (%edx) C am 3
334 psrlq $32, %mm6 C am 3
335 pmuludq %mm7, %mm3 C am 3
336 paddq %mm5, %mm6 C am 3
337 movd 16(%eax), %mm0 C am 3
338 paddq %mm2, %mm4 C am 3
339 movd 12(%edx), %mm5 C am 3
340 movd %mm6, 4(%edx) C am 3
341 psrlq $32, %mm6 C am 3
342 pmuludq %mm7, %mm0 C am 3
343 paddq %mm4, %mm6 C am 3
344 movd 20(%eax), %mm1 C am 3
345 paddq %mm3, %mm5 C am 3
346 movd 16(%edx), %mm4 C am 3
347 movd %mm6, 8(%edx) C am 3
348 psrlq $32, %mm6 C am 3
349 pmuludq %mm7, %mm1 C am 3
350 paddq %mm5, %mm6 C am 3
351 movd 24(%eax), %mm2 C am 3
352 paddq %mm0, %mm4 C am 3
353 movd 20(%edx), %mm5 C am 3
354 movd %mm6, 12(%edx) C am 3
355 psrlq $32, %mm6 C am 3
356 lea 16(%eax), %eax C am 3
357 lea 16(%edx), %edx C am 3
360 pmuludq %mm7, %mm2 C am 3
361 paddq %mm4, %mm6 C am 3
362 paddq %mm1, %mm5 C am 3
363 movd 8(%edx), %mm4 C am 3
364 movd %mm6, (%edx) C am 3
365 psrlq $32, %mm6 C am 3
366 paddq %mm5, %mm6 C am 3
367 paddq %mm2, %mm4 C am 3
368 L(2): movd %mm6, 4(%edx) C am 3
369 psrlq $32, %mm6 C am 3
370 paddq %mm4, %mm6 C am 3
371 movd %mm6, 8(%edx) C am 3
372 psrlq $32, %mm6 C am 3
373 movd %mm6, 12(%edx) C am 3
375 lea 8(%edi), %edi C rp += 2
376 movd (%esi), %mm7 C am 2
377 mov %edi, %edx C rp2 = rp am 2
378 lea 4(%esi), %esi C up++ am 2
379 lea (%esi), %eax C up2 = up am 2
380 movd (%eax), %mm1 C am 2
381 lea (%ebx), %ecx C inner loop count am 2
382 pxor %mm6, %mm6 C am 2
383 pmuludq %mm7, %mm1 C am 2
384 movd 4(%eax), %mm2 C am 2
385 movd (%edx), %mm5 C am 2
386 pmuludq %mm7, %mm2 C am 2
387 movd 8(%eax), %mm3 C am 2
388 paddq %mm1, %mm5 C am 2
389 movd 4(%edx), %mm4 C am 2
393 pmuludq %mm7, %mm2 C am 2
394 paddq %mm4, %mm6 C am 2
395 movd 8(%eax), %mm3 C am 2
396 paddq %mm1, %mm5 C am 2
397 movd 4(%edx), %mm4 C am 2
398 movd %mm6, -4(%edx) C am 2
399 psrlq $32, %mm6 C am 2
401 pmuludq %mm7, %mm3 C am 2
402 paddq %mm5, %mm6 C am 2
403 movd 12(%eax), %mm0 C am 2
404 paddq %mm2, %mm4 C am 2
405 movd 8(%edx), %mm5 C am 2
406 movd %mm6, (%edx) C am 2
407 psrlq $32, %mm6 C am 2
408 pmuludq %mm7, %mm0 C am 2
409 paddq %mm4, %mm6 C am 2
410 movd 16(%eax), %mm1 C am 2
411 paddq %mm3, %mm5 C am 2
412 movd 12(%edx), %mm4 C am 2
413 movd %mm6, 4(%edx) C am 2
414 psrlq $32, %mm6 C am 2
415 pmuludq %mm7, %mm1 C am 2
416 paddq %mm5, %mm6 C am 2
417 movd 20(%eax), %mm2 C am 2
418 paddq %mm0, %mm4 C am 2
419 movd 16(%edx), %mm5 C am 2
420 movd %mm6, 8(%edx) C am 2
421 psrlq $32, %mm6 C am 2
422 lea 16(%eax), %eax C am 2
423 lea 16(%edx), %edx C am 2
426 pmuludq %mm7, %mm2 C am 2
427 paddq %mm4, %mm6 C am 2
428 paddq %mm1, %mm5 C am 2
429 movd 4(%edx), %mm4 C am 2
430 movd %mm6, -4(%edx) C am 2
431 psrlq $32, %mm6 C am 2
432 paddq %mm5, %mm6 C am 2
433 paddq %mm2, %mm4 C am 2
434 L(1): movd %mm6, (%edx) C am 2
435 psrlq $32, %mm6 C am 2
436 paddq %mm4, %mm6 C am 2
437 movd %mm6, 4(%edx) C am 2
438 psrlq $32, %mm6 C am 2
439 movd %mm6, 8(%edx) C am 2
441 lea 8(%edi), %edi C rp += 2
442 movd (%esi), %mm7 C am 1
443 mov %edi, %edx C rp2 = rp am 1
444 lea 4(%esi), %esi C up++ am 1
445 lea (%esi), %eax C up2 = up am 1
446 movd (%eax), %mm2 C am 1
447 lea (%ebx), %ecx C inner loop count am 1
448 pxor %mm6, %mm6 C am 1
449 pmuludq %mm7, %mm2 C am 1
450 movd 4(%eax), %mm3 C am 1
451 movd (%edx), %mm4 C am 1
452 pmuludq %mm7, %mm3 C am 1
453 movd 8(%eax), %mm0 C am 1
454 paddq %mm2, %mm4 C am 1
455 movd 4(%edx), %mm5 C am 1
459 pmuludq %mm7, %mm2 C am 1
460 paddq %mm4, %mm6 C am 1
461 movd 4(%eax), %mm3 C am 1
462 paddq %mm1, %mm5 C am 1
463 movd (%edx), %mm4 C am 1
464 movd %mm6, -8(%edx) C am 1
465 psrlq $32, %mm6 C am 1
466 pmuludq %mm7, %mm3 C am 1
467 paddq %mm5, %mm6 C am 1
468 movd 8(%eax), %mm0 C am 1
469 paddq %mm2, %mm4 C am 1
470 movd 4(%edx), %mm5 C am 1
471 movd %mm6, -4(%edx) C am 1
472 psrlq $32, %mm6 C am 1
474 pmuludq %mm7, %mm0 C am 1
475 paddq %mm4, %mm6 C am 1
476 movd 12(%eax), %mm1 C am 1
477 paddq %mm3, %mm5 C am 1
478 movd 8(%edx), %mm4 C am 1
479 movd %mm6, (%edx) C am 1
480 psrlq $32, %mm6 C am 1
481 pmuludq %mm7, %mm1 C am 1
482 paddq %mm5, %mm6 C am 1
483 movd 16(%eax), %mm2 C am 1
484 paddq %mm0, %mm4 C am 1
485 movd 12(%edx), %mm5 C am 1
486 movd %mm6, 4(%edx) C am 1
487 psrlq $32, %mm6 C am 1
488 lea 16(%eax), %eax C am 1
489 lea 16(%edx), %edx C am 1
492 pmuludq %mm7, %mm2 C am 1
493 paddq %mm4, %mm6 C am 1
494 paddq %mm1, %mm5 C am 1
495 movd (%edx), %mm4 C am 1
496 movd %mm6, -8(%edx) C am 1
497 psrlq $32, %mm6 C am 1
498 paddq %mm5, %mm6 C am 1
499 paddq %mm2, %mm4 C am 1
500 L(0): movd %mm6, -4(%edx) C am 1
501 psrlq $32, %mm6 C am 1
502 paddq %mm4, %mm6 C am 1
503 movd %mm6, (%edx) C am 1
504 psrlq $32, %mm6 C am 1
505 movd %mm6, 4(%edx) C am 1
507 lea 8(%edi), %edi C rp += 2
508 movd (%esi), %mm7 C am 0
509 mov %edi, %edx C rp2 = rp am 0
510 lea 4(%esi), %esi C up++ am 0
511 lea (%esi), %eax C up2 = up am 0
512 movd (%eax), %mm3 C am 0
513 lea (%ebx), %ecx C inner loop count am 0
514 pxor %mm6, %mm6 C am 0
515 pmuludq %mm7, %mm3 C am 0
516 movd 4(%eax), %mm0 C am 0
517 movd (%edx), %mm5 C am 0
518 pmuludq %mm7, %mm0 C am 0
519 movd 8(%eax), %mm1 C am 0
520 paddq %mm3, %mm5 C am 0
521 movd 4(%edx), %mm4 C am 0
525 pmuludq %mm7, %mm2 C am 0
526 paddq %mm4, %mm6 C am 0
527 movd (%eax), %mm3 C am 0
528 paddq %mm1, %mm5 C am 0
529 movd -4(%edx), %mm4 C am 0
530 movd %mm6, -12(%edx) C am 0
531 psrlq $32, %mm6 C am 0
532 pmuludq %mm7, %mm3 C am 0
533 paddq %mm5, %mm6 C am 0
534 movd 4(%eax), %mm0 C am 0
535 paddq %mm2, %mm4 C am 0
536 movd (%edx), %mm5 C am 0
537 movd %mm6, -8(%edx) C am 0
538 psrlq $32, %mm6 C am 0
539 pmuludq %mm7, %mm0 C am 0
540 paddq %mm4, %mm6 C am 0
541 movd 8(%eax), %mm1 C am 0
542 paddq %mm3, %mm5 C am 0
543 movd 4(%edx), %mm4 C am 0
544 movd %mm6, -4(%edx) C am 0
545 psrlq $32, %mm6 C am 0
547 pmuludq %mm7, %mm1 C am 0
548 paddq %mm5, %mm6 C am 0
549 movd 12(%eax), %mm2 C am 0
550 paddq %mm0, %mm4 C am 0
551 movd 8(%edx), %mm5 C am 0
552 movd %mm6, (%edx) C am 0
553 psrlq $32, %mm6 C am 0
554 lea 16(%eax), %eax C am 0
555 lea 16(%edx), %edx C am 0
558 pmuludq %mm7, %mm2 C am 0
559 paddq %mm4, %mm6 C am 0
560 paddq %mm1, %mm5 C am 0
561 movd -4(%edx), %mm4 C am 0
562 movd %mm6, -12(%edx) C am 0
563 psrlq $32, %mm6 C am 0
564 paddq %mm5, %mm6 C am 0
565 paddq %mm2, %mm4 C am 0
566 L(3): movd %mm6, -8(%edx) C am 0
567 psrlq $32, %mm6 C am 0
568 paddq %mm4, %mm6 C am 0
569 movd %mm6, -4(%edx) C am 0
570 psrlq $32, %mm6 C am 0
571 movd %mm6, (%edx) C am 0
581 L(am3): C up[un-1..un-3] x up[un-4]
582 lea 8(%edx), %edx C rp2 += 2
604 movd %mm4, 12(%edx) C FIXME feed through!
607 L(am2): C up[un-1..un-2] x up[un-3]
608 lea 8(%edx), %edx C rp2 += 2
623 movd %mm4, 8(%edx) C FIXME feed through!
626 L(am1): C up[un-1] x up[un-2]
627 lea 8(%edx), %edx C rp2 += 2
637 C *** diag stuff, use elementary code for now
639 mov 4(%esp), %edx C rp
640 mov 8(%esp), %eax C up
641 mov 12(%esp), %ecx C un
644 pmuludq %mm2, %mm2 C src[0]^2
649 movd 4(%edx), %mm3 C dst[1]
654 psllq $1, %mm3 C 2*dst[1]
662 movd 4(%eax), %mm0 C src limb
666 pand %mm0, %mm1 C diagonal low
667 psrlq $32, %mm0 C diagonal high
670 psllq $1, %mm3 C 2*dst[i]
677 psllq $1, %mm3 C 2*dst[i+1]
687 movd 4(%eax), %mm0 C src[size-1]
689 pand %mm0, %mm7 C diagonal low
690 psrlq $32, %mm0 C diagonal high
692 movd 8(%edx), %mm3 C dst[2*size-2]
700 movd %mm2, 12(%edx) C dst[2*size-1]