/ This code is a translation of dct64_k7.s from MPlayer.
/ Coded by Felix Buenemann <atmosfear at users.sourceforge.net>
/ TODO: - fix phases 4 and 5 (sse)
/       - optimize scalar FPU code? (interleave with sse code)
/       - fix alignment (prohibits finishing this code)
/       - then use faster insns for aligned data
/ Note: currently code is disabled as I couldn't get input data aligned!
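/ (once alignment is fixed, the unaligned movups loads/stores below could become
/ movaps, i.e. the "faster insns for aligned data" mentioned above)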
//x_plus_minus_3dnow: .long 0x00000000, 0x80000000
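// (x_plus_minus_3dnow is a sign mask, not the float values +1/-1: xor-ing a packed
//  pair with it leaves the low element untouched (0x00000000) and flips the sign
//  bit of the high element (0x80000000), which is how the "+1 | -1" factor
//  mentioned in the phase 6 comments would be applied.)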
/* Phase 1 (complete, worx) */
// [1] Process Block A1 (16 Bytes)
// Copy A1 to another register A2
// Process Block B1 (last 16 bytes)
/ movq 120(%eax), %mm1
/ movq 112(%eax), %mm5
movups 112(%eax), %xmm1
/* The PSWAPD instruction swaps or reverses the upper and lower
 * doublewords of the source operand. PSWAPD mmreg1, mmreg2
 * performs the following operations:
 * temp = mmreg2
 * mmreg1[63:32] = temp[31:0 ]
 * mmreg1[31:0 ] = temp[63:32]
 */
// shufps with $177 would exchange a,b,c,d to b,a,d,c in xmm1 (desc ia32-ref p.752);
// the $27 used here reverses the whole register to d,c,b,a instead
//// shufps $177, %xmm1, %xmm1
shufps $27, %xmm1, %xmm1
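// (How the immediates decode: each 2-bit field of the shufps immediate selects a
//  source element for one result slot; slots 0-1 come from the destination operand
//  and slots 2-3 from the source operand, which with source == destination reduces
//  to a plain permutation.
//    $27  = 00 01 10 11b -> elements 3,2,1,0 -> full reverse  a,b,c,d -> d,c,b,a
//    $177 = 10 11 00 01b -> elements 1,0,3,2 -> swap in pairs a,b,c,d -> b,a,d,c,
//  the latter being the SSE counterpart of PSWAPD applied to each 64-bit half.)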
// I do a,b,c,d -> d,c,b,a to suit order when writing to mem (saves one shufps)
shufps $27, %xmm2, %xmm2
// Save A2 to mem (end)
/ movq %mm3, 120(%edx)
/ movq %mm7, 112(%edx)
movups %xmm2, 112(%edx)
// [2] Process next data block
/ movq 16(%eax), %mm0
/ movq 24(%eax), %mm4
movups 16(%eax), %xmm0
/ movq 104(%eax), %mm1
/ movq 96(%eax), %mm5
movups 96(%eax), %xmm1
//// shufps $177, %xmm1, %xmm1
shufps $27, %xmm1, %xmm1
/ movq %mm0, 16(%edx)
/ movq %mm4, 24(%edx)
movups %xmm0, 16(%edx)
/ pfmul 16(%ebx), %mm3
/ pfmul 24(%ebx), %mm7
movups 16(%ebx), %xmm7
shufps $27, %xmm2, %xmm2
/ movq %mm3, 104(%edx)
/ movq %mm7, 96(%edx)
movups %xmm2, 96(%edx)
/ movq 32(%eax), %mm0
/ movq 40(%eax), %mm4
movups 32(%eax), %xmm0
/ movq 88(%eax), %mm1
/ movq 80(%eax), %mm5
movups 80(%eax), %xmm1
//// shufps $177, %xmm1, %xmm1
shufps $27, %xmm1, %xmm1
/ movq %mm0, 32(%edx)
/ movq %mm4, 40(%edx)
movups %xmm0, 32(%edx)
/ pfmul 32(%ebx), %mm3
/ pfmul 40(%ebx), %mm7
movups 32(%ebx), %xmm7
shufps $27, %xmm2, %xmm2
/ movq %mm3, 88(%edx)
/ movq %mm7, 80(%edx)
movups %xmm2, 80(%edx)
/ movq 48(%eax), %mm0
/ movq 56(%eax), %mm4
movups 48(%eax), %xmm0
/ movq 72(%eax), %mm1
/ movq 64(%eax), %mm5
movups 64(%eax), %xmm1
//// shufps $177, %xmm1, %xmm1
shufps $27, %xmm1, %xmm1
/ movq %mm0, 48(%edx)
/ movq %mm4, 56(%edx)
movups %xmm0, 48(%edx)
/ pfmul 48(%ebx), %mm3
/ pfmul 56(%ebx), %mm7
movups 48(%ebx), %xmm7
shufps $27, %xmm2, %xmm2
/ movq %mm3, 72(%edx)
/ movq %mm7, 64(%edx)
movups %xmm2, 64(%edx)
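// (Summary of the phase 1 pattern above: each 16-byte block from the front of the
//  input in %eax is processed together with its mirrored block from the back of
//  the input, reversed with shufps $27, and the results go to the work buffer in
//  %edx -- one half to the front positions, the other half, scaled by the
//  coefficients loaded from nn(%ebx) in place of the 3DNow! pfmul, to the mirrored
//  back positions. This is the usual first butterfly stage of dct64.)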
movl $costab_mmx,%ebx
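// (%ebx now holds the base address of costab_mmx, the cosine coefficient table
//  that the nn(%ebx) memory operands index into.)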
// end phase 1 fpu code
/* Phase 2 (completed, worx) */
/ movq 56(%edx), %mm1
/ movq 48(%edx), %mm5
movups 48(%edx), %xmm1
//// shufps $177, %xmm1, %xmm1
shufps $27, %xmm1, %xmm1
/ pfmul 64(%ebx), %mm3
/ pfmul 72(%ebx), %mm7
movups 64(%ebx), %xmm7
shufps $27, %xmm2, %xmm2
/ movq %mm3, 56(%ecx)
/ movq %mm7, 48(%ecx)
movups %xmm2, 48(%ecx)
/ movq 16(%edx), %mm0
/ movq 24(%edx), %mm4
movups 16(%edx), %xmm0
/ movq 40(%edx), %mm1
/ movq 32(%edx), %mm5
movups 32(%edx), %xmm1
//// shufps $177, %xmm1, %xmm1
shufps $27, %xmm1, %xmm1
/ movq %mm0, 16(%ecx)
/ movq %mm4, 24(%ecx)
movups %xmm0, 16(%ecx)
/ pfmul 80(%ebx), %mm3
/ pfmul 88(%ebx), %mm7
movups 80(%ebx), %xmm7
shufps $27, %xmm2, %xmm2
/ movq %mm3, 40(%ecx)
/ movq %mm7, 32(%ecx)
movups %xmm2, 32(%ecx)
/* Phase 3 (completed, working) */
/ movq 64(%edx), %mm0
/ movq 72(%edx), %mm4
movups 64(%edx), %xmm0
/ movq 120(%edx), %mm1
/ movq 112(%edx), %mm5
movups 112(%edx), %xmm1
//// shufps $177, %xmm1, %xmm1
shufps $27, %xmm1, %xmm1
/ movq %mm0, 64(%ecx)
/ movq %mm4, 72(%ecx)
movups %xmm0, 64(%ecx)
// optimized (xmm1<->xmm2)
/ pfmul 64(%ebx), %mm3
/ pfmul 72(%ebx), %mm7
movups 64(%ebx), %xmm7
shufps $27, %xmm1, %xmm1
/ movq %mm3, 120(%ecx)
/ movq %mm7, 112(%ecx)
movups %xmm1, 112(%ecx)
/ movq 80(%edx), %mm0
/ movq 88(%edx), %mm4
movups 80(%edx), %xmm0
/ movq 104(%edx), %mm1
/ movq 96(%edx), %mm5
movups 96(%edx), %xmm1
//// shufps $177, %xmm1, %xmm1
shufps $27, %xmm1, %xmm1
/ movq %mm0, 80(%ecx)
/ movq %mm4, 88(%ecx)
movups %xmm0, 80(%ecx)
// optimized (xmm1<->xmm2)
/ pfmul 80(%ebx), %mm3
/ pfmul 88(%ebx), %mm7
movups 80(%ebx), %xmm7
shufps $27, %xmm1, %xmm1
/ movq %mm3, 104(%ecx)
/ movq %mm7, 96(%ecx)
movups %xmm1, 96(%ecx)
/* Phase 4 (completed, buggy) */
/ movq 96(%ebx), %mm2
/ movq 104(%ebx), %mm6
movups 96(%ebx), %xmm4
/ movq 24(%ecx), %mm1
/ movq 16(%ecx), %mm5
movups 16(%ecx), %xmm1
//// shufps $177, %xmm1, %xmm1
shufps $27, %xmm1, %xmm1
shufps $27, %xmm2, %xmm2
/ movq %mm3, 24(%edx)
/ movq %mm7, 16(%edx)
movups %xmm2, 16(%edx)
/ movq 32(%ecx), %mm0
/ movq 40(%ecx), %mm4
movups 32(%ecx), %xmm0
/ movq 56(%ecx), %mm1
/ movq 48(%ecx), %mm5
movups 48(%ecx), %xmm1
//// shufps $177, %xmm1, %xmm1
shufps $27, %xmm1, %xmm1
/ movq %mm0, 32(%edx)
/ movq %mm4, 40(%edx)
movups %xmm0, 32(%edx)
// Luckily we can swap this (xmm1<->xmm2)
shufps $27, %xmm1, %xmm1
/ movq %mm3, 56(%edx)
/ movq %mm7, 48(%edx)
movups %xmm1, 48(%edx)
/ movq 64(%ecx), %mm0
/ movq 72(%ecx), %mm4
movups 64(%ecx), %xmm0
/ movq 88(%ecx), %mm1
/ movq 80(%ecx), %mm5
movups 80(%ecx), %xmm1
//// shufps $177, %xmm1, %xmm1
shufps $27, %xmm1, %xmm1
/ movq %mm0, 64(%edx)
/ movq %mm4, 72(%edx)
movups %xmm0, 64(%edx)
shufps $27, %xmm2, %xmm2
/ movq %mm3, 88(%edx)
/ movq %mm7, 80(%edx)
movups %xmm2, 80(%edx)
/ movq 96(%ecx), %mm0
/ movq 104(%ecx), %mm4
movups 96(%ecx), %xmm0
/ movq 120(%ecx), %mm1
/ movq 112(%ecx), %mm5
movups 112(%ecx), %xmm1
//// shufps $177, %xmm1, %xmm1
shufps $27, %xmm1, %xmm1
/ movq %mm0, 96(%edx)
/ movq %mm4, 104(%edx)
movups %xmm0, 96(%edx)
// This is already optimized, so xmm2 must be swapped with xmm1 for rest of phase
shufps $27, %xmm1, %xmm1
/ movq %mm3, 120(%edx)
/ movq %mm7, 112(%edx)
movups %xmm1, 112(%edx)
// end of phase 4 fpu
// the stuff below still needs to be finished; I use the FPU code for it for now
/* Phase 5 (completed, crashing) */
/ movq 112(%ebx), %mm2
// move 8 byte data to (low)high quadword - check this! atmos
movlps 112(%ebx), %xmm4
// maybe I need movhlps too to get data into correct quadword
movlhps %xmm4, %xmm4
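// (For reference: movlps loads 8 bytes into the low quadword and leaves the high
//  quadword unchanged; movlhps %xmm4, %xmm4 then copies the low quadword into the
//  high one. So if the intent is to broadcast the 8-byte constant at 112(%ebx)
//  into both halves of xmm4, this pair is sufficient -- movhlps would only be
//  needed to move data from the high quadword down into the low one.)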
/ movq 16(%edx), %mm4
movups (%edx), %xmm0
// hmm? this is strange
/ movq 8(%edx), %mm1
/ movq 24(%edx), %mm5
movlps 8(%edx), %xmm1
movhps 24(%edx), %xmm1
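// (The movlps/movhps pair gathers two non-contiguous 8-byte blocks -- which the
//  3DNow! version kept in separate MMX registers -- into one xmm register:
//  low quadword = 8(%edx), high quadword = 24(%edx).)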
pshufd $177, %xmm1, %xmm1
/ movq %mm4, 16(%ecx)
movlps %xmm0, (%ecx)
movhps %xmm0, 16(%ecx)
// I need to emulate pfsubr here
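// (For reference: PFSUBR dst, src computes dst = src - dst, i.e. PFSUB with the
//  operands reversed. A direct SSE substitute needs a spare register -- a minimal
//  sketch, with a hypothetical source operand SRC and xmm7 as scratch:
//      movups SRC, %xmm7
//      subps  %xmm2, %xmm7    // xmm7 = SRC - xmm2
//      movaps %xmm7, %xmm2
//  The code below takes a different route and merges the two orders with shufpd.)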
// now move correct quadword from reverse subtraction in xmm3 to correct
// quadword in xmm2 and leave other quadword with non-reversed subtraction untouched
/// shufpd $2, %xmm3, %xmm2
// (or $1?) (see ia32-ref p.749)
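// (For reference: with shufpd $imm, %xmm3, %xmm2, bit 0 of the immediate selects
//  which quadword of xmm2 becomes the low quadword of the result and bit 1 selects
//  which quadword of xmm3 becomes the high quadword. So $2 keeps the low quadword
//  of xmm2 and takes the high quadword of xmm3, while $1 would take the high
//  quadword of xmm2 and the low quadword of xmm3 -- which immediate is right
//  depends on which quadword of xmm3 holds the reversed result.)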
shufps $177, %xmm2, %xmm2
/ movq %mm3, 8(%ecx)
/ movq %mm7, 24(%ecx)
movlps %xmm2, 8(%ecx)
movhps %xmm2, 24(%ecx)
/ movq 32(%edx), %mm0
/ movq 48(%edx), %mm4
movlps 32(%edx), %xmm0
movhps 48(%edx), %xmm0
/ movq 40(%edx), %mm1
/ movq 56(%edx), %mm5
movlps 40(%edx), %xmm1
movhps 56(%edx), %xmm1
shufps $177, %xmm1, %xmm1
/ movq %mm0, 32(%ecx)
/ movq %mm4, 48(%ecx)
movlps %xmm0, 32(%ecx)
movhps %xmm0, 48(%ecx)
/// shufpd $2, %xmm3, %xmm2
shufps $177, %xmm2, %xmm2
/ movq %mm3, 40(%ecx)
/ movq %mm7, 56(%ecx)
movlps %xmm2, 40(%ecx)
movhps %xmm2, 56(%ecx)
/ movq 64(%edx), %mm0
/ movq 80(%edx), %mm4
movlps 64(%edx), %xmm0
movhps 80(%edx), %xmm0
/ movq 72(%edx), %mm1
/ movq 88(%edx), %mm5
movlps 72(%edx), %xmm1
movhps 88(%edx), %xmm1
shufps $177, %xmm1, %xmm1
/ movq %mm0, 64(%ecx)
/ movq %mm4, 80(%ecx)
movlps %xmm0, 64(%ecx)
movhps %xmm0, 80(%ecx)
/// shufpd $2, %xmm3, %xmm2
shufps $177, %xmm2, %xmm2
/ movq %mm3, 72(%ecx)
/ movq %mm7, 88(%ecx)
movlps %xmm2, 72(%ecx)
movhps %xmm2, 88(%ecx)
/ movq 96(%edx), %mm0
/ movq 112(%edx), %mm4
movups 96(%edx), %xmm0
/ movq 104(%edx), %mm1
/ movq 120(%edx), %mm5
movlps 104(%edx), %xmm1
movhps 120(%edx), %xmm1
shufps $177, %xmm1, %xmm1
/ movq %mm0, 96(%ecx)
/ movq %mm4, 112(%ecx)
movups %xmm0, 96(%ecx)
/// shufpd $2, %xmm3, %xmm2
shufps $177, %xmm2, %xmm2
/ movq %mm3, 104(%ecx)
/ movq %mm7, 120(%ecx)
movlps %xmm2, 104(%ecx)
movhps %xmm2, 120(%ecx)
/* Phase 6. This is the end of the easy road. */
/* Code below is coded in scalar mode. Should be optimized */
// movd plus_1f, %mm6
// punpckldq 120(%ebx), %mm6 /* mm6 = 1.0 | 120(%ebx)*/
// movq x_plus_minus_3dnow, %mm7 /* mm7 = +1 | -1 */
movd 120(%ebx), %mm3
punpckldq 76(%ecx), %mm0
punpckldq 72(%ecx), %mm2
punpckldq %mm3, %mm3
punpckldq %mm1, %mm0
punpckhdq %mm1, %mm2
pfsub 52(%ecx), %mm3
pfsub 56(%ecx), %mm2
pfmul 120(%ebx), %mm3
pfmul 120(%ebx), %mm2
pfadd 56(%ecx), %mm1
pfadd 60(%ecx), %mm1
pfadd 48(%ecx), %mm0
pfadd 52(%ecx), %mm0
punpckldq %mm2, %mm1
punpckldq %mm2, %mm0
pfsub 88(%ecx), %mm1
pfmul 120(%ebx), %mm1
pfadd 92(%ecx), %mm1
pfadd 88(%ecx), %mm1
pfadd 80(%ecx), %mm0
pfadd 84(%ecx), %mm0
pfsub 84(%ecx), %mm0
pfmul 120(%ebx), %mm0
pfadd 92(%edx), %mm0
punpckldq %mm1, %mm0
movd 108(%ecx), %mm0
pfsub 104(%ecx), %mm0
pfmul 120(%ebx), %mm0
movd %mm0, 108(%edx)
pfadd 104(%ecx), %mm0
pfadd 108(%ecx), %mm0
movd %mm0, 104(%edx)
movd 124(%ecx), %mm1
pfsub 120(%ecx), %mm1
pfmul 120(%ebx), %mm1
movd %mm1, 124(%edx)
pfadd 120(%ecx), %mm1
pfadd 124(%ecx), %mm1
pfadd 112(%ecx), %mm0
pfadd 116(%ecx), %mm0
movd %mm0, 112(%edx)
movd 112(%ecx), %mm0
pfsub 116(%ecx), %mm0
pfmul 120(%ebx), %mm0
pfadd 124(%edx), %mm0
punpckldq %mm1, %mm0
movq %mm0, 116(%edx)
/* Code below is coded in scalar mode. Should be optimized */
movd %mm0, 1024(%esi)
pfmul 120(%ebx), %mm0
pfmul 120(%ebx), %mm0
movd %mm0, 512(%edi)
pfadd 12(%ecx), %mm0
movd %mm0, 512(%esi)
pfsub 20(%ecx), %mm0
pfmul 120(%ebx), %mm0
pfsub 24(%ecx), %mm0
pfmul 120(%ebx), %mm0
movd %mm0, 768(%edi)
pfadd 24(%ecx), %mm0
pfadd 28(%ecx), %mm0
pfadd 16(%ecx), %mm0
pfadd 20(%ecx), %mm0
movd %mm0, 768(%esi)
movd %mm1, 256(%esi)
movd %mm2, 256(%edi)
pfadd 48(%edx), %mm0
pfadd 40(%edx), %mm1
movd %mm0, 896(%esi)
movd %mm1, 640(%esi)
movd %mm0, 128(%edi)
movd %mm1, 384(%edi)
pfadd 56(%edx), %mm0
movd %mm0, 384(%esi)
pfadd 36(%edx), %mm0
movd %mm0, 128(%esi)
movd %mm0, 896(%edi)
pfadd 44(%edx), %mm0
movd %mm0, 640(%edi)
movq 112(%edx), %mm2
movq 104(%edx), %mm4
pfadd 112(%edx), %mm0
pfadd 104(%edx), %mm2
pfadd 120(%edx), %mm4
pfadd 64(%edx), %mm0
pfadd 80(%edx), %mm2
pfadd 72(%edx), %mm4
movd %mm0, 960(%esi)
movd %mm2, 704(%esi)
movd %mm4, 448(%esi)
movd %mm2, 320(%edi)
movd %mm4, 576(%edi)
pfadd 80(%edx), %mm1
pfadd 72(%edx), %mm3
pfadd 88(%edx), %mm5
movd %mm1, 832(%esi)
movd %mm3, 576(%esi)
movd %mm5, 320(%esi)
movd %mm1, 192(%edi)
movd %mm3, 448(%edi)
movd %mm5, 704(%edi)
movd 120(%edx), %mm0
pfadd 100(%edx), %mm0
pfadd 88(%edx), %mm0
movd %mm0, 192(%esi)
pfadd 68(%edx), %mm1
movd 124(%edx), %mm0
movd %mm0, 960(%edi)
pfadd 92(%edx), %mm0
movd %mm0, 832(%edi)
pfmul 120(%ebx), %mm0
pfadd 12(%ecx), %mm0
pfsub 20(%ecx), %mm3
pfmul 120(%ebx), %mm3
pfsub 24(%ecx), %mm2
pfmul 120(%ebx), %mm2
pfadd 24(%ecx), %mm1
pfadd 28(%ecx), %mm1
pfadd 16(%ecx), %mm0
pfadd 20(%ecx), %mm0
pfadd 48(%edx), %mm0
pfadd 40(%edx), %mm1
movd 120(%edx), %mm5
punpckldq %mm4, %mm3
punpckldq 124(%edx), %mm0
pfadd 100(%edx), %mm5
punpckldq 36(%edx), %mm4
punpckldq 92(%edx), %mm2
pfadd 88(%edx), %mm5
pfadd 68(%edx), %mm6
movq 112(%edx), %mm2
movq 104(%edx), %mm4
pfadd 120(%edx), %mm4
pfadd 64(%edx), %mm0
pfadd 80(%edx), %mm2
pfadd 72(%edx), %mm4
pfadd 80(%edx), %mm1
pfadd 72(%edx), %mm3
pfadd 88(%edx), %mm5
// here comes the old-fashioned FPU code for the tough parts