/*
 * Discrete Cosine Transform (DCT) for SSE
 * Copyright (c) 2006 Zuxy MENG <zuxy.meng@gmail.com>
 * based upon code from mp3lib/dct64.c, mp3lib/dct64_altivec.c
 * and mp3lib/dct64_MMX.c
 */
/* NOTE: The following code is suboptimal! It can be improved (at least) by:

   1. Replacing all movups with movaps. (Can parameter c always be aligned
      on a 16-byte boundary?)

   2. Rewriting it using intrinsics. (GCC generally optimizes intrinsics
      better. However, when __m128 locals are involved, GCC may
      produce bad code that uses movaps to access a stack not aligned
      on a 16-byte boundary, which leads to run-time crashes.)
*/
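
/* A rough sketch of what point 2 above might look like for the first
   butterfly loop, using SSE intrinsics. This is an illustration under
   assumptions (real == float, <xmmintrin.h> available; the helper name
   is ours), not a drop-in replacement, so it is kept disabled: */
#if 0
#include <xmmintrin.h>

static void dct64_stage1_intrin(float *b1, const float *c, const float *costab)
{
    int i;
    for (i = 0; i < 0x20 / 2; i += 4) {
        __m128 cos  = _mm_load_ps(costab + i);
        __m128 lo   = _mm_loadu_ps(c + i);            /* c may be unaligned */
        __m128 hi   = _mm_loadu_ps(c + 0x1c - i);
        __m128 rcos = _mm_shuffle_ps(cos, cos, 0x1b); /* $27: reverse lanes */
        __m128 rlo  = _mm_shuffle_ps(lo, lo, 0x1b);
        __m128 rhi  = _mm_shuffle_ps(hi, hi, 0x1b);
        /* b1[j] = c[j] + c[31-j]; b1[31-j] = (c[j] - c[31-j]) * costab[j] */
        _mm_store_ps(b1 + i, _mm_add_ps(lo, rhi));
        _mm_store_ps(b1 + 0x1c - i, _mm_mul_ps(_mm_sub_ps(rlo, hi), rcos));
    }
}
#endif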
typedef float real;

extern float __attribute__((aligned(16))) costab_mmx[];
/* Sign masks: 1 << 31 sets only the IEEE-754 sign bit, so xorps with
   these vectors flips the sign of the selected lanes. */
static const int ppnn[4] __attribute__((aligned(16))) =
{ 0, 0, 1 << 31, 1 << 31 };

static const int pnpn[4] __attribute__((aligned(16))) =
{ 0, 1 << 31, 0, 1 << 31 };

static const int nnnn[4] __attribute__((aligned(16))) =
{ 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
void dct64_sse(real *a, real *b, real *c)
{
    static real __attribute__ ((aligned(16))) b1[0x20];
    static real __attribute__ ((aligned(16))) b2[0x20];
    static real const one = 1.f;
    int i;
    short *out0 = (short*)a;
    short *out1 = (short*)b;

    {
        real *costab = costab_mmx;

        /* Stage 1: b1[j] = c[j] + c[31-j],
           b1[31-j] = (c[j] - c[31-j]) * costab[j], for j = 0..15. */
        for (i = 0; i < 0x20 / 2; i += 4)
        {
            __asm__(
49 "movaps %2, %%xmm3\n\t"
50 "shufps $27, %%xmm3, %%xmm3\n\t"
51 "movups %3, %%xmm1\n\t"
52 "movaps %%xmm1, %%xmm4\n\t"
53 "movups %4, %%xmm2\n\t"
54 "shufps $27, %%xmm4, %%xmm4\n\t"
55 "movaps %%xmm2, %%xmm0\n\t"
56 "shufps $27, %%xmm0, %%xmm0\n\t"
57 "addps %%xmm0, %%xmm1\n\t"
58 "movaps %%xmm1, %0\n\t"
59 "subps %%xmm2, %%xmm4\n\t"
60 "mulps %%xmm3, %%xmm4\n\t"
61 "movaps %%xmm4, %1\n\t"
62 :"=m"(*(b1
+ i
)), "=m"(*(b1
+ 0x1c - i
))
63 :"m"(*(costab
+ i
)), "m"(*(c
+ i
)), "m"(*(c
+ 0x1c - i
))

    /* Stage 2: within each 16-element half of b1, store sums in the lower
       eight lanes of b2 and raw differences in the upper eight; the costab
       multiply for the differences follows in the next asm statement. */
    for (i = 0; i < 0x20; i += 0x10)
    {
        __asm__(
74 "movaps %4, %%xmm1\n\t"
75 "movaps %5, %%xmm3\n\t"
76 "movaps %6, %%xmm4\n\t"
77 "movaps %7, %%xmm6\n\t"
78 "movaps %%xmm1, %%xmm7\n\t"
79 "shufps $27, %%xmm7, %%xmm7\n\t"
80 "movaps %%xmm3, %%xmm5\n\t"
81 "shufps $27, %%xmm5, %%xmm5\n\t"
82 "movaps %%xmm4, %%xmm2\n\t"
83 "shufps $27, %%xmm2, %%xmm2\n\t"
84 "movaps %%xmm6, %%xmm0\n\t"
85 "shufps $27, %%xmm0, %%xmm0\n\t"
86 "addps %%xmm0, %%xmm1\n\t"
87 "movaps %%xmm1, %0\n\t"
88 "addps %%xmm2, %%xmm3\n\t"
89 "movaps %%xmm3, %1\n\t"
90 "subps %%xmm4, %%xmm5\n\t"
91 "movaps %%xmm5, %2\n\t"
92 "subps %%xmm6, %%xmm7\n\t"
93 "movaps %%xmm7, %3\n\t"
94 :"=m"(*(b2
+ i
)), "=m"(*(b2
+ i
+ 4)), "=m"(*(b2
+ i
+ 8)), "=m"(*(b2
+ i
+ 12))
95 :"m"(*(b1
+ i
)), "m"(*(b1
+ i
+ 4)), "m"(*(b1
+ i
+ 8)), "m"(*(b1
+ i
+ 12))

    {
        real *costab = costab_mmx + 16;

        __asm__(
103 "movaps %4, %%xmm0\n\t"
104 "movaps %5, %%xmm1\n\t"
105 "movaps %8, %%xmm4\n\t"
106 "xorps %%xmm6, %%xmm6\n\t"
107 "shufps $27, %%xmm4, %%xmm4\n\t"
108 "mulps %%xmm4, %%xmm1\n\t"
109 "movaps %9, %%xmm2\n\t"
110 "xorps %%xmm7, %%xmm7\n\t"
111 "shufps $27, %%xmm2, %%xmm2\n\t"
112 "mulps %%xmm2, %%xmm0\n\t"
113 "movaps %%xmm0, %0\n\t"
114 "movaps %%xmm1, %1\n\t"
115 "movaps %6, %%xmm3\n\t"
116 "mulps %%xmm2, %%xmm3\n\t"
117 "subps %%xmm3, %%xmm6\n\t"
118 "movaps %%xmm6, %2\n\t"
119 "movaps %7, %%xmm5\n\t"
120 "mulps %%xmm4, %%xmm5\n\t"
121 "subps %%xmm5, %%xmm7\n\t"
122 "movaps %%xmm7, %3\n\t"
123 :"=m"(*(b2
+ 8)), "=m"(*(b2
+ 0xc)), "=m"(*(b2
+ 0x18)), "=m"(*(b2
+ 0x1c))
124 :"m"(*(b2
+ 8)), "m"(*(b2
+ 0xc)), "m"(*(b2
+ 0x18)), "m"(*(b2
+ 0x1c)), "m"(*costab
), "m"(*(costab
+ 4))

    {
        real *costab = costab_mmx + 24;

        /* The reversed costab in xmm0 and the sign mask in xmm5/xmm6 are set
           up here and consumed by the asm inside the loop below, so these
           registers must survive between the two asm statements. */
        __asm__(
133 "movaps %0, %%xmm0\n\t"
134 "shufps $27, %%xmm0, %%xmm0\n\t"
135 "movaps %1, %%xmm5\n\t"
136 "movaps %%xmm5, %%xmm6\n\t"
138 :"m"(*costab
), "m"(*nnnn
)

        for (i = 0; i < 0x20; i += 8)
        {
            __asm__(
144 "movaps %2, %%xmm2\n\t"
145 "movaps %3, %%xmm3\n\t"
146 "movaps %%xmm2, %%xmm4\n\t"
147 "xorps %%xmm5, %%xmm6\n\t"
148 "shufps $27, %%xmm4, %%xmm4\n\t"
149 "movaps %%xmm3, %%xmm1\n\t"
150 "shufps $27, %%xmm1, %%xmm1\n\t"
151 "addps %%xmm1, %%xmm2\n\t"
152 "movaps %%xmm2, %0\n\t"
153 "subps %%xmm3, %%xmm4\n\t"
154 "xorps %%xmm6, %%xmm4\n\t"
155 "mulps %%xmm0, %%xmm4\n\t"
156 "movaps %%xmm4, %1\n\t"
157 :"=m"(*(b1
+ i
)), "=m"(*(b1
+ i
+ 4))
158 :"m"(*(b2
+ i
)), "m"(*(b2
+ i
+ 4))
167 "movss %0, %%xmm1\n\t"
168 "movss %1, %%xmm0\n\t"
169 "movaps %%xmm1, %%xmm3\n\t"
170 "unpcklps %%xmm0, %%xmm3\n\t"
171 "movss %2, %%xmm2\n\t"
172 "movaps %%xmm1, %%xmm0\n\t"
173 "unpcklps %%xmm2, %%xmm0\n\t"
174 "unpcklps %%xmm3, %%xmm0\n\t"
175 "movaps %3, %%xmm2\n\t"
177 :"m"(one
), "m"(costab_mmx
[28]), "m"(costab_mmx
[29]), "m"(*ppnn
)

    for (i = 0; i < 0x20; i += 8)
    {
        __asm__(
183 "movaps %2, %%xmm3\n\t"
184 "movaps %%xmm3, %%xmm4\n\t"
185 "shufps $20, %%xmm4, %%xmm4\n\t"
186 "shufps $235, %%xmm3, %%xmm3\n\t"
187 "xorps %%xmm2, %%xmm3\n\t"
188 "addps %%xmm3, %%xmm4\n\t"
189 "mulps %%xmm0, %%xmm4\n\t"
190 "movaps %%xmm4, %0\n\t"
191 "movaps %3, %%xmm6\n\t"
192 "movaps %%xmm6, %%xmm5\n\t"
193 "shufps $27, %%xmm5, %%xmm5\n\t"
194 "xorps %%xmm2, %%xmm5\n\t"
195 "addps %%xmm5, %%xmm6\n\t"
196 "mulps %%xmm0, %%xmm6\n\t"
197 "movaps %%xmm6, %1\n\t"
198 :"=m"(*(b2
+ i
)), "=m"(*(b2
+ i
+ 4))
199 :"m"(*(b1
+ i
)), "m"(*(b1
+ i
+ 4))
207 "movss %0, %%xmm0\n\t"
208 "movaps %%xmm1, %%xmm2\n\t"
209 "movaps %%xmm0, %%xmm7\n\t"
210 "unpcklps %%xmm1, %%xmm2\n\t"
211 "unpcklps %%xmm0, %%xmm7\n\t"
212 "movaps %1, %%xmm0\n\t"
213 "unpcklps %%xmm7, %%xmm2\n\t"
215 :"m"(costab_mmx
[30]), "m"(*pnpn
)

    for (i = 0x8; i < 0x20; i += 8)
    {
        __asm__(
221 "movaps %2, %%xmm1\n\t"
222 "movaps %%xmm1, %%xmm3\n\t"
223 "shufps $224, %%xmm3, %%xmm3\n\t"
224 "shufps $181, %%xmm1, %%xmm1\n\t"
225 "xorps %%xmm0, %%xmm1\n\t"
226 "addps %%xmm1, %%xmm3\n\t"
227 "mulps %%xmm2, %%xmm3\n\t"
228 "movaps %%xmm3, %0\n\t"
229 "movaps %3, %%xmm4\n\t"
230 "movaps %%xmm4, %%xmm5\n\t"
231 "shufps $224, %%xmm5, %%xmm5\n\t"
232 "shufps $181, %%xmm4, %%xmm4\n\t"
233 "xorps %%xmm0, %%xmm4\n\t"
234 "addps %%xmm4, %%xmm5\n\t"
235 "mulps %%xmm2, %%xmm5\n\t"
236 "movaps %%xmm5, %1\n\t"
237 :"=m"(*(b1
+ i
)), "=m"(*(b1
+ i
+ 4))
238 :"m"(*(b2
+ i
)), "m"(*(b2
+ i
+ 4))

    for (i = 0x8; i < 0x20; i += 8)
    {
        b1[i + 2] += b1[i + 3];
        b1[i + 6] += b1[i + 7];
        b1[i + 4] += b1[i + 6];
        b1[i + 6] += b1[i + 5];
        b1[i + 5] += b1[i + 7];
    }

#if 0
    /* Reference C code */

    /*
       Should run faster than x87 asm, given that the compiler is sane.
       However, the C code doesn't round with saturation (0x7fff for a
       too-large positive float, 0x8000 for a too-small negative float).
       You can hear the difference if you listen carefully.
    */
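
    /* "Round with saturation" means clamping to the short range before the
       cast; a minimal sketch (the helper sat_short is ours and would live
       at file scope; rounding proper would additionally use lrintf):

       static short sat_short(real x)
       {
           if (x >= 32767.f) return 32767;
           if (x <= -32768.f) return -32768;
           return (short)x;
       }

       e.g. out0[256] = sat_short(b2[0] + b2[1]); */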

    out0[256] = (short)(b2[0] + b2[1]);
    out0[0] = (short)((b2[0] - b2[1]) * costab_mmx[30]);
    out1[128] = (short)((b2[3] - b2[2]) * costab_mmx[30]);
    out0[128] = (short)((b2[3] - b2[2]) * costab_mmx[30] + b2[3] + b2[2]);
    out1[192] = (short)((b2[7] - b2[6]) * costab_mmx[30]);
    out0[192] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + b2[4] + b2[5]);
    out0[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + (b2[4] - b2[5]) * costab_mmx[30]);
    out1[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + (b2[4] - b2[5]) * costab_mmx[30]);

    out0[224] = (short)(b1[8] + b1[12]);
    out0[160] = (short)(b1[12] + b1[10]);
    out0[96] = (short)(b1[10] + b1[14]);
    out0[32] = (short)(b1[14] + b1[9]);
    out1[32] = (short)(b1[9] + b1[13]);
    out1[96] = (short)(b1[13] + b1[11]);
    out1[224] = (short)b1[15];
    out1[160] = (short)(b1[15] + b1[11]);

    out0[240] = (short)(b1[24] + b1[28] + b1[16]);
    out0[208] = (short)(b1[24] + b1[28] + b1[20]);
    out0[176] = (short)(b1[28] + b1[26] + b1[20]);
    out0[144] = (short)(b1[28] + b1[26] + b1[18]);
    out0[112] = (short)(b1[26] + b1[30] + b1[18]);
    out0[80] = (short)(b1[26] + b1[30] + b1[22]);
    out0[48] = (short)(b1[30] + b1[25] + b1[22]);
    out0[16] = (short)(b1[30] + b1[25] + b1[17]);
    out1[16] = (short)(b1[25] + b1[29] + b1[17]);
    out1[48] = (short)(b1[25] + b1[29] + b1[21]);
    out1[80] = (short)(b1[29] + b1[27] + b1[21]);
    out1[112] = (short)(b1[29] + b1[27] + b1[19]);
    out1[144] = (short)(b1[27] + b1[31] + b1[19]);
    out1[176] = (short)(b1[27] + b1[31] + b1[23]);
    out1[240] = (short)(b1[31]);
    out1[208] = (short)(b1[31] + b1[23]);
#else
    /*
       To do saturation efficiently on x86 we can use fist(t)(p),
       pf2iw, or packssdw. We use fist(p) here.
    */
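
    /* For comparison, a sketch of the packssdw route mentioned above with
       SSE2 intrinsics, under assumptions (SSE2 available, which this file
       does not otherwise require; |value| < 2^31 so cvtps2dq does not hit
       its indefinite result; the helper name is ours and would live at
       file scope), kept disabled: */
#if 0
#include <emmintrin.h>

/* 8 floats -> 8 shorts: cvtps2dq rounds to int32 (per MXCSR rounding
   mode), then packssdw saturates each lane to [-32768, 32767]. */
static void float8_to_short8_sat(const float *src, short *dst)
{
    __m128i lo = _mm_cvtps_epi32(_mm_load_ps(src));
    __m128i hi = _mm_cvtps_epi32(_mm_load_ps(src + 4));
    _mm_storeu_si128((__m128i *)dst, _mm_packs_epi32(lo, hi));
}
#endif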
431 :"m"(costab_mmx
[30]), "r"(b1
), "r"(b2
), "r"(a
), "r"(b
)