Remove unused functions.
[mplayer/glamo.git] / mp3lib / dct64_sse.c
blobcfb3948ee2e4f342386b26da81d7139ed7f1a423
1 /*
2 * Discrete Cosine Transform (DCT) for SSE
3 * Copyright (c) 2006 Zuxy MENG <zuxy.meng@gmail.com>
4 * based upon code from mp3lib/dct64.c, mp3lib/dct64_altivec.c
5 * and mp3lib/dct64_mmx.c
6 */
8 #include "libavutil/internal.h"
9 #include "mpg123.h"
/* Cosine coefficient table; only declared here, the definition lives elsewhere. */
extern float __attribute__((aligned(16))) costab_mmx[];

/*
 * Single-precision sign-bit masks, one 32-bit word per SSE lane.
 *
 * Each table is 16-byte aligned because it is loaded with movaps, and is
 * then XORed (xorps) into a register to flip the sign of selected lanes.
 * The name spells the effect per lane in memory order:
 * 'p' leaves the lane unchanged, 'n' negates it.
 *
 * The words are unsigned and built from 1U << 31: shifting a signed 1 into
 * the sign bit of a 32-bit int is undefined behavior in C.  The tables are
 * only ever referenced as asm memory operands, so the element type does not
 * matter to their users; the bit patterns are unchanged.
 */
static const unsigned int ppnn[4] __attribute__((aligned(16))) =
{ 0, 0, 1U << 31, 1U << 31 };

static const unsigned int pnpn[4] __attribute__((aligned(16))) =
{ 0, 1U << 31, 0, 1U << 31 };

static const unsigned int nnnn[4] __attribute__((aligned(16))) =
{ 1U << 31, 1U << 31, 1U << 31, 1U << 31 };
22 void dct64_sse(short *out0,short *out1,real *c)
24 DECLARE_ALIGNED(16, real, b1[0x20]);
25 DECLARE_ALIGNED(16, real, b2[0x20]);
26 static real const one = 1.f;
29 real *costab = costab_mmx;
30 int i;
32 for (i = 0; i < 0x20 / 2; i += 4)
34 __asm__(
35 "movaps %2, %%xmm3\n\t"
36 "shufps $27, %%xmm3, %%xmm3\n\t"
37 "movaps %3, %%xmm1\n\t"
38 "movaps %%xmm1, %%xmm4\n\t"
39 "movaps %4, %%xmm2\n\t"
40 "shufps $27, %%xmm4, %%xmm4\n\t"
41 "movaps %%xmm2, %%xmm0\n\t"
42 "shufps $27, %%xmm0, %%xmm0\n\t"
43 "addps %%xmm0, %%xmm1\n\t"
44 "movaps %%xmm1, %0\n\t"
45 "subps %%xmm2, %%xmm4\n\t"
46 "mulps %%xmm3, %%xmm4\n\t"
47 "movaps %%xmm4, %1\n\t"
48 :"=m"(*(b1 + i)), "=m"(*(b1 + 0x1c - i))
49 :"m"(*(costab + i)), "m"(*(c + i)), "m"(*(c + 0x1c - i))
55 int i;
57 for (i = 0; i < 0x20; i += 0x10)
59 __asm__(
60 "movaps %4, %%xmm1\n\t"
61 "movaps %5, %%xmm3\n\t"
62 "movaps %6, %%xmm4\n\t"
63 "movaps %7, %%xmm6\n\t"
64 "movaps %%xmm1, %%xmm7\n\t"
65 "shufps $27, %%xmm7, %%xmm7\n\t"
66 "movaps %%xmm3, %%xmm5\n\t"
67 "shufps $27, %%xmm5, %%xmm5\n\t"
68 "movaps %%xmm4, %%xmm2\n\t"
69 "shufps $27, %%xmm2, %%xmm2\n\t"
70 "movaps %%xmm6, %%xmm0\n\t"
71 "shufps $27, %%xmm0, %%xmm0\n\t"
72 "addps %%xmm0, %%xmm1\n\t"
73 "movaps %%xmm1, %0\n\t"
74 "addps %%xmm2, %%xmm3\n\t"
75 "movaps %%xmm3, %1\n\t"
76 "subps %%xmm4, %%xmm5\n\t"
77 "movaps %%xmm5, %2\n\t"
78 "subps %%xmm6, %%xmm7\n\t"
79 "movaps %%xmm7, %3\n\t"
80 :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4)), "=m"(*(b2 + i + 8)), "=m"(*(b2 + i + 12))
81 :"m"(*(b1 + i)), "m"(*(b1 + i + 4)), "m"(*(b1 + i + 8)), "m"(*(b1 + i + 12))
87 real *costab = costab_mmx + 16;
88 __asm__(
89 "movaps %4, %%xmm0\n\t"
90 "movaps %5, %%xmm1\n\t"
91 "movaps %8, %%xmm4\n\t"
92 "xorps %%xmm6, %%xmm6\n\t"
93 "shufps $27, %%xmm4, %%xmm4\n\t"
94 "mulps %%xmm4, %%xmm1\n\t"
95 "movaps %9, %%xmm2\n\t"
96 "xorps %%xmm7, %%xmm7\n\t"
97 "shufps $27, %%xmm2, %%xmm2\n\t"
98 "mulps %%xmm2, %%xmm0\n\t"
99 "movaps %%xmm0, %0\n\t"
100 "movaps %%xmm1, %1\n\t"
101 "movaps %6, %%xmm3\n\t"
102 "mulps %%xmm2, %%xmm3\n\t"
103 "subps %%xmm3, %%xmm6\n\t"
104 "movaps %%xmm6, %2\n\t"
105 "movaps %7, %%xmm5\n\t"
106 "mulps %%xmm4, %%xmm5\n\t"
107 "subps %%xmm5, %%xmm7\n\t"
108 "movaps %%xmm7, %3\n\t"
109 :"=m"(*(b2 + 8)), "=m"(*(b2 + 0xc)), "=m"(*(b2 + 0x18)), "=m"(*(b2 + 0x1c))
110 :"m"(*(b2 + 8)), "m"(*(b2 + 0xc)), "m"(*(b2 + 0x18)), "m"(*(b2 + 0x1c)), "m"(*costab), "m"(*(costab + 4))
115 real *costab = costab_mmx + 24;
116 int i;
118 __asm__(
119 "movaps %0, %%xmm0\n\t"
120 "shufps $27, %%xmm0, %%xmm0\n\t"
121 "movaps %1, %%xmm5\n\t"
122 "movaps %%xmm5, %%xmm6\n\t"
124 :"m"(*costab), "m"(*nnnn)
127 for (i = 0; i < 0x20; i += 8)
129 __asm__(
130 "movaps %2, %%xmm2\n\t"
131 "movaps %3, %%xmm3\n\t"
132 "movaps %%xmm2, %%xmm4\n\t"
133 "xorps %%xmm5, %%xmm6\n\t"
134 "shufps $27, %%xmm4, %%xmm4\n\t"
135 "movaps %%xmm3, %%xmm1\n\t"
136 "shufps $27, %%xmm1, %%xmm1\n\t"
137 "addps %%xmm1, %%xmm2\n\t"
138 "movaps %%xmm2, %0\n\t"
139 "subps %%xmm3, %%xmm4\n\t"
140 "xorps %%xmm6, %%xmm4\n\t"
141 "mulps %%xmm0, %%xmm4\n\t"
142 "movaps %%xmm4, %1\n\t"
143 :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4))
144 :"m"(*(b2 + i)), "m"(*(b2 + i + 4))
150 int i;
152 __asm__(
153 "movss %0, %%xmm1\n\t"
154 "movss %1, %%xmm0\n\t"
155 "movaps %%xmm1, %%xmm3\n\t"
156 "unpcklps %%xmm0, %%xmm3\n\t"
157 "movss %2, %%xmm2\n\t"
158 "movaps %%xmm1, %%xmm0\n\t"
159 "unpcklps %%xmm2, %%xmm0\n\t"
160 "unpcklps %%xmm3, %%xmm0\n\t"
161 "movaps %3, %%xmm2\n\t"
163 :"m"(one), "m"(costab_mmx[28]), "m"(costab_mmx[29]), "m"(*ppnn)
166 for (i = 0; i < 0x20; i += 8)
168 __asm__(
169 "movaps %2, %%xmm3\n\t"
170 "movaps %%xmm3, %%xmm4\n\t"
171 "shufps $20, %%xmm4, %%xmm4\n\t"
172 "shufps $235, %%xmm3, %%xmm3\n\t"
173 "xorps %%xmm2, %%xmm3\n\t"
174 "addps %%xmm3, %%xmm4\n\t"
175 "mulps %%xmm0, %%xmm4\n\t"
176 "movaps %%xmm4, %0\n\t"
177 "movaps %3, %%xmm6\n\t"
178 "movaps %%xmm6, %%xmm5\n\t"
179 "shufps $27, %%xmm5, %%xmm5\n\t"
180 "xorps %%xmm2, %%xmm5\n\t"
181 "addps %%xmm5, %%xmm6\n\t"
182 "mulps %%xmm0, %%xmm6\n\t"
183 "movaps %%xmm6, %1\n\t"
184 :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4))
185 :"m"(*(b1 + i)), "m"(*(b1 + i + 4))
191 int i;
192 __asm__(
193 "movss %0, %%xmm0\n\t"
194 "movaps %%xmm1, %%xmm2\n\t"
195 "movaps %%xmm0, %%xmm7\n\t"
196 "unpcklps %%xmm1, %%xmm2\n\t"
197 "unpcklps %%xmm0, %%xmm7\n\t"
198 "movaps %1, %%xmm0\n\t"
199 "unpcklps %%xmm7, %%xmm2\n\t"
201 :"m"(costab_mmx[30]), "m"(*pnpn)
204 for (i = 0x8; i < 0x20; i += 8)
206 __asm__ volatile (
207 "movaps %2, %%xmm1\n\t"
208 "movaps %%xmm1, %%xmm3\n\t"
209 "shufps $224, %%xmm3, %%xmm3\n\t"
210 "shufps $181, %%xmm1, %%xmm1\n\t"
211 "xorps %%xmm0, %%xmm1\n\t"
212 "addps %%xmm1, %%xmm3\n\t"
213 "mulps %%xmm2, %%xmm3\n\t"
214 "movaps %%xmm3, %0\n\t"
215 "movaps %3, %%xmm4\n\t"
216 "movaps %%xmm4, %%xmm5\n\t"
217 "shufps $224, %%xmm5, %%xmm5\n\t"
218 "shufps $181, %%xmm4, %%xmm4\n\t"
219 "xorps %%xmm0, %%xmm4\n\t"
220 "addps %%xmm4, %%xmm5\n\t"
221 "mulps %%xmm2, %%xmm5\n\t"
222 "movaps %%xmm5, %1\n\t"
223 :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4))
224 :"m"(*(b2 + i)), "m"(*(b2 + i + 4))
225 :"memory"
228 for (i = 0x8; i < 0x20; i += 8)
230 b1[i + 2] += b1[i + 3];
231 b1[i + 6] += b1[i + 7];
232 b1[i + 4] += b1[i + 6];
233 b1[i + 6] += b1[i + 5];
234 b1[i + 5] += b1[i + 7];
238 #if 0
239 /* Reference C code */
242 Should run faster than x87 asm, given that the compiler is sane.
243 However, the C code doesn't round with saturation (0x7fff for too
244 large positive float, 0x8000 for too small negative float). You
245 can hear the difference if you listen carefully.
248 out0[256] = (short)(b2[0] + b2[1]);
249 out0[0] = (short)((b2[0] - b2[1]) * costab_mmx[30]);
250 out1[128] = (short)((b2[3] - b2[2]) * costab_mmx[30]);
251 out0[128] = (short)((b2[3] - b2[2]) * costab_mmx[30] + b2[3] + b2[2]);
252 out1[192] = (short)((b2[7] - b2[6]) * costab_mmx[30]);
253 out0[192] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + b2[4] + b2[5]);
254 out0[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + (b2[4] - b2[5]) * costab_mmx[30]);
255 out1[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + (b2[4] - b2[5]) * costab_mmx[30]);
257 out0[224] = (short)(b1[8] + b1[12]);
258 out0[160] = (short)(b1[12] + b1[10]);
259 out0[96] = (short)(b1[10] + b1[14]);
260 out0[32] = (short)(b1[14] + b1[9]);
261 out1[32] = (short)(b1[9] + b1[13]);
262 out1[96] = (short)(b1[13] + b1[11]);
263 out1[224] = (short)b1[15];
264 out1[160] = (short)(b1[15] + b1[11]);
265 out0[240] = (short)(b1[24] + b1[28] + b1[16]);
266 out0[208] = (short)(b1[24] + b1[28] + b1[20]);
267 out0[176] = (short)(b1[28] + b1[26] + b1[20]);
268 out0[144] = (short)(b1[28] + b1[26] + b1[18]);
269 out0[112] = (short)(b1[26] + b1[30] + b1[18]);
270 out0[80] = (short)(b1[26] + b1[30] + b1[22]);
271 out0[48] = (short)(b1[30] + b1[25] + b1[22]);
272 out0[16] = (short)(b1[30] + b1[25] + b1[17]);
273 out1[16] = (short)(b1[25] + b1[29] + b1[17]);
274 out1[48] = (short)(b1[25] + b1[29] + b1[21]);
275 out1[80] = (short)(b1[29] + b1[27] + b1[21]);
276 out1[112] = (short)(b1[29] + b1[27] + b1[19]);
277 out1[144] = (short)(b1[27] + b1[31] + b1[19]);
278 out1[176] = (short)(b1[27] + b1[31] + b1[23]);
279 out1[240] = (short)(b1[31]);
280 out1[208] = (short)(b1[31] + b1[23]);
282 #else
284 To do saturation efficiently in x86 we can use fist(t)(p),
285 pf2iw, or packssdw. We use fist(p) here.
287 __asm__(
288 "flds %0\n\t"
289 "flds (%2)\n\t"
290 "fadds 4(%2)\n\t"
291 "fistp 512(%3)\n\t"
293 "flds (%2)\n\t"
294 "fsubs 4(%2)\n\t"
295 "fmul %%st(1)\n\t"
296 "fistp (%3)\n\t"
298 "flds 12(%2)\n\t"
299 "fsubs 8(%2)\n\t"
300 "fmul %%st(1)\n\t"
301 "fist 256(%4)\n\t"
302 "fadds 12(%2)\n\t"
303 "fadds 8(%2)\n\t"
304 "fistp 256(%3)\n\t"
306 "flds 16(%2)\n\t"
307 "fsubs 20(%2)\n\t"
308 "fmul %%st(1)\n\t"
310 "flds 28(%2)\n\t"
311 "fsubs 24(%2)\n\t"
312 "fmul %%st(2)\n\t"
313 "fist 384(%4)\n\t"
314 "fld %%st(0)\n\t"
315 "fadds 24(%2)\n\t"
316 "fadds 28(%2)\n\t"
317 "fld %%st(0)\n\t"
318 "fadds 16(%2)\n\t"
319 "fadds 20(%2)\n\t"
320 "fistp 384(%3)\n\t"
321 "fadd %%st(2)\n\t"
322 "fistp 128(%3)\n\t"
323 "faddp %%st(1)\n\t"
324 "fistp 128(%4)\n\t"
326 "flds 32(%1)\n\t"
327 "fadds 48(%1)\n\t"
328 "fistp 448(%3)\n\t"
330 "flds 48(%1)\n\t"
331 "fadds 40(%1)\n\t"
332 "fistp 320(%3)\n\t"
334 "flds 40(%1)\n\t"
335 "fadds 56(%1)\n\t"
336 "fistp 192(%3)\n\t"
338 "flds 56(%1)\n\t"
339 "fadds 36(%1)\n\t"
340 "fistp 64(%3)\n\t"
342 "flds 36(%1)\n\t"
343 "fadds 52(%1)\n\t"
344 "fistp 64(%4)\n\t"
346 "flds 52(%1)\n\t"
347 "fadds 44(%1)\n\t"
348 "fistp 192(%4)\n\t"
350 "flds 60(%1)\n\t"
351 "fist 448(%4)\n\t"
352 "fadds 44(%1)\n\t"
353 "fistp 320(%4)\n\t"
355 "flds 96(%1)\n\t"
356 "fadds 112(%1)\n\t"
357 "fld %%st(0)\n\t"
358 "fadds 64(%1)\n\t"
359 "fistp 480(%3)\n\t"
360 "fadds 80(%1)\n\t"
361 "fistp 416(%3)\n\t"
363 "flds 112(%1)\n\t"
364 "fadds 104(%1)\n\t"
365 "fld %%st(0)\n\t"
366 "fadds 80(%1)\n\t"
367 "fistp 352(%3)\n\t"
368 "fadds 72(%1)\n\t"
369 "fistp 288(%3)\n\t"
371 "flds 104(%1)\n\t"
372 "fadds 120(%1)\n\t"
373 "fld %%st(0)\n\t"
374 "fadds 72(%1)\n\t"
375 "fistp 224(%3)\n\t"
376 "fadds 88(%1)\n\t"
377 "fistp 160(%3)\n\t"
379 "flds 120(%1)\n\t"
380 "fadds 100(%1)\n\t"
381 "fld %%st(0)\n\t"
382 "fadds 88(%1)\n\t"
383 "fistp 96(%3)\n\t"
384 "fadds 68(%1)\n\t"
385 "fistp 32(%3)\n\t"
387 "flds 100(%1)\n\t"
388 "fadds 116(%1)\n\t"
389 "fld %%st(0)\n\t"
390 "fadds 68(%1)\n\t"
391 "fistp 32(%4)\n\t"
392 "fadds 84(%1)\n\t"
393 "fistp 96(%4)\n\t"
395 "flds 116(%1)\n\t"
396 "fadds 108(%1)\n\t"
397 "fld %%st(0)\n\t"
398 "fadds 84(%1)\n\t"
399 "fistp 160(%4)\n\t"
400 "fadds 76(%1)\n\t"
401 "fistp 224(%4)\n\t"
403 "flds 108(%1)\n\t"
404 "fadds 124(%1)\n\t"
405 "fld %%st(0)\n\t"
406 "fadds 76(%1)\n\t"
407 "fistp 288(%4)\n\t"
408 "fadds 92(%1)\n\t"
409 "fistp 352(%4)\n\t"
411 "flds 124(%1)\n\t"
412 "fist 480(%4)\n\t"
413 "fadds 92(%1)\n\t"
414 "fistp 416(%4)\n\t"
415 ".byte 0xdf, 0xc0\n\t" // ffreep %%st(0)
417 :"m"(costab_mmx[30]), "r"(b1), "r"(b2), "r"(out0), "r"(out1)
418 :"memory"
420 #endif
421 out1[0] = out0[0];