Check if there are enough bytes before reading the buffer in the EA ADPCM
[ffmpeg-lucabe.git] / libavcodec / x86 / simple_idct_mmx.c
blob578674451aa83d5dee46f3bcf5db56ca0d501931
1 /*
2 * Simple IDCT MMX
4 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavcodec/dsputil.h"
23 #include "libavcodec/simple_idct.h"
// Exact (unrounded) values of cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7.
// The C0..C7 constants defined below are these values rounded to integers.
26 23170.475006
27 22725.260826
28 21406.727617
29 19265.545870
30 16384.000000
31 12872.826198
32 8866.956905
33 4520.335430
// Fixed-point cosine constants (scaled by 1<<14) used to build the coeffs[] table.
35 #define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
36 #define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
37 #define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
38 #define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
// Two candidate values for C4: the exact 16384 is disabled, so 16383 is the one in effect.
39 #if 0
40 #define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
41 #else
42 #define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
43 #endif
44 #define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
45 #define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46 #define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
// Right-shift amounts applied to the results of the row pass (ROW_SHIFT) and of the
// column pass (COL_SHIFT). The asm macro invocations pass the same values as the
// literals 11 and 20.
48 #define ROW_SHIFT 11
49 #define COL_SHIFT 20 // 6
// wm1010: mask that keeps the upper 16-bit word of each 32-bit half. DC_COND_IDCT and
// DC_COND_ROW_IDCT AND it with the first source quadword, so the words the asm comments
// label R0/r0 are left out of the "everything else is zero" test.
51 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
// d40000: value added (paddd) to mm0 in the DC-only path of DC_COND_IDCT, between the
// pslld $16 and the psrad $13. Only the low 32-bit half is non-zero.
52 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
// Constant table for the MMX IDCT: rows of four int16 values (8 bytes per row).
// The asm macros address it by byte offset through the %2 operand (the operand list is
// outside the lines shown here -- presumably %2 is this table; confirm):
//     0: rounder, used as "paddd (%2)"
//     8: rounder variant, used as "paddd 8(%2)" for the first row only
//    16: C4  C4      24: C4 -C4      32: C2  C6      40: C6 -C2
//    48: C1  C3      56: C5  C7      64: C3 -C7      72: -C1 -C5
//    80: C5 -C1      88: C7  C3      96: C7 -C5     104: C3 -C1
// Each coefficient pair appears twice per row, so a single pmaddwd produces the same
// two-term sum for both 32-bit halves of a register.
54 DECLARE_ALIGNED(8, static const int16_t, coeffs[])= {
55 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
56 // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
57 // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
58 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
59 // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
60 // 0, 0, 0, 0,
61 // 0, 0, 0, 0,
63 C4, C4, C4, C4,
64 C4, -C4, C4, -C4,
66 C2, C6, C2, C6,
67 C6, -C2, C6, -C2,
69 C1, C3, C1, C3,
70 C5, C7, C5, C7,
72 C3, -C7, C3, -C7,
73 -C1, -C5, -C1, -C5,
75 C5, -C1, C5, -C1,
76 C7, C3, C7, C3,
78 C7, -C5, C7, -C5,
79 C3, -C1, C3, -C1
// Everything from here to the matching #endif is compiled out.
82 #if 0
// Reads wm1010 and d40000, sums them and stores the result; judging by its name it
// presumably exists only so that those two constants count as referenced.
// NOTE(review): `temp` is not declared in the lines shown for this function -- confirm
// before re-enabling this code.
83 static void unused_var_killer(void)
85 int a= wm1010 + d40000;
86 temp[0]=a;
// Scalar 8-point inverse DCT along one column, i.e. the elements col[8*0] .. col[8*7].
//   col:   column to transform; receives the eight results (stride of 8 int16).
//   input: source coefficients, read in the interleaved order shown below.
// The arithmetic is done in int with the fixed-point constants C1..C7, rounded with
// 1<<(COL_SHIFT-1) and shifted right by COL_SHIFT before being stored back.
89 static void inline idctCol (int16_t * col, int16_t *input)
// Remove the C0..C7 macros so the same names can be declared as local constants.
91 #undef C0
92 #undef C1
93 #undef C2
94 #undef C3
95 #undef C4
96 #undef C5
97 #undef C6
98 #undef C7
99 int a0, a1, a2, a3, b0, b1, b2, b3;
// NOTE: C0 is declared but not referenced by the arithmetic below.
100 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
101 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104 const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 (one below the exact 16384)
105 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
106 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
107 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
// Shortcut for a column whose entries 1..7 are all zero: fill it with col[8*0]<<3.
// NOTE(review): this tests col[] before it has been loaded from input[] below, and this
// listing shows no closing brace for the block -- confirm whether the shortcut is
// actually enabled (it may be commented out in the real file).
109 if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
110 col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
111 col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
112 return;
// Gather the column from input[]: entries 0/2 come from input[8*0 + 0/1], entries 1/3
// from input[8*2 + 0/1], entries 4/6 from input[8*4 + 0/1], entries 5/7 from input[8*6 + 0/1].
115 col[8*0] = input[8*0 + 0];
116 col[8*1] = input[8*2 + 0];
117 col[8*2] = input[8*0 + 1];
118 col[8*3] = input[8*2 + 1];
119 col[8*4] = input[8*4 + 0];
120 col[8*5] = input[8*6 + 0];
121 col[8*6] = input[8*4 + 1];
122 col[8*7] = input[8*6 + 1];
// Even part: built from entries 0, 2, 4, 6, with the rounding term folded in.
124 a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
125 a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
126 a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
127 a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
// Odd part: built from entries 1, 3, 5, 7.
129 b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
130 b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
131 b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
132 b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
// Final butterfly: outputs k and 7-k are (a_k + b_k) and (a_k - b_k), scaled down by COL_SHIFT.
134 col[8*0] = (a0 + b0) >> COL_SHIFT;
135 col[8*1] = (a1 + b1) >> COL_SHIFT;
136 col[8*2] = (a2 + b2) >> COL_SHIFT;
137 col[8*3] = (a3 + b3) >> COL_SHIFT;
138 col[8*4] = (a3 - b3) >> COL_SHIFT;
139 col[8*5] = (a2 - b2) >> COL_SHIFT;
140 col[8*6] = (a1 - b1) >> COL_SHIFT;
141 col[8*7] = (a0 - b0) >> COL_SHIFT;
// Scalar 8-point inverse DCT for one row.
//   output: receives the eight results at the even indices output[0], output[2], .. output[14].
//   input:  source coefficients, read in the interleaved order shown below.
// The arithmetic is done in int with the fixed-point constants C1..C7, rounded with
// 1<<(ROW_SHIFT-1) and shifted right by ROW_SHIFT; intermediate results pass through the
// int16_t row[] scratch array.
144 static void inline idctRow (int16_t * output, int16_t * input)
146 int16_t row[8];
148 int a0, a1, a2, a3, b0, b1, b2, b3;
// NOTE: C0 is declared but not referenced by the arithmetic below.
149 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
150 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153 const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 (one below the exact 16384)
154 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
155 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
156 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
// Gather the row: even-indexed entries come from input[0], [1], [4], [5],
// odd-indexed entries from input[8], [9], [12], [13].
158 row[0] = input[0];
159 row[2] = input[1];
160 row[4] = input[4];
161 row[6] = input[5];
162 row[1] = input[8];
163 row[3] = input[9];
164 row[5] = input[12];
165 row[7] = input[13];
// DC-only shortcut: when entries 1..7 are all zero, every output becomes row[0]<<3.
// This is approximately (C4*row[0]) >> ROW_SHIFT, since C4 is about 1<<14 and ROW_SHIFT
// is 11; no rounding term is added on this path.
167 if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
168 row[0] = row[1] = row[2] = row[3] = row[4] =
169 row[5] = row[6] = row[7] = row[0]<<3;
170 output[0] = row[0];
171 output[2] = row[1];
172 output[4] = row[2];
173 output[6] = row[3];
174 output[8] = row[4];
175 output[10] = row[5];
176 output[12] = row[6];
177 output[14] = row[7];
178 return;
// Even part: built from entries 0, 2, 4, 6, with the rounding term folded in.
181 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
182 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
183 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
184 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
// Odd part: built from entries 1, 3, 5, 7.
186 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
187 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
188 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
189 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
// Final butterfly: results k and 7-k are (a_k + b_k) and (a_k - b_k), scaled down by ROW_SHIFT.
191 row[0] = (a0 + b0) >> ROW_SHIFT;
192 row[1] = (a1 + b1) >> ROW_SHIFT;
193 row[2] = (a2 + b2) >> ROW_SHIFT;
194 row[3] = (a3 + b3) >> ROW_SHIFT;
195 row[4] = (a3 - b3) >> ROW_SHIFT;
196 row[5] = (a2 - b2) >> ROW_SHIFT;
197 row[6] = (a1 - b1) >> ROW_SHIFT;
198 row[7] = (a0 - b0) >> ROW_SHIFT;
// Scatter the eight results to every second element of output[].
200 output[0] = row[0];
201 output[2] = row[1];
202 output[4] = row[2];
203 output[6] = row[3];
204 output[8] = row[4];
205 output[10] = row[5];
206 output[12] = row[6];
207 output[14] = row[7];
209 #endif
211 static inline void idct(int16_t *block)
213 DECLARE_ALIGNED(8, int64_t, align_tmp[16]);
214 int16_t * const temp= (int16_t*)align_tmp;
216 __asm__ volatile(
217 #if 0 //Alternative, simpler variant
219 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
220 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
221 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
222 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
223 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
224 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
225 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
226 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
227 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
228 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
229 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
230 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
231 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
232 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
233 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
234 #rounder ", %%mm4 \n\t"\
235 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
236 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
237 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
238 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
239 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
240 #rounder ", %%mm0 \n\t"\
241 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
242 "paddd %%mm0, %%mm0 \n\t" \
243 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
244 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
245 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
246 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
247 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
248 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
249 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
250 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
251 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
252 "psrad $" #shift ", %%mm7 \n\t"\
253 "psrad $" #shift ", %%mm4 \n\t"\
254 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
255 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
256 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
257 "psrad $" #shift ", %%mm1 \n\t"\
258 "psrad $" #shift ", %%mm2 \n\t"\
259 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
260 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
261 "movq %%mm7, " #dst " \n\t"\
262 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
263 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
264 "movq %%mm2, 24+" #dst " \n\t"\
265 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
266 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
267 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
268 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
269 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
270 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
271 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
272 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
273 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
274 "psrad $" #shift ", %%mm2 \n\t"\
275 "psrad $" #shift ", %%mm0 \n\t"\
276 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
277 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
278 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
279 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
280 "psrad $" #shift ", %%mm6 \n\t"\
281 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
282 "movq %%mm2, 8+" #dst " \n\t"\
283 "psrad $" #shift ", %%mm4 \n\t"\
284 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
285 "movq %%mm4, 16+" #dst " \n\t"\
287 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
288 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
289 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
290 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
291 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
292 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
293 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
294 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
295 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
296 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
297 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
298 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
299 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
300 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
301 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
302 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
303 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
304 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
305 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
306 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
307 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
308 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
309 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
310 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
311 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
312 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
313 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
314 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
315 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
316 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
317 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
318 "psrad $" #shift ", %%mm7 \n\t"\
319 "psrad $" #shift ", %%mm4 \n\t"\
320 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
321 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
322 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
323 "psrad $" #shift ", %%mm0 \n\t"\
324 "psrad $" #shift ", %%mm2 \n\t"\
325 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
326 "movd %%mm7, " #dst " \n\t"\
327 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
328 "movd %%mm0, 16+" #dst " \n\t"\
329 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
330 "movd %%mm2, 96+" #dst " \n\t"\
331 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
332 "movd %%mm4, 112+" #dst " \n\t"\
333 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
334 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
335 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
336 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
337 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
338 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
339 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
340 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
341 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
342 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
343 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
344 "psrad $" #shift ", %%mm2 \n\t"\
345 "psrad $" #shift ", %%mm5 \n\t"\
346 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
347 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
348 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
349 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
350 "psrad $" #shift ", %%mm6 \n\t"\
351 "psrad $" #shift ", %%mm4 \n\t"\
352 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
353 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
354 "movd %%mm2, 32+" #dst " \n\t"\
355 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
356 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
357 "movd %%mm6, 48+" #dst " \n\t"\
358 "movd %%mm4, 64+" #dst " \n\t"\
359 "movd %%mm5, 80+" #dst " \n\t"\
362 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
363 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
364 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
365 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
366 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
367 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
368 "pand %%mm0, %%mm4 \n\t"\
369 "por %%mm1, %%mm4 \n\t"\
370 "por %%mm2, %%mm4 \n\t"\
371 "por %%mm3, %%mm4 \n\t"\
372 "packssdw %%mm4,%%mm4 \n\t"\
373 "movd %%mm4, %%eax \n\t"\
374 "orl %%eax, %%eax \n\t"\
375 "jz 1f \n\t"\
376 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
377 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
378 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
379 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
380 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
381 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
382 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
383 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
384 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
385 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
386 #rounder ", %%mm4 \n\t"\
387 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
388 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
389 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
390 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
391 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
392 #rounder ", %%mm0 \n\t"\
393 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
394 "paddd %%mm0, %%mm0 \n\t" \
395 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
396 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
397 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
398 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
399 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
400 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
401 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
402 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
403 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
404 "psrad $" #shift ", %%mm7 \n\t"\
405 "psrad $" #shift ", %%mm4 \n\t"\
406 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
407 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
408 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
409 "psrad $" #shift ", %%mm1 \n\t"\
410 "psrad $" #shift ", %%mm2 \n\t"\
411 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
412 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
413 "movq %%mm7, " #dst " \n\t"\
414 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
415 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
416 "movq %%mm2, 24+" #dst " \n\t"\
417 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
418 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
419 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
420 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
421 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
422 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
423 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
424 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
425 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
426 "psrad $" #shift ", %%mm2 \n\t"\
427 "psrad $" #shift ", %%mm0 \n\t"\
428 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
429 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
430 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
431 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
432 "psrad $" #shift ", %%mm6 \n\t"\
433 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
434 "movq %%mm2, 8+" #dst " \n\t"\
435 "psrad $" #shift ", %%mm4 \n\t"\
436 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
437 "movq %%mm4, 16+" #dst " \n\t"\
438 "jmp 2f \n\t"\
439 "1: \n\t"\
440 "pslld $16, %%mm0 \n\t"\
441 "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
442 "psrad $13, %%mm0 \n\t"\
443 "packssdw %%mm0, %%mm0 \n\t"\
444 "movq %%mm0, " #dst " \n\t"\
445 "movq %%mm0, 8+" #dst " \n\t"\
446 "movq %%mm0, 16+" #dst " \n\t"\
447 "movq %%mm0, 24+" #dst " \n\t"\
448 "2: \n\t"
451 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
452 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
453 /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
454 ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
455 ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
457 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
458 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
459 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
462 //IDCT( src0, src4, src1, src5, dst, shift)
463 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
464 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
465 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
466 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
468 #else
470 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
471 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
472 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
473 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
474 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
475 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
476 "pand %%mm0, %%mm4 \n\t"\
477 "por %%mm1, %%mm4 \n\t"\
478 "por %%mm2, %%mm4 \n\t"\
479 "por %%mm3, %%mm4 \n\t"\
480 "packssdw %%mm4,%%mm4 \n\t"\
481 "movd %%mm4, %%eax \n\t"\
482 "orl %%eax, %%eax \n\t"\
483 "jz 1f \n\t"\
484 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
485 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
486 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
487 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
488 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
489 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
490 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
491 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
492 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
493 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
494 #rounder ", %%mm4 \n\t"\
495 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
496 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
497 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
498 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
499 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
500 #rounder ", %%mm0 \n\t"\
501 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
502 "paddd %%mm0, %%mm0 \n\t" \
503 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
504 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
505 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
506 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
507 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
508 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
509 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
510 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
511 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
512 "psrad $" #shift ", %%mm7 \n\t"\
513 "psrad $" #shift ", %%mm4 \n\t"\
514 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
515 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
516 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
517 "psrad $" #shift ", %%mm1 \n\t"\
518 "psrad $" #shift ", %%mm2 \n\t"\
519 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
520 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
521 "movq %%mm7, " #dst " \n\t"\
522 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
523 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
524 "movq %%mm2, 24+" #dst " \n\t"\
525 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
526 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
527 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
528 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
529 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
530 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
531 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
532 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
533 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
534 "psrad $" #shift ", %%mm2 \n\t"\
535 "psrad $" #shift ", %%mm0 \n\t"\
536 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
537 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
538 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
539 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
540 "psrad $" #shift ", %%mm6 \n\t"\
541 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
542 "movq %%mm2, 8+" #dst " \n\t"\
543 "psrad $" #shift ", %%mm4 \n\t"\
544 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
545 "movq %%mm4, 16+" #dst " \n\t"\
546 "jmp 2f \n\t"\
547 "1: \n\t"\
548 "pslld $16, %%mm0 \n\t"\
549 "paddd "MANGLE(d40000)", %%mm0 \n\t"\
550 "psrad $13, %%mm0 \n\t"\
551 "packssdw %%mm0, %%mm0 \n\t"\
552 "movq %%mm0, " #dst " \n\t"\
553 "movq %%mm0, 8+" #dst " \n\t"\
554 "movq %%mm0, 16+" #dst " \n\t"\
555 "movq %%mm0, 24+" #dst " \n\t"\
556 "2: \n\t"
558 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
559 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
560 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
561 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
562 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
563 "movq %%mm0, %%mm4 \n\t"\
564 "por %%mm1, %%mm4 \n\t"\
565 "por %%mm2, %%mm4 \n\t"\
566 "por %%mm3, %%mm4 \n\t"\
567 "packssdw %%mm4,%%mm4 \n\t"\
568 "movd %%mm4, %%eax \n\t"\
569 "orl %%eax, %%eax \n\t"\
570 "jz " #bt " \n\t"\
571 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
572 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
573 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
574 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
575 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
576 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
577 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
578 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
579 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
580 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
581 #rounder ", %%mm4 \n\t"\
582 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
583 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
584 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
585 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
586 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
587 #rounder ", %%mm0 \n\t"\
588 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
589 "paddd %%mm0, %%mm0 \n\t" \
590 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
591 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
592 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
593 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
594 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
595 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
596 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
597 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
598 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
599 "psrad $" #shift ", %%mm7 \n\t"\
600 "psrad $" #shift ", %%mm4 \n\t"\
601 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
602 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
603 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
604 "psrad $" #shift ", %%mm1 \n\t"\
605 "psrad $" #shift ", %%mm2 \n\t"\
606 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
607 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
608 "movq %%mm7, " #dst " \n\t"\
609 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
610 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
611 "movq %%mm2, 24+" #dst " \n\t"\
612 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
613 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
614 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
615 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
616 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
617 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
618 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
619 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
620 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
621 "psrad $" #shift ", %%mm2 \n\t"\
622 "psrad $" #shift ", %%mm0 \n\t"\
623 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
624 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
625 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
626 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
627 "psrad $" #shift ", %%mm6 \n\t"\
628 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
629 "movq %%mm2, 8+" #dst " \n\t"\
630 "psrad $" #shift ", %%mm4 \n\t"\
631 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
632 "movq %%mm4, 16+" #dst " \n\t"\
634 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
635 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
636 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
637 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
638 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
639 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
640 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
641 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
642 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
643 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
644 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
645 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
646 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
647 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
648 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
649 #rounder ", %%mm4 \n\t"\
650 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
651 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
652 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
653 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
654 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
655 #rounder ", %%mm0 \n\t"\
656 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
657 "paddd %%mm0, %%mm0 \n\t" \
658 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
659 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
660 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
661 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
662 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
663 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
664 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
665 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
666 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
667 "psrad $" #shift ", %%mm7 \n\t"\
668 "psrad $" #shift ", %%mm4 \n\t"\
669 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
670 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
671 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
672 "psrad $" #shift ", %%mm1 \n\t"\
673 "psrad $" #shift ", %%mm2 \n\t"\
674 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
675 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
676 "movq %%mm7, " #dst " \n\t"\
677 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
678 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
679 "movq %%mm2, 24+" #dst " \n\t"\
680 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
681 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
682 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
683 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
684 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
685 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
686 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
687 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
688 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
689 "psrad $" #shift ", %%mm2 \n\t"\
690 "psrad $" #shift ", %%mm0 \n\t"\
691 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
692 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
693 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
694 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
695 "psrad $" #shift ", %%mm6 \n\t"\
696 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
697 "movq %%mm2, 8+" #dst " \n\t"\
698 "psrad $" #shift ", %%mm4 \n\t"\
699 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
700 "movq %%mm4, 16+" #dst " \n\t"\
702 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
703 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
704 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
705 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
706 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
708 #undef IDCT
709 #define IDCT(src0, src4, src1, src5, dst, shift) \
710 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
711 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
712 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
713 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
714 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
715 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
716 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
717 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
718 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
719 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
720 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
721 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
722 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
723 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
724 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
725 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
726 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
727 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
728 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
729 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
730 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
731 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
732 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
733 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
734 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
735 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
736 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
737 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
738 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
739 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
740 "psrad $" #shift ", %%mm7 \n\t"\
741 "psrad $" #shift ", %%mm4 \n\t"\
742 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
743 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
744 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
745 "psrad $" #shift ", %%mm0 \n\t"\
746 "psrad $" #shift ", %%mm2 \n\t"\
747 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
748 "movd %%mm7, " #dst " \n\t"\
749 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
750 "movd %%mm0, 16+" #dst " \n\t"\
751 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
752 "movd %%mm2, 96+" #dst " \n\t"\
753 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
754 "movd %%mm4, 112+" #dst " \n\t"\
755 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
756 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
757 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
758 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
759 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
760 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
761 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
762 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
763 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
764 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
765 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
766 "psrad $" #shift ", %%mm2 \n\t"\
767 "psrad $" #shift ", %%mm5 \n\t"\
768 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
769 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
770 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
771 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
772 "psrad $" #shift ", %%mm6 \n\t"\
773 "psrad $" #shift ", %%mm4 \n\t"\
774 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
775 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
776 "movd %%mm2, 32+" #dst " \n\t"\
777 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
778 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
779 "movd %%mm6, 48+" #dst " \n\t"\
780 "movd %%mm4, 64+" #dst " \n\t"\
781 "movd %%mm5, 80+" #dst " \n\t"
784 //IDCT( src0, src4, src1, src5, dst, shift)
785 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
786 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
787 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
788 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
789 "jmp 9f \n\t"
791 "#" ASMALIGN(4) \
792 "4: \n\t"
793 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
794 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
/*
 * IDCT variant used on the "4:" path of the column pass.
 * The src1 argument is accepted but never loaded: the even part A0..A3 is
 * built from src0 and src4, and the odd part B0..B3 from src5 alone.
 * NOTE(review): presumably this path is reached when the data src1 would
 * supply is all zero; the Z_COND_IDCT tests that select it are defined
 * outside this excerpt -- confirm there.
 * Each expansion packs two 16-bit results per output row and stores them with
 * movd at dst, 16+dst, ..., 112+dst. The four expansions below read from
 * temp (%1) in 8-byte steps and write to block (%0) in 4-byte steps; %2 is
 * the coefficient table.
 */
796 #undef IDCT
797 #define IDCT(src0, src4, src1, src5, dst, shift) \
798 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
799 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
800 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
801 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
802 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
803 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
804 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
805 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
806 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
807 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
808 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
809 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
810 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
811 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
812 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
813 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
814 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
815 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
816 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 (= B0 b0) */\
817 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
818 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 (= B1 b1) */\
819 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
820 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
821 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
822 "psrad $" #shift ", %%mm1 \n\t"\
823 "psrad $" #shift ", %%mm4 \n\t"\
824 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
825 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
826 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
827 "psrad $" #shift ", %%mm0 \n\t"\
828 "psrad $" #shift ", %%mm2 \n\t"\
829 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
830 "movd %%mm1, " #dst " \n\t"\
831 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
832 "movd %%mm0, 16+" #dst " \n\t"\
833 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
834 "movd %%mm2, 96+" #dst " \n\t"\
835 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
836 "movd %%mm4, 112+" #dst " \n\t"\
837 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
838 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 (= B2 b2) */\
839 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
840 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 (= B3 b3) */\
841 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
842 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
843 "psrad $" #shift ", %%mm2 \n\t"\
844 "psrad $" #shift ", %%mm5 \n\t"\
845 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
846 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
847 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
848 "psrad $" #shift ", %%mm6 \n\t"\
849 "psrad $" #shift ", %%mm1 \n\t"\
850 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
851 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
852 "movd %%mm2, 32+" #dst " \n\t"\
853 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
854 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
855 "movd %%mm6, 48+" #dst " \n\t"\
856 "movd %%mm1, 64+" #dst " \n\t"\
857 "movd %%mm5, 80+" #dst " \n\t"
859 //IDCT( src0, src4, src1, src5, dst, shift)
860 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
861 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
862 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
863 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
864 "jmp 9f \n\t"
866 "#" ASMALIGN(4) \
867 "6: \n\t"
868 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
/*
 * IDCT variant used on the "6:" path of the column pass.
 * Only src0 and src5 are loaded; src4 and src1 are accepted but unused.
 * With no R6/R2 contribution the even part collapses to A0 = A3 and A1 = A2,
 * and the odd part B0..B3 comes from src5 alone.
 * NOTE(review): presumably selected when the data behind src4 and src1 is all
 * zero; the Z_COND_IDCT tests are defined outside this excerpt -- confirm.
 * Each expansion stores two 16-bit results per output row with movd at
 * dst, 16+dst, ..., 112+dst.
 */
870 #undef IDCT
871 #define IDCT(src0, src4, src1, src5, dst, shift) \
872 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
873 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
874 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
875 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
876 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
877 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
878 "movq %%mm4, %%mm6 \n\t" /* A3 a3 = A0 a0 (no R6/R2 terms) */\
879 "movq %%mm0, %%mm5 \n\t" /* A2 a2 = A1 a1 (no R6/R2 terms) */\
880 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
881 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 (= B0 b0) */\
882 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
883 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 (= B1 b1) */\
884 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
885 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
886 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
887 "psrad $" #shift ", %%mm1 \n\t"\
888 "psrad $" #shift ", %%mm4 \n\t"\
889 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
890 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
891 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
892 "psrad $" #shift ", %%mm0 \n\t"\
893 "psrad $" #shift ", %%mm2 \n\t"\
894 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
895 "movd %%mm1, " #dst " \n\t"\
896 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
897 "movd %%mm0, 16+" #dst " \n\t"\
898 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
899 "movd %%mm2, 96+" #dst " \n\t"\
900 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
901 "movd %%mm4, 112+" #dst " \n\t"\
902 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
903 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 (= B2 b2) */\
904 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
905 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 (= B3 b3) */\
906 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
907 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
908 "psrad $" #shift ", %%mm2 \n\t"\
909 "psrad $" #shift ", %%mm5 \n\t"\
910 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
911 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
912 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
913 "psrad $" #shift ", %%mm6 \n\t"\
914 "psrad $" #shift ", %%mm1 \n\t"\
915 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
916 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
917 "movd %%mm2, 32+" #dst " \n\t"\
918 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
919 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
920 "movd %%mm6, 48+" #dst " \n\t"\
921 "movd %%mm1, 64+" #dst " \n\t"\
922 "movd %%mm5, 80+" #dst " \n\t"
925 //IDCT( src0, src4, src1, src5, dst, shift)
926 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
927 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
928 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
929 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
930 "jmp 9f \n\t"
932 "#" ASMALIGN(4) \
933 "2: \n\t"
934 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
/*
 * IDCT variant used on the "2:" path of the column pass.
 * src0, src1 and src5 are loaded; src4 is accepted but unused, so the even
 * part collapses to A0 = A3 and A1 = A2 while the odd part B0..B3 combines
 * the src1 and src5 products.
 * NOTE(review): presumably selected when the data behind src4 is all zero;
 * the Z_COND_IDCT tests are defined outside this excerpt -- confirm.
 * src1 is read a second time half-way through because mm2 and mm0 have been
 * reused for results by then. Each expansion stores two 16-bit results per
 * output row with movd at dst, 16+dst, ..., 112+dst.
 */
936 #undef IDCT
937 #define IDCT(src0, src4, src1, src5, dst, shift) \
938 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
939 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
940 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
941 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
942 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
943 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
944 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
945 "movq %%mm4, %%mm6 \n\t" /* A3 a3 = A0 a0 (no R6/R2 terms) */\
946 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
947 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
948 "movq %%mm0, %%mm5 \n\t" /* A2 a2 = A1 a1 (no R6/R2 terms) */\
949 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
950 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
951 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
952 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
953 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
954 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
955 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
956 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
957 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
958 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
959 "psrad $" #shift ", %%mm7 \n\t"\
960 "psrad $" #shift ", %%mm4 \n\t"\
961 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
962 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
963 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
964 "psrad $" #shift ", %%mm0 \n\t"\
965 "psrad $" #shift ", %%mm2 \n\t"\
966 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
967 "movd %%mm7, " #dst " \n\t"\
968 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
969 "movd %%mm0, 16+" #dst " \n\t"\
970 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
971 "movd %%mm2, 96+" #dst " \n\t"\
972 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
973 "movd %%mm4, 112+" #dst " \n\t"\
974 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 (reloaded) */\
975 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
976 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
977 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
978 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
979 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
980 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
981 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
982 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
983 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
984 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
985 "psrad $" #shift ", %%mm2 \n\t"\
986 "psrad $" #shift ", %%mm5 \n\t"\
987 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
988 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
989 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
990 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
991 "psrad $" #shift ", %%mm6 \n\t"\
992 "psrad $" #shift ", %%mm4 \n\t"\
993 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
994 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
995 "movd %%mm2, 32+" #dst " \n\t"\
996 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
997 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
998 "movd %%mm6, 48+" #dst " \n\t"\
999 "movd %%mm4, 64+" #dst " \n\t"\
1000 "movd %%mm5, 80+" #dst " \n\t"
1002 //IDCT( src0, src4, src1, src5, dst, shift)
1003 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1004 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1005 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1006 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1007 "jmp 9f \n\t"
1009 "#" ASMALIGN(4) \
1010 "3: \n\t"
/*
 * IDCT variant used on the "3:" path of the column pass.
 * Only src0 and src1 are loaded; src4 and src5 are accepted but unused.
 * The even part collapses to A0 = A3 and A1 = A2, and the odd part B0..B3 is
 * built from src1 alone.
 * NOTE(review): presumably selected when the data behind src4 and src5 is all
 * zero; the Z_COND_IDCT tests are defined outside this excerpt -- confirm.
 * Each expansion stores two 16-bit results per output row with movd at
 * dst, 16+dst, ..., 112+dst.
 */
1011 #undef IDCT
1012 #define IDCT(src0, src4, src1, src5, dst, shift) \
1013 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1014 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1015 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1016 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1017 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1018 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1019 "movq %%mm4, %%mm6 \n\t" /* A3 a3 = A0 a0 (no R6/R2 terms) */\
1020 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1021 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 (= B0 b0) */\
1022 "movq %%mm0, %%mm5 \n\t" /* A2 a2 = A1 a1 (no R6/R2 terms) */\
1023 "movq 64(%2), %%mm3 \n\t" /* -C7 C3 -C7 C3 */\
1024 "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 (= B1 b1) */\
1025 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1026 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1027 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1028 "psrad $" #shift ", %%mm7 \n\t"\
1029 "psrad $" #shift ", %%mm4 \n\t"\
1030 "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
1031 "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1032 "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
1033 "psrad $" #shift ", %%mm0 \n\t"\
1034 "psrad $" #shift ", %%mm1 \n\t"\
1035 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1036 "movd %%mm7, " #dst " \n\t"\
1037 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1038 "movd %%mm0, 16+" #dst " \n\t"\
1039 "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
1040 "movd %%mm1, 96+" #dst " \n\t"\
1041 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1042 "movd %%mm4, 112+" #dst " \n\t"\
1043 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1044 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 (= B2 b2) */\
1045 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 (= B3 b3) */\
1046 "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
1047 "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
1048 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1049 "psrad $" #shift ", %%mm1 \n\t"\
1050 "psrad $" #shift ", %%mm5 \n\t"\
1051 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1052 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1053 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1054 "psrad $" #shift ", %%mm6 \n\t"\
1055 "psrad $" #shift ", %%mm4 \n\t"\
1056 "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
1057 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1058 "movd %%mm1, 32+" #dst " \n\t"\
1059 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1060 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1061 "movd %%mm6, 48+" #dst " \n\t"\
1062 "movd %%mm4, 64+" #dst " \n\t"\
1063 "movd %%mm5, 80+" #dst " \n\t"
1066 //IDCT( src0, src4, src1, src5, dst, shift)
1067 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1068 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1069 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1070 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1071 "jmp 9f \n\t"
1073 "#" ASMALIGN(4) \
1074 "5: \n\t"
/*
 * IDCT variant used on the "5:" path of the column pass.
 * Only the even part is computed: src0/src4 and the adjacent quads at
 * 8+src0/8+src4 are loaded, while src1 and src5 are accepted but unused.
 * With no B terms the output is symmetric, so each packed result is stored
 * twice (A0 at dst and 112+dst, A1 at 16+ and 96+, A2 at 32+ and 80+,
 * A3 at 48+ and 64+).
 * NOTE(review): presumably selected when the data behind src1 and src5 is all
 * zero; the Z_COND_IDCT tests are defined outside this excerpt -- confirm.
 * Each expansion handles two source quads and writes four 16-bit results per
 * row with movq, which is why only two of the four invocations below are
 * active (dst 0(%0) and 8(%0)).
 */
1075 #undef IDCT
1076 #define IDCT(src0, src4, src1, src5, dst, shift) \
1077 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1078 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1079 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1080 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1081 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1082 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1083 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1084 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1085 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1086 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1087 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1088 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1089 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1090 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1091 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1092 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1093 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 (next quad) */\
1094 "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 (next quad) */\
1095 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1096 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1097 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1098 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1099 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
1100 "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1101 "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1102 "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
1103 "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
1104 "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
1105 "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
1106 "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
1107 "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
1108 "psrad $" #shift ", %%mm4 \n\t"\
1109 "psrad $" #shift ", %%mm7 \n\t"\
1110 "psrad $" #shift ", %%mm3 \n\t"\
1111 "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 (both quads) */\
1112 "movq %%mm4, " #dst " \n\t"\
1113 "psrad $" #shift ", %%mm0 \n\t"\
1114 "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 (both quads) */\
1115 "movq %%mm0, 16+" #dst " \n\t"\
1116 "movq %%mm0, 96+" #dst " \n\t"\
1117 "movq %%mm4, 112+" #dst " \n\t"\
1118 "psrad $" #shift ", %%mm5 \n\t"\
1119 "psrad $" #shift ", %%mm6 \n\t"\
1120 "psrad $" #shift ", %%mm2 \n\t"\
1121 "packssdw %%mm2, %%mm5 \n\t" /* A2 a2 (both quads; no B term here) */\
1122 "movq %%mm5, 32+" #dst " \n\t"\
1123 "psrad $" #shift ", %%mm1 \n\t"\
1124 "packssdw %%mm1, %%mm6 \n\t" /* A3 a3 (both quads; no B term here) */\
1125 "movq %%mm6, 48+" #dst " \n\t"\
1126 "movq %%mm6, 64+" #dst " \n\t"\
1127 "movq %%mm5, 80+" #dst " \n\t"
1130 //IDCT( src0, src4, src1, src5, dst, shift)
1131 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1132 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1133 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1134 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1135 "jmp 9f \n\t"
1138 "#" ASMALIGN(4) \
1139 "1: \n\t"
/*
 * IDCT variant used on the "1:" path of the column pass.
 * src0, src4 and src1 are loaded; src5 is accepted but unused, so the even
 * part A0..A3 is complete while the odd part B0..B3 is built from src1 alone.
 * NOTE(review): presumably selected when the data behind src5 is all zero;
 * the conditional row-pass macros that branch here are defined outside this
 * excerpt -- confirm.
 * Each expansion stores two 16-bit results per output row with movd at
 * dst, 16+dst, ..., 112+dst.
 */
1140 #undef IDCT
1141 #define IDCT(src0, src4, src1, src5, dst, shift) \
1142 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1143 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1144 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1145 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1146 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1147 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1148 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1149 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1150 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1151 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1152 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1153 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1154 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1155 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 (= B0 b0) */\
1156 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1157 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1158 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1159 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1160 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1161 "movq 64(%2), %%mm1 \n\t" /* -C7 C3 -C7 C3 */\
1162 "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 (= B1 b1) */\
1163 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1164 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1165 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1166 "psrad $" #shift ", %%mm7 \n\t"\
1167 "psrad $" #shift ", %%mm4 \n\t"\
1168 "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1169 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1170 "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1171 "psrad $" #shift ", %%mm0 \n\t"\
1172 "psrad $" #shift ", %%mm3 \n\t"\
1173 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1174 "movd %%mm7, " #dst " \n\t"\
1175 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1176 "movd %%mm0, 16+" #dst " \n\t"\
1177 "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1178 "movd %%mm3, 96+" #dst " \n\t"\
1179 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1180 "movd %%mm4, 112+" #dst " \n\t"\
1181 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1182 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 (= B2 b2) */\
1183 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 (= B3 b3) */\
1184 "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1185 "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1186 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1187 "psrad $" #shift ", %%mm3 \n\t"\
1188 "psrad $" #shift ", %%mm5 \n\t"\
1189 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1190 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1191 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1192 "psrad $" #shift ", %%mm6 \n\t"\
1193 "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1194 "movd %%mm3, 32+" #dst " \n\t"\
1195 "psrad $" #shift ", %%mm4 \n\t"\
1196 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1197 "movd %%mm6, 48+" #dst " \n\t"\
1198 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1199 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1200 "movd %%mm4, 64+" #dst " \n\t"\
1201 "movd %%mm5, 80+" #dst " \n\t"
1204 //IDCT( src0, src4, src1, src5, dst, shift)
1205 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1206 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1207 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1208 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1209 "jmp 9f \n\t"
1212 "#" ASMALIGN(4)
1213 "7: \n\t"
/*
 * IDCT variant used on the "7:" path of the column pass.
 * Only src0 and the adjacent quad at 8+src0 are loaded; src4, src1 and src5
 * are accepted but unused. The only products are C4R4+C4R0 and -C4R4+C4R0,
 * so the first is stored at dst, 48+dst, 64+dst and 112+dst and the second
 * at 16+dst, 32+dst, 80+dst and 96+dst.
 * NOTE(review): presumably selected when everything except the src0 data is
 * zero; the Z_COND_IDCT tests are defined outside this excerpt -- confirm.
 * Each expansion handles two source quads and writes four 16-bit results per
 * row with movq, so only two of the four invocations below are active. There
 * is no trailing jmp: execution falls through towards label 9.
 */
1214 #undef IDCT
1215 #define IDCT(src0, src4, src1, src5, dst, shift) \
1216 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1217 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1218 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1219 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1220 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1221 "psrad $" #shift ", %%mm4 \n\t"\
1222 "psrad $" #shift ", %%mm0 \n\t"\
1223 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 (next quad) */\
1224 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1225 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1226 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1227 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1228 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 (mm7 is not read again in this macro) */\
1229 "psrad $" #shift ", %%mm1 \n\t"\
1230 "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
1231 "movq %%mm4, " #dst " \n\t"\
1232 "psrad $" #shift ", %%mm2 \n\t"\
1233 "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
1234 "movq %%mm0, 16+" #dst " \n\t"\
1235 "movq %%mm0, 96+" #dst " \n\t"\
1236 "movq %%mm4, 112+" #dst " \n\t"\
1237 "movq %%mm0, 32+" #dst " \n\t"\
1238 "movq %%mm4, 48+" #dst " \n\t"\
1239 "movq %%mm4, 64+" #dst " \n\t"\
1240 "movq %%mm0, 80+" #dst " \n\t"
1242 //IDCT( src0, src4, src1, src5, dst, shift)
1243 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1244 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1245 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1246 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1249 #endif
1252 Input
1253 00 40 04 44 20 60 24 64
1254 10 30 14 34 50 70 54 74
1255 01 41 03 43 21 61 23 63
1256 11 31 13 33 51 71 53 73
1257 02 42 06 46 22 62 26 66
1258 12 32 16 36 52 72 56 76
1259 05 45 07 47 25 65 27 67
1260 15 35 17 37 55 75 57 77
1262 Temp
1263 00 04 10 14 20 24 30 34
1264 40 44 50 54 60 64 70 74
1265 01 03 11 13 21 23 31 33
1266 41 43 51 53 61 63 71 73
1267 02 06 12 16 22 26 32 36
1268 42 46 52 56 62 66 72 76
1269 05 07 15 17 25 27 35 37
1270 45 47 55 57 65 67 75 77
1273 "9: \n\t"
1274 :: "r" (block), "r" (temp), "r" (coeffs)
1275 : "%eax"
/*
 * Public entry point that forwards block to idct() and does nothing else.
 * The asm in idct() stores its column-pass results through the block
 * operand, so the transform result replaces the contents of block.
 */
1279 void ff_simple_idct_mmx(int16_t *block)
1281 idct(block);
1284 //FIXME merge add/put into the idct
/*
 * Runs idct() on block, then passes the transformed block to
 * put_pixels_clamped_mmx() together with dest and line_size.
 * NOTE(review): put_pixels_clamped_mmx() is defined elsewhere; by its name it
 * presumably writes the clamped samples to dest -- confirm against it.
 */
1286 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1288 idct(block);
1289 put_pixels_clamped_mmx(block, dest, line_size);
/*
 * Runs idct() on block, then passes the transformed block to
 * add_pixels_clamped_mmx() together with dest and line_size.
 * NOTE(review): add_pixels_clamped_mmx() is defined elsewhere; by its name it
 * presumably adds the samples to dest with clamping -- confirm against it.
 */
1291 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1293 idct(block);
1294 add_pixels_clamped_mmx(block, dest, line_size);