3 * Copyright (C) 2002-2003 Falk Hueffner <falk@debian.org>
4 * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
5 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
7 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
8 * See http://libmpeg2.sourceforge.net/ for updates.
10 * mpeg2dec is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * mpeg2dec is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 * Modified for use with MPlayer, see libmpeg-0.4.1.diff for the exact changes.
25 * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/
37 #include "attributes.h"
38 #include "mpeg2_internal.h"
39 #include "alpha_asm.h"
/*
 * Fixed-point cosine multipliers used by the butterflies below.
 * Each Wk is 2048 * sqrt(2) * cos(k * pi / 16) rounded to the nearest integer.
 */
41 #define W1 2841 /* 2048 * sqrt (2) * cos (1 * pi / 16) */
42 #define W2 2676 /* 2048 * sqrt (2) * cos (2 * pi / 16) */
43 #define W3 2408 /* 2048 * sqrt (2) * cos (3 * pi / 16) */
44 #define W5 1609 /* 2048 * sqrt (2) * cos (5 * pi / 16) */
45 #define W6 1108 /* 2048 * sqrt (2) * cos (6 * pi / 16) */
46 #define W7 565 /* 2048 * sqrt (2) * cos (7 * pi / 16) */
/*
 * Byte lookup table declared with 3840 * 2 + 256 entries and defined
 * elsewhere.  CLIP(i) reads entry (i + 3840), so the index may be negative:
 * the in-bounds range for i is -3840 .. 4095.
 * NOTE(review): the table presumably saturates i to 0..255; its contents
 * are not visible here, so confirm against its initializer.
 */
48 extern uint8_t mpeg2_clip
[3840 * 2 + 256];
49 #define CLIP(i) ((mpeg2_clip + 3840)[i])
/*
 * BUTTERFLY (t0, t1, W0, W1, d0, d1) assigns
 *     t0 = W0 * d0 + W1 * d1
 *     t1 = W0 * d1 - W1 * d0
 *
 * Two definitions are shown.  The first is the direct form with four
 * multiplications.  The second expands to the same two values, but shares
 * the product W0 * (d0 + d1) so only three multiplications by data remain;
 * (W1 - W0) and (W1 + W0) involve only the weight arguments.
 *
 * W0 and W1 are macro parameters here: inside the macro body "W1" is the
 * argument passed by the caller, not the W1 constant defined above.
 *
 * NOTE(review): the lines that choose between the two definitions, and any
 * wrapper around the multi-statement bodies, are not visible in this
 * section.
 */
52 #define BUTTERFLY(t0,t1,W0,W1,d0,d1) \
54 t0 = W0 * d0 + W1 * d1; \
55 t1 = W0 * d1 - W1 * d0; \
58 #define BUTTERFLY(t0,t1,W0,W1,d0,d1) \
60 int_fast32_t tmp = W0 * (d0 + d1); \
61 t0 = tmp + (W1 - W0) * d1; \
62 t1 = tmp - (W1 + W0) * d0; \
/*
 * Row pass of the inverse DCT.  Operates in place on the eight consecutive
 * int16_t values block[0] .. block[7].
 *
 * NOTE(review): several lines of this function are not visible in this
 * section, including the assignments to l, r, a0..a3, b0 and b3 and the
 * closing braces.  The comments below describe only what is shown.
 */
66 static inline void idct_row (int16_t * const block
)
69 int_fast32_t d0
, d1
, d2
, d3
;
70 int_fast32_t a0
, a1
, a2
, a3
, b0
, b1
, b2
, b3
;
71 int_fast32_t t0
, t1
, t2
, t3
;
/*
 * Shortcut: taken when every bit of l above its low 16 bits is zero and r
 * is zero.
 * NOTE(review): l and r presumably hold block[0..3] and block[4..7] as two
 * 64-bit words; their loads are not visible here -- confirm.
 */
77 if (likely (!((l
& ~0xffffUL
) | r
))) {
78 uint64_t tmp
= (uint16_t) (l
>> 1);
/* Four 32-bit stores cover all eight 16-bit slots of the row. */
81 ((int32_t *)block
)[0] = tmp
;
82 ((int32_t *)block
)[1] = tmp
;
83 ((int32_t *)block
)[2] = tmp
;
84 ((int32_t *)block
)[3] = tmp
;
/*
 * General path: take the fields of l at byte offsets 0, 2, 4 and 6.
 * d0 and d2 are scaled by 2^11; d0 also receives 2048, which is half of the
 * 2^12 divisor applied to the outputs below.
 * NOTE(review): sextw/extwl come from alpha_asm.h (not shown); presumably
 * they sign-extend and extract a 16-bit word -- confirm.
 */
88 d0
= (sextw (l
) << 11) + 2048;
89 d1
= sextw (extwl (l
, 2));
90 d2
= sextw (extwl (l
, 4)) << 11;
91 d3
= sextw (extwl (l
, 6));
/* t2 = W6 * d3 + W2 * d1,  t3 = W6 * d1 - W2 * d3 */
94 BUTTERFLY (t2
, t3
, W6
, W2
, d3
, d1
);
/* Reuse d1..d3 for the fields of r at byte offsets 2, 4 and 6. */
101 d1
= sextw (extwl (r
, 2));
102 d2
= sextw (extwl (r
, 4));
103 d3
= sextw (extwl (r
, 6));
104 BUTTERFLY (t0
, t1
, W7
, W1
, d3
, d0
);
105 BUTTERFLY (t2
, t3
, W3
, W5
, d1
, d2
);
/*
 * 181 / 256 is approximately 1 / sqrt(2) (256 / sqrt(2) = 181.02).
 * NOTE(review): presumably ">> 8" followed by "* 181" scales by 1 / sqrt(2)
 * without overflowing the intermediate -- confirm.
 */
110 b1
= ((t0
+ t1
) >> 8) * 181;
111 b2
= ((t0
- t1
) >> 8) * 181;
/*
 * Write the row back: sums a_k + b_k go to positions 0..3, the matching
 * differences a_k - b_k go to positions 7..4.  Every output is shifted
 * right by 12.
 */
113 block
[0] = (a0
+ b0
) >> 12;
114 block
[1] = (a1
+ b1
) >> 12;
115 block
[2] = (a2
+ b2
) >> 12;
116 block
[3] = (a3
+ b3
) >> 12;
117 block
[4] = (a3
- b3
) >> 12;
118 block
[5] = (a2
- b2
) >> 12;
119 block
[6] = (a1
- b1
) >> 12;
120 block
[7] = (a0
- b0
) >> 12;
/*
 * Column pass of the inverse DCT.  Operates in place on the eight values
 * block[8*0], block[8*1], ... block[8*7], i.e. one column of a block stored
 * with 8 elements per row.
 *
 * NOTE(review): several lines of this function are not visible in this
 * section, including the assignments to d1, d3, a0..a3, b0 and b3 and the
 * braces.  The comments below describe only what is shown.
 */
123 static inline void idct_col (int16_t * const block
)
125 int_fast32_t d0
, d1
, d2
, d3
;
126 int_fast32_t a0
, a1
, a2
, a3
, b0
, b1
, b2
, b3
;
127 int_fast32_t t0
, t1
, t2
, t3
;
/*
 * d0 and d2 are scaled by 2^11; d0 also receives 65536, which is half of
 * the 2^17 divisor applied to the outputs below.
 */
129 d0
= (block
[8*0] << 11) + 65536;
131 d2
= block
[8*2] << 11;
/* t2 = W6 * d3 + W2 * d1,  t3 = W6 * d1 - W2 * d3 */
135 BUTTERFLY (t2
, t3
, W6
, W2
, d3
, d1
);
145 BUTTERFLY (t0
, t1
, W7
, W1
, d3
, d0
);
146 BUTTERFLY (t2
, t3
, W3
, W5
, d1
, d2
);
/*
 * 181 / 256 is approximately 1 / sqrt(2) (256 / sqrt(2) = 181.02).
 * NOTE(review): presumably ">> 8" followed by "* 181" scales by 1 / sqrt(2)
 * -- confirm.
 */
151 b1
= ((t0
+ t1
) >> 8) * 181;
152 b2
= ((t0
- t1
) >> 8) * 181;
/*
 * Write the column back: sums a_k + b_k go to rows 0..3, the matching
 * differences a_k - b_k go to rows 7..4.  Every output is shifted right
 * by 17.
 */
154 block
[8*0] = (a0
+ b0
) >> 17;
155 block
[8*1] = (a1
+ b1
) >> 17;
156 block
[8*2] = (a2
+ b2
) >> 17;
157 block
[8*3] = (a3
+ b3
) >> 17;
158 block
[8*4] = (a3
- b3
) >> 17;
159 block
[8*5] = (a2
- b2
) >> 17;
160 block
[8*6] = (a1
- b1
) >> 17;
161 block
[8*7] = (a0
- b0
) >> 17;
164 #ifdef CAN_COMPILE_ALPHA_MVI
/*
 * Inverse-transform block in place (idct_row on each of the 8 rows, then
 * idct_col on each of the 8 columns) and store the result to dest as
 * bytes, using the vector helpers from alpha_asm.h.
 *
 * NOTE(review): the declarations of i and clampmask, the loop that walks
 * the output rows, and any use of stride are not visible in this section.
 * The comments below describe only what is shown.
 */
165 void mpeg2_idct_copy_mvi (int16_t * block
, uint8_t * dest
, const int stride
)
/* Row pass: row i starts at block + 8 * i. */
170 for (i
= 0; i
< 8; i
++)
171 idct_row (block
+ 8 * i
);
/* Column pass: column i starts at block + i. */
173 for (i
= 0; i
< 8; i
++)
174 idct_col (block
+ i
);
176 clampmask
= zap (-1, 0xaa); /* 0x00ff00ff00ff00ff */
178 uint64_t shorts0
, shorts1
;
/*
 * First four values of the row: load, bound below by 0 and above by
 * clampmask (0x00ff in every 16-bit lane), pack and store at dest.
 * NOTE(review): maxsw4/minsw4/pkwb/stl are defined in alpha_asm.h (not
 * shown); presumably per-lane signed max/min, word-to-byte pack and 32-bit
 * store -- confirm.
 */
180 shorts0
= ldq (block
);
181 shorts0
= maxsw4 (shorts0
, 0);
182 shorts0
= minsw4 (shorts0
, clampmask
);
183 stl (pkwb (shorts0
), dest
);
/* Same sequence for the values at block + 4, stored at dest + 4. */
185 shorts1
= ldq (block
+ 4);
186 shorts1
= maxsw4 (shorts1
, 0);
187 shorts1
= minsw4 (shorts1
, clampmask
);
188 stl (pkwb (shorts1
), dest
+ 4);
/*
 * Add the inverse transform of block to the pixels at dest, using the
 * vector helpers from alpha_asm.h.  Two paths are visible: a full IDCT
 * path, and a path that uses only a DC value derived from block[0].
 *
 * NOTE(review): the declarations of i, DC, DCs, clampmask and signmask, the
 * per-row loop, the lines that perform the actual add in the full path,
 * and the branch structure around the DC path are not visible in this
 * section.  The comments below describe only what is shown.
 */
198 void mpeg2_idct_add_mvi (const int last
, int16_t * block
,
199 uint8_t * dest
, const int stride
)
/*
 * Full path: taken unless last == 129 and bits 4..6 of block[0] differ
 * from the pattern 100.
 */
205 if (last
!= 129 || (block
[0] & (7 << 4)) == (4 << 4)) {
/* Row pass over rows 0..7, then column pass over columns 0..7. */
206 for (i
= 0; i
< 8; i
++)
207 idct_row (block
+ 8 * i
);
208 for (i
= 0; i
< 8; i
++)
209 idct_col (block
+ i
);
210 clampmask
= zap (-1, 0xaa); /* 0x00ff00ff00ff00ff */
211 signmask
= zap (-1, 0x33);
212 signmask
^= signmask
>> 1; /* 0x8000800080008000 */
215 uint64_t shorts0
, pix0
, signs0
;
216 uint64_t shorts1
, pix1
, signs1
;
/* Load the eight 16-bit residuals of one row as two 64-bit words. */
218 shorts0
= ldq (block
);
219 shorts1
= ldq (block
+ 4);
/* Load four destination bytes and widen them (unpkbw, from alpha_asm.h). */
221 pix0
= unpkbw (ldl (dest
));
222 /* signed subword add (MMX paddw). */
/*
 * Split each 16-bit lane into its top bit (signs0) and the remaining 15
 * bits (shorts0).
 * NOTE(review): the lines that combine these with pix0 are not visible.
 */
223 signs0
= shorts0
& signmask
;
224 shorts0
&= ~signmask
;
/* Bound each lane below by 0 and above by clampmask (0x00ff per lane). */
228 shorts0
= maxsw4 (shorts0
, 0);
229 shorts0
= minsw4 (shorts0
, clampmask
);
/* Same steps for the second half of the row (dest + 4). */
232 pix1
= unpkbw (ldl (dest
+ 4));
233 signs1
= shorts1
& signmask
;
234 shorts1
&= ~signmask
;
237 shorts1
= maxsw4 (shorts1
, 0);
238 shorts1
= minsw4 (shorts1
, clampmask
);
/* Pack both halves and store them at dest and dest + 4. */
240 stl (pkwb (shorts0
), dest
);
241 stl (pkwb (shorts1
), dest
+ 4);
/* DC path: eight 64-bit words, one per destination row. */
250 uint64_t p0
, p1
, p2
, p3
, p4
, p5
, p6
, p7
;
/* DC is block[0] divided by 128 with rounding (add 64, shift right 7). */
253 DC
= (block
[0] + 64) >> 7;
/* Clear the first and last coefficients of the block. */
254 block
[0] = block
[63] = 0;
/* Load the eight destination rows, stride bytes apart. */
256 p0
= ldq (dest
+ 0 * stride
);
257 p1
= ldq (dest
+ 1 * stride
);
258 p2
= ldq (dest
+ 2 * stride
);
259 p3
= ldq (dest
+ 3 * stride
);
260 p4
= ldq (dest
+ 4 * stride
);
261 p5
= ldq (dest
+ 5 * stride
);
262 p6
= ldq (dest
+ 6 * stride
);
263 p7
= ldq (dest
+ 7 * stride
);
/*
 * Add sequence: DCs is built from DC capped at 255, and each row gets
 * min (DCs, ~p) added.
 * NOTE(review): assuming BYTE_VEC replicates a byte into all eight lanes
 * and minub8 is a per-byte unsigned minimum (both from alpha_asm.h, not
 * shown), ~p is 255 - p per byte, so the sum cannot carry between bytes
 * and this is a saturating add.  The condition selecting this sequence
 * over the subtract sequence below is not visible -- confirm.
 */
266 DCs
= BYTE_VEC (likely (DC
<= 255) ? DC
: 255);
267 p0
+= minub8 (DCs
, ~p0
);
268 p1
+= minub8 (DCs
, ~p1
);
269 p2
+= minub8 (DCs
, ~p2
);
270 p3
+= minub8 (DCs
, ~p3
);
271 p4
+= minub8 (DCs
, ~p4
);
272 p5
+= minub8 (DCs
, ~p5
);
273 p6
+= minub8 (DCs
, ~p6
);
274 p7
+= minub8 (DCs
, ~p7
);
/*
 * Subtract sequence: DCs is built from -DC capped at 255, and each row has
 * min (DCs, p) subtracted.
 * NOTE(review): under the same assumptions as above, no byte can borrow,
 * so this is a saturating subtract -- confirm.
 */
276 DCs
= BYTE_VEC (likely (-DC
<= 255) ? -DC
: 255);
277 p0
-= minub8 (DCs
, p0
);
278 p1
-= minub8 (DCs
, p1
);
279 p2
-= minub8 (DCs
, p2
);
280 p3
-= minub8 (DCs
, p3
);
281 p4
-= minub8 (DCs
, p4
);
282 p5
-= minub8 (DCs
, p5
);
283 p6
-= minub8 (DCs
, p6
);
284 p7
-= minub8 (DCs
, p7
);
/* Store the eight updated rows back to dest. */
287 stq (p0
, dest
+ 0 * stride
);
288 stq (p1
, dest
+ 1 * stride
);
289 stq (p2
, dest
+ 2 * stride
);
290 stq (p3
, dest
+ 3 * stride
);
291 stq (p4
, dest
+ 4 * stride
);
292 stq (p5
, dest
+ 5 * stride
);
293 stq (p6
, dest
+ 6 * stride
);
294 stq (p7
, dest
+ 7 * stride
);
/*
 * Inverse-transform block in place (idct_row on each of the 8 rows, then
 * idct_col on each of the 8 columns) and store the result to dest through
 * the CLIP lookup, one byte per value.
 *
 * NOTE(review): the declaration of i, the loop that walks the output rows,
 * and any use of stride are not visible in this section; only the stores
 * for eight consecutive values are shown.
 */
299 void mpeg2_idct_copy_alpha (int16_t * block
, uint8_t * dest
, const int stride
)
/* Row pass: row i starts at block + 8 * i. */
303 for (i
= 0; i
< 8; i
++)
304 idct_row (block
+ 8 * i
);
/* Column pass: column i starts at block + i. */
305 for (i
= 0; i
< 8; i
++)
306 idct_col (block
+ i
);
/* Convert eight consecutive values to bytes via the CLIP table. */
308 dest
[0] = CLIP (block
[0]);
309 dest
[1] = CLIP (block
[1]);
310 dest
[2] = CLIP (block
[2]);
311 dest
[3] = CLIP (block
[3]);
312 dest
[4] = CLIP (block
[4]);
313 dest
[5] = CLIP (block
[5]);
314 dest
[6] = CLIP (block
[6]);
315 dest
[7] = CLIP (block
[7]);
/*
 * Add the inverse transform of block to the pixels at dest, passing each
 * sum through the CLIP lookup.  Two paths are visible: a full IDCT path,
 * and a path that adds a single DC value derived from block[0].
 *
 * NOTE(review): the declarations of i and DC, the per-row loops, any use
 * of stride, and the branch structure around the DC path are not visible
 * in this section.  The comments below describe only what is shown.
 */
325 void mpeg2_idct_add_alpha (const int last
, int16_t * block
,
326 uint8_t * dest
, const int stride
)
/*
 * Full path: taken unless last == 129 and bits 4..6 of block[0] differ
 * from the pattern 100.
 */
330 if (last
!= 129 || (block
[0] & (7 << 4)) == (4 << 4)) {
/* Row pass over rows 0..7, then column pass over columns 0..7. */
331 for (i
= 0; i
< 8; i
++)
332 idct_row (block
+ 8 * i
);
333 for (i
= 0; i
< 8; i
++)
334 idct_col (block
+ i
);
/* Add eight consecutive residuals to the destination bytes via CLIP. */
336 dest
[0] = CLIP (block
[0] + dest
[0]);
337 dest
[1] = CLIP (block
[1] + dest
[1]);
338 dest
[2] = CLIP (block
[2] + dest
[2]);
339 dest
[3] = CLIP (block
[3] + dest
[3]);
340 dest
[4] = CLIP (block
[4] + dest
[4]);
341 dest
[5] = CLIP (block
[5] + dest
[5]);
342 dest
[6] = CLIP (block
[6] + dest
[6]);
343 dest
[7] = CLIP (block
[7] + dest
[7]);
/* DC path: DC is block[0] divided by 128 with rounding (add 64, shift 7). */
354 DC
= (block
[0] + 64) >> 7;
/* Clear the first and last coefficients of the block. */
355 block
[0] = block
[63] = 0;
/* Add the same DC value to eight consecutive destination bytes via CLIP. */
358 dest
[0] = CLIP (DC
+ dest
[0]);
359 dest
[1] = CLIP (DC
+ dest
[1]);
360 dest
[2] = CLIP (DC
+ dest
[2]);
361 dest
[3] = CLIP (DC
+ dest
[3]);
362 dest
[4] = CLIP (DC
+ dest
[4]);
363 dest
[5] = CLIP (DC
+ dest
[5]);
364 dest
[6] = CLIP (DC
+ dest
[6]);
365 dest
[7] = CLIP (DC
+ dest
[7]);
/*
 * Rewrite every entry of the two 64-entry scan tables in place.
 *
 * Each entry j is replaced by ((j & 0x36) >> 1) | ((j & 0x09) << 2).
 * Treating j as two 3-bit fields (bits 0..2 and bits 3..5), this rotates
 * each field right by one bit: bits 1,2 move to 0,1 and bit 0 moves to 2,
 * and likewise bits 4,5 move to 3,4 and bit 3 moves to 5.
 *
 * The tables are modified in place, so calling this more than once would
 * apply the rotation repeatedly.
 *
 * NOTE(review): the declarations of i and j and the braces are not visible
 * in this section.  Presumably the permutation matches the coefficient
 * order the row/column passes in this file expect -- confirm.
 */
371 void mpeg2_idct_alpha_init (void)
373 extern uint8_t mpeg2_scan_norm
[64];
374 extern uint8_t mpeg2_scan_alt
[64];
377 for (i
= 0; i
< 64; i
++) {
378 j
= mpeg2_scan_norm
[i
];
379 mpeg2_scan_norm
[i
] = ((j
& 0x36) >> 1) | ((j
& 0x09) << 2);
380 j
= mpeg2_scan_alt
[i
];
381 mpeg2_scan_alt
[i
] = ((j
& 0x36) >> 1) | ((j
& 0x09) << 2);
385 #endif /* ARCH_ALPHA */