3 * Copyright (C) 2002-2003 Falk Hueffner <falk@debian.org>
4 * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
5 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
7 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
8 * See http://libmpeg2.sourceforge.net/ for updates.
10 * mpeg2dec is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * mpeg2dec is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
33 #include "attributes.h"
34 #include "mpeg2_internal.h"
35 #include "alpha_asm.h"
/*
 * Fixed-point cosine weights passed to the BUTTERFLY invocations in
 * idct_row() and idct_col(), followed by the declaration of the external
 * clipping table.  CLIP(i) indexes mpeg2_clip with an offset of 3840, so
 * i may range from -3840 to 3840 + 255 without leaving the declared
 * 3840 * 2 + 256 entries.
 */
37 #define W1 2841 /* 2048 * sqrt (2) * cos (1 * pi / 16) */
38 #define W2 2676 /* 2048 * sqrt (2) * cos (2 * pi / 16) */
39 #define W3 2408 /* 2048 * sqrt (2) * cos (3 * pi / 16) */
40 #define W5 1609 /* 2048 * sqrt (2) * cos (5 * pi / 16) */
41 #define W6 1108 /* 2048 * sqrt (2) * cos (6 * pi / 16) */
42 #define W7 565 /* 2048 * sqrt (2) * cos (7 * pi / 16) */
44 extern uint8_t mpeg2_clip
[3840 * 2 + 256];
45 #define CLIP(i) ((mpeg2_clip + 3840)[i])
/*
 * BUTTERFLY(t0,t1,W0,W1,d0,d1) produces
 *     t0 = W0 * d0 + W1 * d1
 *     t1 = W0 * d1 - W1 * d0
 * Two alternative definitions are visible.  The first evaluates the two
 * expressions directly (four multiplies).  The second yields the same
 * values with three multiplies by sharing tmp = W0 * (d0 + d1):
 *     tmp + (W1 - W0) * d1 == W0 * d0 + W1 * d1
 *     tmp - (W1 + W0) * d0 == W0 * d1 - W1 * d0
 * Inside the macro body W0 and W1 name the macro parameters, not the W1
 * constant defined earlier in the file.  Arguments are not parenthesised
 * in the expansion, so callers should pass plain identifiers or constants.
 * NOTE(review): the preprocessor conditional that selects one definition
 * and the statement wrapper around each body are not visible in this
 * excerpt -- confirm against the full file.
 */
48 #define BUTTERFLY(t0,t1,W0,W1,d0,d1) \
50 t0 = W0 * d0 + W1 * d1; \
51 t1 = W0 * d1 - W1 * d0; \
54 #define BUTTERFLY(t0,t1,W0,W1,d0,d1) \
56 int_fast32_t tmp = W0 * (d0 + d1); \
57 t0 = tmp + (W1 - W0) * d1; \
58 t1 = tmp - (W1 + W0) * d0; \
/*
 * Row pass of the IDCT (per the function name): transforms the eight
 * 16-bit values block[0..7] in place.
 *
 * NOTE(review): this excerpt does not show the function braces, the
 * declaration or initialisation of l and r, or the assignments that
 * produce a0..a3, b0 and b3.  The comments below describe only what the
 * visible lines establish.
 */
62 static inline void idct_row (int16_t * const block
)
65 int_fast32_t d0
, d1
, d2
, d3
;
66 int_fast32_t a0
, a1
, a2
, a3
, b0
, b1
, b2
, b3
;
67 int_fast32_t t0
, t1
, t2
, t3
;
/*
 * Shortcut: taken when every bit of l above the low 16, and all of r, is
 * zero.  The low 16 bits of (l >> 1) are then stored into each of the four
 * 32-bit words overlaying block[0..7].
 * NOTE(review): l and r are presumably the two 64-bit halves of the row
 * loaded from block; any replication of tmp across 16-bit lanes and the
 * early return that would normally follow are not visible here -- confirm.
 */
73 if (likely (!((l
& ~0xffffUL
) | r
))) {
74 uint64_t tmp
= (uint16_t) (l
>> 1);
77 ((int32_t *)block
)[0] = tmp
;
78 ((int32_t *)block
)[1] = tmp
;
79 ((int32_t *)block
)[2] = tmp
;
80 ((int32_t *)block
)[3] = tmp
;
/*
 * General path, values taken from l.  d0 and d2 are scaled by 1 << 11;
 * d0 additionally carries a bias of 2048, which is half of 1 << 12, the
 * shift applied to the outputs at the end.  d1 and d3 stay unscaled and
 * feed the W6/W2 butterfly.
 * sextw and extwl are helpers not defined in this excerpt (presumably from
 * alpha_asm.h); extwl is called here with second arguments 2, 4 and 6.
 */
84 d0
= (sextw (l
) << 11) + 2048;
85 d1
= sextw (extwl (l
, 2));
86 d2
= sextw (extwl (l
, 4)) << 11;
87 d3
= sextw (extwl (l
, 6));
90 BUTTERFLY (t2
, t3
, W6
, W2
, d3
, d1
);
/*
 * Values taken from r feed the W7/W1 and W3/W5 butterflies.
 * NOTE(review): only d1..d3 are visibly reloaded from r; whether d0 is
 * also reloaded before the W7/W1 butterfly cannot be seen here.
 */
97 d1
= sextw (extwl (r
, 2));
98 d2
= sextw (extwl (r
, 4));
99 d3
= sextw (extwl (r
, 6));
100 BUTTERFLY (t0
, t1
, W7
, W1
, d3
, d0
);
101 BUTTERFLY (t2
, t3
, W3
, W5
, d1
, d2
);
/*
 * 181 is approximately 256 / sqrt (2), so together with the >> 8 these
 * two lines scale the sum and difference of t0 and t1 by roughly
 * 1 / sqrt (2).
 */
106 b1
= ((t0
+ t1
) >> 8) * 181;
107 b2
= ((t0
- t1
) >> 8) * 181;
/*
 * Outputs: a + b for positions 0..3 and the mirrored a - b for positions
 * 4..7, each shifted right by 12 bits before being stored as int16_t.
 */
109 block
[0] = (a0
+ b0
) >> 12;
110 block
[1] = (a1
+ b1
) >> 12;
111 block
[2] = (a2
+ b2
) >> 12;
112 block
[3] = (a3
+ b3
) >> 12;
113 block
[4] = (a3
- b3
) >> 12;
114 block
[5] = (a2
- b2
) >> 12;
115 block
[6] = (a1
- b1
) >> 12;
116 block
[7] = (a0
- b0
) >> 12;
/*
 * Column pass of the IDCT (per the function name): transforms the eight
 * values block[8*0], block[8*1], ..., block[8*7] in place, i.e. one
 * column of a buffer whose rows are 8 entries apart.
 *
 * NOTE(review): this excerpt does not show the function braces, the loads
 * of d1 and d3, the second set of d0..d3 loads, or the assignments that
 * produce a0..a3, b0 and b3.  The comments below describe only what the
 * visible lines establish.
 */
119 static inline void idct_col (int16_t * const block
)
121 int_fast32_t d0
, d1
, d2
, d3
;
122 int_fast32_t a0
, a1
, a2
, a3
, b0
, b1
, b2
, b3
;
123 int_fast32_t t0
, t1
, t2
, t3
;
/*
 * d0 and d2 are scaled by 1 << 11.  d0 additionally carries a bias of
 * 65536, which is half of 1 << 17, the shift applied to the outputs at
 * the end.
 */
125 d0
= (block
[8*0] << 11) + 65536;
127 d2
= block
[8*2] << 11;
131 BUTTERFLY (t2
, t3
, W6
, W2
, d3
, d1
);
/* Same butterfly sequence as in idct_row(): W7/W1, then W3/W5. */
141 BUTTERFLY (t0
, t1
, W7
, W1
, d3
, d0
);
142 BUTTERFLY (t2
, t3
, W3
, W5
, d1
, d2
);
/*
 * 181 is approximately 256 / sqrt (2), so together with the >> 8 these
 * two lines scale the sum and difference of t0 and t1 by roughly
 * 1 / sqrt (2).
 */
147 b1
= ((t0
+ t1
) >> 8) * 181;
148 b2
= ((t0
- t1
) >> 8) * 181;
/*
 * Outputs: a + b for rows 0..3 and the mirrored a - b for rows 4..7,
 * each shifted right by 17 bits before being stored as int16_t.
 */
150 block
[8*0] = (a0
+ b0
) >> 17;
151 block
[8*1] = (a1
+ b1
) >> 17;
152 block
[8*2] = (a2
+ b2
) >> 17;
153 block
[8*3] = (a3
+ b3
) >> 17;
154 block
[8*4] = (a3
- b3
) >> 17;
155 block
[8*5] = (a2
- b2
) >> 17;
156 block
[8*6] = (a1
- b1
) >> 17;
157 block
[8*7] = (a0
- b0
) >> 17;
/*
 * IDCT-and-copy entry point, MVI variant.  Runs idct_row() on each of the
 * eight rows (block + 8 * i) and idct_col() on each of the eight columns
 * (block + i), then writes the results to dest as bytes.
 *
 * block  - 64 coefficients, transformed in place.
 * dest   - output pixels; the visible stores write dest[0..3] and
 *          dest[4..7].
 * stride - not used in the visible lines; presumably the distance between
 *          output rows -- TODO confirm.
 *
 * NOTE(review): the function braces, the declarations of i and clampmask,
 * and whatever loop repeats the load/clamp/store sequence for each row are
 * not visible in this excerpt.
 */
160 void mpeg2_idct_copy_mvi (int16_t * block
, uint8_t * dest
, const int stride
)
165 for (i
= 0; i
< 8; i
++)
166 idct_row (block
+ 8 * i
);
168 for (i
= 0; i
< 8; i
++)
169 idct_col (block
+ i
);
/* Upper bound for the clamp below: 0x00ff (255) in every 16-bit lane. */
171 clampmask
= zap (-1, 0xaa); /* 0x00ff00ff00ff00ff */
173 uint64_t shorts0
, shorts1
;
/*
 * First four values of the row: load, bound below by 0 and above by
 * clampmask, pack and store to dest.
 * NOTE(review): ldq, maxsw4, minsw4, pkwb and stl are not defined in this
 * excerpt; from their use they appear to be a 64-bit load, per-16-bit-lane
 * signed max/min, a word-to-byte pack and a 32-bit store -- confirm in
 * alpha_asm.h.
 */
175 shorts0
= ldq (block
);
176 shorts0
= maxsw4 (shorts0
, 0);
177 shorts0
= minsw4 (shorts0
, clampmask
);
178 stl (pkwb (shorts0
), dest
);
/* Same sequence for the next four values (block + 4 -> dest + 4). */
180 shorts1
= ldq (block
+ 4);
181 shorts1
= maxsw4 (shorts1
, 0);
182 shorts1
= minsw4 (shorts1
, clampmask
);
183 stl (pkwb (shorts1
), dest
+ 4);
/*
 * IDCT-and-add entry point, MVI variant.  Two code paths are visible:
 *   - a full path that runs idct_row()/idct_col() over the whole block and
 *     combines the result with the pixels already in dest;
 *   - a DC-only path that derives a single value DC from block[0] and
 *     applies it to eight rows of eight pixels in dest.
 *
 * last   - compared against 129 to choose the path.
 * block  - 64 coefficients; transformed in place on the full path, and
 *          block[0] and block[63] are cleared on the DC-only path.
 * dest   - pixels that are read, modified and written back.
 * stride - byte distance between consecutive rows of dest (see the
 *          dest + k * stride accesses in the DC-only path).
 *
 * NOTE(review): the function braces, local declarations (i, clampmask,
 * signmask, DC, DCs), the else between the two paths, the per-row loop of
 * the full path, the actual add of pix0/pix1 into shorts0/shorts1, and the
 * test that chooses between the += and -= sequences are not visible in
 * this excerpt.
 */
193 void mpeg2_idct_add_mvi (const int last
, int16_t * block
,
194 uint8_t * dest
, const int stride
)
/*
 * Full path is taken when last != 129, or when bits 4..6 of block[0]
 * equal binary 100.
 */
200 if (last
!= 129 || (block
[0] & (7 << 4)) == (4 << 4)) {
201 for (i
= 0; i
< 8; i
++)
202 idct_row (block
+ 8 * i
);
203 for (i
= 0; i
< 8; i
++)
204 idct_col (block
+ i
);
/*
 * clampmask: 255 in every 16-bit lane.  signmask: the top bit of every
 * 16-bit lane, built from 0xffff0000ffff0000 xor-ed with itself shifted
 * right by one.
 */
205 clampmask
= zap (-1, 0xaa); /* 0x00ff00ff00ff00ff */
206 signmask
= zap (-1, 0x33);
207 signmask
^= signmask
>> 1; /* 0x8000800080008000 */
210 uint64_t shorts0
, pix0
, signs0
;
211 uint64_t shorts1
, pix1
, signs1
;
/* Load the eight IDCT results of one row as two groups of four. */
213 shorts0
= ldq (block
);
214 shorts1
= ldq (block
+ 4);
/*
 * First four pixels: widen the destination bytes, split the sign bits off
 * shorts0, then bound the result to 0..255 per lane.
 * NOTE(review): the lines that add pix0 into shorts0 and fold signs0 back
 * in are not visible here.
 */
216 pix0
= unpkbw (ldl (dest
));
217 /* signed subword add (MMX paddw). */
218 signs0
= shorts0
& signmask
;
219 shorts0
&= ~signmask
;
223 shorts0
= maxsw4 (shorts0
, 0);
224 shorts0
= minsw4 (shorts0
, clampmask
);
/* Same treatment for the next four pixels (dest + 4). */
227 pix1
= unpkbw (ldl (dest
+ 4));
228 signs1
= shorts1
& signmask
;
229 shorts1
&= ~signmask
;
232 shorts1
= maxsw4 (shorts1
, 0);
233 shorts1
= minsw4 (shorts1
, clampmask
);
/* Pack both groups back to bytes and store them over the row in dest. */
235 stl (pkwb (shorts0
), dest
);
236 stl (pkwb (shorts1
), dest
+ 4);
/*
 * DC-only path (presumably the else branch of the test above -- the else
 * itself is not visible).  p0..p7 each hold one 8-pixel row of dest.
 */
245 uint64_t p0
, p1
, p2
, p3
, p4
, p5
, p6
, p7
;
/*
 * DC is block[0] divided by 128 with rounding (+64, then >> 7).
 * block[0] and block[63] are then cleared.
 */
248 DC
= (block
[0] + 64) >> 7;
249 block
[0] = block
[63] = 0;
251 p0
= ldq (dest
+ 0 * stride
);
252 p1
= ldq (dest
+ 1 * stride
);
253 p2
= ldq (dest
+ 2 * stride
);
254 p3
= ldq (dest
+ 3 * stride
);
255 p4
= ldq (dest
+ 4 * stride
);
256 p5
= ldq (dest
+ 5 * stride
);
257 p6
= ldq (dest
+ 6 * stride
);
258 p7
= ldq (dest
+ 7 * stride
);
/*
 * Saturating add: DC is capped at 255, and each row gains
 * minub8 (DCs, ~p).  Since a byte of ~p is 255 minus the pixel, the
 * amount added can never push a pixel past 255.
 * NOTE(review): assumes BYTE_VEC replicates its argument into all eight
 * bytes and minub8 is a per-byte unsigned minimum -- confirm in
 * alpha_asm.h.
 */
261 DCs
= BYTE_VEC (likely (DC
<= 255) ? DC
: 255);
262 p0
+= minub8 (DCs
, ~p0
);
263 p1
+= minub8 (DCs
, ~p1
);
264 p2
+= minub8 (DCs
, ~p2
);
265 p3
+= minub8 (DCs
, ~p3
);
266 p4
+= minub8 (DCs
, ~p4
);
267 p5
+= minub8 (DCs
, ~p5
);
268 p6
+= minub8 (DCs
, ~p6
);
269 p7
+= minub8 (DCs
, ~p7
);
/*
 * Saturating subtract: uses -DC capped at 255, and each row loses
 * minub8 (DCs, p), so the amount removed never exceeds the pixel itself.
 * NOTE(review): this sequence is presumably the alternative to the add
 * sequence above, selected by the sign of DC -- the selecting test is not
 * visible.
 */
271 DCs
= BYTE_VEC (likely (-DC
<= 255) ? -DC
: 255);
272 p0
-= minub8 (DCs
, p0
);
273 p1
-= minub8 (DCs
, p1
);
274 p2
-= minub8 (DCs
, p2
);
275 p3
-= minub8 (DCs
, p3
);
276 p4
-= minub8 (DCs
, p4
);
277 p5
-= minub8 (DCs
, p5
);
278 p6
-= minub8 (DCs
, p6
);
279 p7
-= minub8 (DCs
, p7
);
/* Write the eight updated rows back to dest. */
282 stq (p0
, dest
+ 0 * stride
);
283 stq (p1
, dest
+ 1 * stride
);
284 stq (p2
, dest
+ 2 * stride
);
285 stq (p3
, dest
+ 3 * stride
);
286 stq (p4
, dest
+ 4 * stride
);
287 stq (p5
, dest
+ 5 * stride
);
288 stq (p6
, dest
+ 6 * stride
);
289 stq (p7
, dest
+ 7 * stride
);
/*
 * IDCT-and-copy entry point, plain C variant.  Runs idct_row() on each of
 * the eight rows (block + 8 * i) and idct_col() on each of the eight
 * columns (block + i), then stores CLIP (block[k]) into dest[k] for
 * k = 0..7.
 *
 * block  - 64 coefficients, transformed in place.
 * dest   - output pixels.
 * stride - not used in the visible lines; presumably the distance between
 *          output rows -- TODO confirm.
 *
 * NOTE(review): the function braces, the declaration of i, and whatever
 * loop repeats the eight stores for each row (advancing block and dest)
 * are not visible in this excerpt.
 */
293 void mpeg2_idct_copy_alpha (int16_t * block
, uint8_t * dest
, const int stride
)
297 for (i
= 0; i
< 8; i
++)
298 idct_row (block
+ 8 * i
);
299 for (i
= 0; i
< 8; i
++)
300 idct_col (block
+ i
);
/* One row of output: each value goes through the mpeg2_clip table. */
302 dest
[0] = CLIP (block
[0]);
303 dest
[1] = CLIP (block
[1]);
304 dest
[2] = CLIP (block
[2]);
305 dest
[3] = CLIP (block
[3]);
306 dest
[4] = CLIP (block
[4]);
307 dest
[5] = CLIP (block
[5]);
308 dest
[6] = CLIP (block
[6]);
309 dest
[7] = CLIP (block
[7]);
/*
 * IDCT-and-add entry point, plain C variant.  Two code paths are visible:
 *   - a full path that runs idct_row()/idct_col() over the whole block and
 *     stores CLIP (block[k] + dest[k]) into dest[k];
 *   - a DC-only path that derives a single value DC from block[0] and
 *     stores CLIP (DC + dest[k]) into dest[k].
 *
 * last   - compared against 129 to choose the path.
 * block  - 64 coefficients; transformed in place on the full path, and
 *          block[0] and block[63] are cleared on the DC-only path.
 * dest   - pixels that are read, modified and written back.
 * stride - not used in the visible lines; presumably the distance between
 *          rows of dest -- TODO confirm.
 *
 * NOTE(review): the function braces, the declarations of i and DC, the
 * else between the two paths, and the loops that repeat the eight stores
 * for each row are not visible in this excerpt.
 */
319 void mpeg2_idct_add_alpha (const int last
, int16_t * block
,
320 uint8_t * dest
, const int stride
)
/*
 * Full path is taken when last != 129, or when bits 4..6 of block[0]
 * equal binary 100.
 */
324 if (last
!= 129 || (block
[0] & (7 << 4)) == (4 << 4)) {
325 for (i
= 0; i
< 8; i
++)
326 idct_row (block
+ 8 * i
);
327 for (i
= 0; i
< 8; i
++)
328 idct_col (block
+ i
);
/* One row: add the IDCT result to the existing pixel, then clip. */
330 dest
[0] = CLIP (block
[0] + dest
[0]);
331 dest
[1] = CLIP (block
[1] + dest
[1]);
332 dest
[2] = CLIP (block
[2] + dest
[2]);
333 dest
[3] = CLIP (block
[3] + dest
[3]);
334 dest
[4] = CLIP (block
[4] + dest
[4]);
335 dest
[5] = CLIP (block
[5] + dest
[5]);
336 dest
[6] = CLIP (block
[6] + dest
[6]);
337 dest
[7] = CLIP (block
[7] + dest
[7]);
/*
 * DC-only path (presumably the else branch of the test above -- the else
 * itself is not visible).  DC is block[0] divided by 128 with rounding
 * (+64, then >> 7); block[0] and block[63] are then cleared.
 */
348 DC
= (block
[0] + 64) >> 7;
349 block
[0] = block
[63] = 0;
/* One row: add the same DC value to every pixel, then clip. */
352 dest
[0] = CLIP (DC
+ dest
[0]);
353 dest
[1] = CLIP (DC
+ dest
[1]);
354 dest
[2] = CLIP (DC
+ dest
[2]);
355 dest
[3] = CLIP (DC
+ dest
[3]);
356 dest
[4] = CLIP (DC
+ dest
[4]);
357 dest
[5] = CLIP (DC
+ dest
[5]);
358 dest
[6] = CLIP (DC
+ dest
[6]);
359 dest
[7] = CLIP (DC
+ dest
[7]);
/*
 * Rewrites all 64 entries of mpeg2_scan_norm and mpeg2_scan_alt in place.
 * Each entry has its low six bits permuted: (j & 0x36) >> 1 moves bits
 * 1, 2, 4 and 5 down one position, and (j & 0x09) << 2 moves bits 0 and 3
 * up two positions.  In other words, each of the two 3-bit groups is
 * rotated right by one bit.  Any bits above bit 5 are dropped.
 *
 * Because the tables are modified in place, a second call would apply the
 * permutation a second time.
 *
 * NOTE(review): the function braces and the declarations of i and j are
 * not visible in this excerpt; the tables are declared elsewhere.
 */
365 void mpeg2_idct_alpha_init (void)
369 for (i
= 0; i
< 64; i
++) {
370 j
= mpeg2_scan_norm
[i
];
371 mpeg2_scan_norm
[i
] = ((j
& 0x36) >> 1) | ((j
& 0x09) << 2);
372 j
= mpeg2_scan_alt
[i
];
373 mpeg2_scan_alt
[i
] = ((j
& 0x36) >> 1) | ((j
& 0x09) << 2);
377 #endif /* ARCH_ALPHA */