/*****************************************************************************
 * vdec_motion_inner_mmx.c : motion compensation inner routines optimized in
 *                           MMX
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
 * $Id: vdec_motion_inner_mmx.c,v 1.3 2001/06/07 22:14:55 sam Exp $
 *
 * Authors: Christophe Massiot <massiot@via.ecp.fr>, largely inspired by the
 *          work done by the livid project <http://www.linuxvideo.org/>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
 *****************************************************************************/

#define MODULE_NAME motionmmx
#include "modules_inner.h"

/*****************************************************************************
 * Preamble
 *****************************************************************************/
#include "defs.h"

#include "config.h"
#include "common.h"
#include "threads.h"
#include "mtime.h"

#include "video.h"

#include "attributes.h"
#include "mmx.h"
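
/* "mmx.h" provides the movq_* / punpck* / paddw_* / psraw_* / packuswb_*
 * wrapper macros used throughout this file; each one expands to a single
 * inline-asm MMX instruction operating on the mm0-mm7 registers. */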

/* OK, I know, this code has been taken from livid's mpeg2dec --Meuuh */

/* Some rounding constants */
mmx_t round1 = {0x0001000100010001LL};
mmx_t round4 = {0x0002000200020002LL};
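
/* Both constants replicate one value in all four 16-bit lanes: round1 adds 1
 * before the ">> 1" that implements (a + b + 1) / 2, and round4 adds 2 before
 * the ">> 2" that implements (a + b + c + d + 2) / 4, so the averages below
 * round to nearest as the scalar formulas in the comments require. */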

/*****************************************************************************
 * Useful functions
 *****************************************************************************/
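
/* All of the averaging helpers below expect mm0 to hold zero (set up by
 * MMXZeroReg()); it is the second operand of every punpcklbw/punpckhbw,
 * widening the 8-bit pixels to 16-bit words before the additions. */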

static __inline__ void MMXZeroReg()
{
    /* load 0 into mm0 */
    pxor_r2r(mm0,mm0);
}

static __inline__ void MMXAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    //
    // *dst = clip_to_u8((*src1 + *src2 + 1)/2);
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows to mm1
    paddw_m2r(round1,mm1);
    psraw_i2r(1,mm1);           // /2

    paddw_r2r(mm4,mm2);         // add highs to mm2
    paddw_m2r(round1,mm2);
    psraw_i2r(1,mm2);           // /2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
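
/* Each helper processes 8 pixels per call (one movq-wide load and store);
 * the 16-pixel-wide motion routines further down therefore call them twice,
 * at byte offsets 0 and +8. */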

static __inline__ void MMXInterpAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    //
    // *dst = clip_to_u8((*dst + (*src1 + *src2 + 1)/2 + 1)/2);
    //

    movq_m2r(*dst,mm1);         // load 8 dst bytes
    movq_r2r(mm1,mm2);          // copy 8 dst bytes

    movq_m2r(*src1,mm3);        // load 8 src1 bytes
    movq_r2r(mm3,mm4);          // copy 8 src1 bytes

    movq_m2r(*src2,mm5);        // load 8 src2 bytes
    movq_r2r(mm5,mm6);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low dst bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high dst bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src2 bytes

    paddw_r2r(mm5,mm3);         // add lows
    paddw_m2r(round1,mm3);
    psraw_i2r(1,mm3);           // /2

    paddw_r2r(mm6,mm4);         // add highs
    paddw_m2r(round1,mm4);
    psraw_i2r(1,mm4);           // /2

    paddw_r2r(mm3,mm1);         // add lows
    paddw_m2r(round1,mm1);
    psraw_i2r(1,mm1);           // /2

    paddw_r2r(mm4,mm2);         // add highs
    paddw_m2r(round1,mm2);
    psraw_i2r(1,mm2);           // /2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}

static __inline__ void MMXAverage4( u8 *dst, u8 *src1, u8 *src2, u8 *src3,
                                    u8 *src4 )
{
    //
    // *dst = (*src1 + *src2 + *src3 + *src4 + 2) / 4;
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);        // load 8 src3 bytes
    movq_r2r(mm3,mm4);          // copy 8 src3 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src3 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    movq_m2r(*src4,mm5);        // load 8 src4 bytes
    movq_r2r(mm5,mm6);          // copy 8 src4 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src4 bytes

    paddw_r2r(mm5,mm1);         // add lows
    paddw_r2r(mm6,mm2);         // add highs

    // now have subtotal in mm1 and mm2

    paddw_m2r(round4,mm1);
    psraw_i2r(2,mm1);           // /4
    paddw_m2r(round4,mm2);
    psraw_i2r(2,mm2);           // /4

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}

static __inline__ void MMXInterpAverage4( u8 *dst, u8 *src1, u8 *src2,
                                          u8 *src3, u8 *src4 )
{
    //
    // *dst = clip_to_u8((*dst + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2);
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);        // load 8 src3 bytes
    movq_r2r(mm3,mm4);          // copy 8 src3 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src3 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    movq_m2r(*src4,mm5);        // load 8 src4 bytes
    movq_r2r(mm5,mm6);          // copy 8 src4 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src4 bytes

    paddw_r2r(mm5,mm1);         // add lows
    paddw_r2r(mm6,mm2);         // add highs

    paddw_m2r(round4,mm1);
    psraw_i2r(2,mm1);           // /4
    paddw_m2r(round4,mm2);
    psraw_i2r(2,mm2);           // /4

    // now have subtotal/4 in mm1 and mm2

    movq_m2r(*dst,mm3);         // load 8 dst bytes
    movq_r2r(mm3,mm4);          // copy 8 dst bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low dst bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high dst bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    paddw_m2r(round1,mm1);
    psraw_i2r(1,mm1);           // /2
    paddw_m2r(round1,mm2);
    psraw_i2r(1,mm2);           // /2

    // now have end value in mm1 and mm2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}

/*****************************************************************************
 * Actual Motion compensation
 *****************************************************************************/
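
/*
 * Naming convention of the routines generated below (deduced from the macro
 * bodies): a lower-case x or y means the motion vector has an integer
 * component in that direction, an upper-case X or Y means a half-pel
 * component, handled by averaging two horizontally (X) or vertically (Y)
 * adjacent reference pixels, or all four neighbours (X_Y).  "copy" simply
 * writes the prediction to p_dest, while "avg" additionally averages it with
 * what is already stored there (used when a second prediction is combined
 * with a first one, e.g. in B pictures).
 *
 * For reference, a scalar sketch of the heaviest case, X_Y_avg, per pixel,
 * following the formulas documented with the helpers above:
 *
 *     p_dest[x] = ( p_dest[x]
 *                   + ( p_src[x] + p_src[x+1]
 *                       + p_src[x+i_stride] + p_src[x+i_stride+1] + 2 ) / 4
 *                   + 1 ) / 2;
 */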

#define pavg_r2r(src,dest)      pavgusb_r2r (src, dest);
#define pavg_m2r(src,dest)      pavgusb_m2r (src, dest);
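
/* pavgusb is the 3DNow! packed average of unsigned bytes; these pavg
 * wrappers are defined here but are not referenced by the plain-MMX routines
 * below, which do their averaging with paddw/psraw instead (presumably the
 * wrappers are shared with a 3DNow!-optimised variant of this code). */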

#define __MotionComponent_x_y_copy(width,height) \
void _M(MotionComponent_x_y_copy_##width##_##height)(yuv_data_t * p_src, \
                                                     yuv_data_t * p_dest, \
                                                     int i_stride) \
{ \
    int i_y; \
    MMXZeroReg(); \
    for( i_y = 0; i_y < height; i_y ++ ) \
    { \
        movq_m2r( *p_src, mm0 );      /* load 8 ref bytes */ \
        if( width == 16 ) \
            movq_m2r( *(p_src + 8), mm1 ); \
        p_src += i_stride; \
        movq_r2m( mm0, *p_dest );     /* store 8 bytes at curr */ \
        if( width == 16 ) \
            movq_r2m( mm1, *(p_dest + 8) ); \
        p_dest += i_stride; \
    } \
}

#define __MotionComponent_X_y_copy(width,height) \
void _M(MotionComponent_X_y_copy_##width##_##height)(yuv_data_t * p_src, \
                                                     yuv_data_t * p_dest, \
                                                     int i_stride) \
{ \
    int i_y; \
    MMXZeroReg(); \
    for( i_y = 0; i_y < height; i_y ++ ) \
    { \
        MMXAverage2( p_dest, p_src, p_src + 1 ); \
        if( width == 16 ) \
        { \
            MMXAverage2( p_dest + 8, p_src + 8, p_src + 9 ); \
        } \
        p_dest += i_stride; \
        p_src += i_stride; \
    } \
}

#define __MotionComponent_x_Y_copy(width,height) \
void _M(MotionComponent_x_Y_copy_##width##_##height)(yuv_data_t * p_src, \
                                                     yuv_data_t * p_dest, \
                                                     int i_stride) \
{ \
    int i_y; \
    yuv_data_t * p_next_src = p_src + i_stride; \
    MMXZeroReg(); \
    for( i_y = 0; i_y < height; i_y ++ ) \
    { \
        MMXAverage2( p_dest, p_src, p_next_src ); \
        if( width == 16 ) \
        { \
            MMXAverage2( p_dest + 8, p_src + 8, p_next_src + 8 ); \
        } \
        p_dest += i_stride; \
        p_src += i_stride; \
        p_next_src += i_stride; \
    } \
}

#define __MotionComponent_X_Y_copy(width,height) \
void _M(MotionComponent_X_Y_copy_##width##_##height)(yuv_data_t * p_src, \
                                                     yuv_data_t * p_dest, \
                                                     int i_stride) \
{ \
    int i_y; \
    yuv_data_t * p_next_src = p_src + i_stride; \
    MMXZeroReg(); \
    for( i_y = 0; i_y < height; i_y ++ ) \
    { \
        MMXAverage4( p_dest, p_src, p_src + 1, p_next_src, p_next_src + 1 ); \
        if( width == 16 ) \
        { \
            MMXAverage4( p_dest + 8, p_src + 8, p_src + 9, \
                         p_next_src + 8, p_next_src + 9 ); \
        } \
        p_dest += i_stride; \
        p_src += i_stride; \
        p_next_src += i_stride; \
    } \
}

#define __MotionComponent_x_y_avg(width,height) \
void _M(MotionComponent_x_y_avg_##width##_##height)(yuv_data_t * p_src, \
                                                    yuv_data_t * p_dest, \
                                                    int i_stride) \
{ \
    int i_y; \
    MMXZeroReg(); \
    for( i_y = 0; i_y < height; i_y ++ ) \
    { \
        MMXAverage2( p_dest, p_dest, p_src ); \
        if( width == 16 ) \
        { \
            MMXAverage2( p_dest + 8, p_dest + 8, p_src + 8 ); \
        } \
        p_dest += i_stride; \
        p_src += i_stride; \
    } \
}

#define __MotionComponent_X_y_avg(width,height) \
void _M(MotionComponent_X_y_avg_##width##_##height)(yuv_data_t * p_src, \
                                                    yuv_data_t * p_dest, \
                                                    int i_stride) \
{ \
    int i_y; \
    MMXZeroReg(); \
    for( i_y = 0; i_y < height; i_y ++ ) \
    { \
        MMXInterpAverage2( p_dest, p_src, p_src + 1 ); \
        if( width == 16 ) \
        { \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_src + 9 ); \
        } \
        p_dest += i_stride; \
        p_src += i_stride; \
    } \
}

#define __MotionComponent_x_Y_avg(width,height) \
void _M(MotionComponent_x_Y_avg_##width##_##height)(yuv_data_t * p_src, \
                                                    yuv_data_t * p_dest, \
                                                    int i_stride) \
{ \
    int i_y; \
    yuv_data_t * p_next_src = p_src + i_stride; \
    MMXZeroReg(); \
    for( i_y = 0; i_y < height; i_y ++ ) \
    { \
        MMXInterpAverage2( p_dest, p_src, p_next_src ); \
        if( width == 16 ) \
        { \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_next_src + 8 ); \
        } \
        p_dest += i_stride; \
        p_src += i_stride; \
        p_next_src += i_stride; \
    } \
}

#define __MotionComponent_X_Y_avg(width,height) \
void _M(MotionComponent_X_Y_avg_##width##_##height)(yuv_data_t * p_src, \
                                                    yuv_data_t * p_dest, \
                                                    int i_stride) \
{ \
    int i_y; \
    yuv_data_t * p_next_src = p_src + i_stride; \
    MMXZeroReg(); \
    for( i_y = 0; i_y < height; i_y ++ ) \
    { \
        MMXInterpAverage4( p_dest, p_src, p_src + 1, p_next_src, \
                           p_next_src + 1 ); \
        if( width == 16 ) \
        { \
            MMXInterpAverage4( p_dest + 8, p_src + 8, p_src + 9, \
                               p_next_src + 8, p_next_src + 9 ); \
        } \
        p_dest += i_stride; \
        p_src += i_stride; \
        p_next_src += i_stride; \
    } \
}

#define __MotionComponents(width,height) \
__MotionComponent_x_y_copy(width,height) \
__MotionComponent_X_y_copy(width,height) \
__MotionComponent_x_Y_copy(width,height) \
__MotionComponent_X_Y_copy(width,height) \
__MotionComponent_x_y_avg(width,height) \
__MotionComponent_X_y_avg(width,height) \
__MotionComponent_x_Y_avg(width,height) \
__MotionComponent_X_Y_avg(width,height)
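
/* Each __MotionComponents(width,height) expands to the eight routines
 * defined above (the four half-pel cases, each in a "copy" and an "avg"
 * flavour) for one block size. */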

__MotionComponents (16,16)      /* 444, 422, 420 */
__MotionComponents (16,8)       /* 444, 422, 420 */
__MotionComponents (8,8)        /* 422, 420 */
__MotionComponents (8,4)        /* 420 */
#if 0
__MotionComponents (8,16)       /* 422 */
#endif