/*****************************************************************************
 * mc.c: h264 encoder library (Motion Compensation)
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *****************************************************************************/
/* Rounded average of two pixel blocks.
 *
 * For every pixel of an i_width x i_height block, writes
 *     dst[x] = ( src1[x] + src2[x] + 1 ) >> 1
 * Each of the three buffers is walked with its own stride (distance, in
 * pixels, between the starts of two consecutive rows). */
static inline void pixel_avg( uint8_t *dst,  int i_dst_stride,
                              uint8_t *src1, int i_src1_stride,
                              uint8_t *src2, int i_src2_stride,
                              int i_width, int i_height )
{
    int x, y;
    for( y = 0; y < i_height; y++ )
    {
        for( x = 0; x < i_width; x++ )
            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;

        /* move all three buffers to the next row */
        dst  += i_dst_stride;
        src1 += i_src1_stride;
        src2 += i_src2_stride;
    }
}
/* Rounded average of two width x height pixel blocks:
 *     dst[x] = ( src1[x] + src2[x] + 1 ) >> 1
 * i_dst, i_src1 and i_src2 are the row strides of the respective buffers. */
static inline void pixel_avg_wxh( uint8_t *dst,  int i_dst,
                                  uint8_t *src1, int i_src1,
                                  uint8_t *src2, int i_src2,
                                  int width, int height )
{
    int x, y;
    for( y = 0; y < height; y++ )
    {
        for( x = 0; x < width; x++ )
            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;

        /* next row in every buffer */
        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}
/* Implicit weighted bipred only:
 * assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
#define op_scale2(x) dst[x] = x264_clip_uint8( (src1[x]*i_weight1 + src2[x]*i_weight2 + (1<<5)) >> 6 )

/* Weighted average of two pixel blocks.
 *
 * Each output pixel is (src1*i_weight1 + src2*(64 - i_weight1) + 32) >> 6,
 * passed through x264_clip_uint8(). The row is unrolled by hand: widths 2, 4
 * and 8 stop early, any other width processes 16 pixels per row.
 * i_dst, i_src1 and i_src2 are the row strides of the respective buffers. */
static inline void pixel_avg_weight_wxh( uint8_t *dst,  int i_dst,
                                         uint8_t *src1, int i_src1,
                                         uint8_t *src2, int i_src2,
                                         int width, int height, int i_weight1 )
{
    int y;
    const int i_weight2 = 64 - i_weight1;
    for( y = 0; y<height; y++, dst += i_dst, src1 += i_src1, src2 += i_src2 )
    {
        op_scale2(0);
        op_scale2(1);
        if(width==2) continue;
        op_scale2(2);
        op_scale2(3);
        if(width==4) continue;
        op_scale2(4);
        op_scale2(5);
        op_scale2(6);
        op_scale2(7);
        if(width==8) continue;
        op_scale2(8);
        op_scale2(9);
        op_scale2(10);
        op_scale2(11);
        op_scale2(12);
        op_scale2(13);
        op_scale2(14);
        op_scale2(15);
    }
}
#undef op_scale2
/* Generates a fixed-size averaging function `name` that writes the
 * width x height average of pix2 and pix3 into pix1.
 *
 * weight is the weight applied to pix2 out of 64. A weight of 32 is an even
 * split, so the plain rounded average is used; any other value goes through
 * the weighted path. */
#define PIXEL_AVG_C( name, width, height ) \
static void name( uint8_t *pix1, int i_stride_pix1, \
                  uint8_t *pix2, int i_stride_pix2, \
                  uint8_t *pix3, int i_stride_pix3, int weight ) \
{ \
    if( weight == 32 ) \
        pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
    else \
        pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \
}
PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
PIXEL_AVG_C( pixel_avg_16x8,  16, 8 )
PIXEL_AVG_C( pixel_avg_8x16,  8, 16 )
PIXEL_AVG_C( pixel_avg_8x8,   8, 8 )
PIXEL_AVG_C( pixel_avg_8x4,   8, 4 )
PIXEL_AVG_C( pixel_avg_4x8,   4, 8 )
PIXEL_AVG_C( pixel_avg_4x4,   4, 4 )
PIXEL_AVG_C( pixel_avg_4x2,   4, 2 )
PIXEL_AVG_C( pixel_avg_2x4,   2, 4 )
PIXEL_AVG_C( pixel_avg_2x2,   2, 2 )
/* Copies an i_width x i_height block of pixels from src to dst, one row at a
 * time. Note the argument order: source first, destination second.
 * The two buffers must not overlap (rows are copied with memcpy). */
static void mc_copy( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
{
    int y;

    for( y = 0; y < i_height; y++ )
    {
        memcpy( dst, src, i_width );

        src += i_src_stride;
        dst += i_dst_stride;
    }
}
/* 6-tap filter with coefficients (1, -5, 20, 20, -5, 1) applied around
 * position x of `pix`, stepping by d elements. Relies on a variable named x
 * being in scope at the point of use. */
#define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))

/* Builds the three filtered planes for a width x height region of src.
 *
 *   dsth: horizontal filter of src
 *   dstv: vertical filter of src
 *   dstc: horizontal filter applied to the unrounded vertical results
 *
 * All four planes share `stride`. The vertical pass runs for x in
 * [-2, width+3) so the centre pass has the taps it needs; dstv is therefore
 * written 2 pixels before and 3 pixels after each row, and src is read 2 rows
 * above to 3 rows below the region, and 2 pixels before to 3 pixels after
 * each row.
 *
 * NOTE(review): the x264_malloc() result is used unchecked - confirm its
 * failure behaviour. */
static void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                         int stride, int width, int height )
{
    /* one row of unrounded vertical results, covering x = -2 .. width+2 */
    int16_t *buf = x264_malloc((width+5)*sizeof(int16_t));
    int x, y;
    for( y=0; y<height; y++ )
    {
        for( x=-2; x<width+3; x++ )
        {
            int v = TAPFILTER(src,stride);
            dstv[x] = x264_clip_uint8((v + 16) >> 5);
            buf[x+2] = v; /* keep full precision for the centre pass */
        }
        for( x=0; x<width; x++ )
            dstc[x] = x264_clip_uint8((TAPFILTER(buf+2,1) + 512) >> 10);
        for( x=0; x<width; x++ )
            dsth[x] = x264_clip_uint8((TAPFILTER(src,1) + 16) >> 5);
        dsth += stride;
        dstv += stride;
        dstc += stride;
        src += stride;
    }
    x264_free(buf);
}
/* Source-plane selectors, indexed by qpel_idx = ((mvy&3)<<2) + (mvx&3).
 * hpel_ref0 picks the first plane to read; hpel_ref1 picks the second plane,
 * which is only used when the position needs averaging. */
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
162 static void mc_luma( uint8_t *dst
, int i_dst_stride
,
163 uint8_t *src
[4], int i_src_stride
,
165 int i_width
, int i_height
)
167 int qpel_idx
= ((mvy
&3)<<2) + (mvx
&3);
168 int offset
= (mvy
>>2)*i_src_stride
+ (mvx
>>2);
169 uint8_t *src1
= src
[hpel_ref0
[qpel_idx
]] + offset
+ ((mvy
&3) == 3) * i_src_stride
;
171 if( qpel_idx
& 5 ) /* qpel interpolation needed */
173 uint8_t *src2
= src
[hpel_ref1
[qpel_idx
]] + offset
+ ((mvx
&3) == 3);
174 pixel_avg( dst
, i_dst_stride
, src1
, i_src_stride
,
175 src2
, i_src_stride
, i_width
, i_height
);
179 mc_copy( src1
, i_src_stride
, dst
, i_dst_stride
, i_width
, i_height
);
183 static uint8_t *get_ref( uint8_t *dst
, int *i_dst_stride
,
184 uint8_t *src
[4], int i_src_stride
,
186 int i_width
, int i_height
)
188 int qpel_idx
= ((mvy
&3)<<2) + (mvx
&3);
189 int offset
= (mvy
>>2)*i_src_stride
+ (mvx
>>2);
190 uint8_t *src1
= src
[hpel_ref0
[qpel_idx
]] + offset
+ ((mvy
&3) == 3) * i_src_stride
;
192 if( qpel_idx
& 5 ) /* qpel interpolation needed */
194 uint8_t *src2
= src
[hpel_ref1
[qpel_idx
]] + offset
+ ((mvx
&3) == 3);
195 pixel_avg( dst
, *i_dst_stride
, src1
, i_src_stride
,
196 src2
, i_src_stride
, i_width
, i_height
);
201 *i_dst_stride
= i_src_stride
;
/* full chroma mc (ie until 1/8 pixel)
 *
 * Bilinear interpolation of an i_width x i_height block. The low 3 bits of
 * mvx/mvy are the fractional offset in 1/8 pixels, the remaining bits the
 * whole-pixel offset into src. The four weights cA..cD always sum to 64.
 * Reads one extra column and one extra row of src beyond the block, even when
 * the corresponding weight is zero. */
static void mc_chroma( uint8_t *dst, int i_dst_stride,
                       uint8_t *src, int i_src_stride,
                       int mvx, int mvy,
                       int i_width, int i_height )
{
    uint8_t *srcp;
    int x, y;

    const int d8x = mvx&0x07;
    const int d8y = mvy&0x07;

    const int cA = (8-d8x)*(8-d8y);
    const int cB = d8x    *(8-d8y);
    const int cC = (8-d8x)*d8y;
    const int cD = d8x    *d8y;

    src  += (mvy >> 3) * i_src_stride + (mvx >> 3);
    srcp = &src[i_src_stride];

    for( y = 0; y < i_height; y++ )
    {
        for( x = 0; x < i_width; x++ )
        {
            dst[x] = ( cA*src[x]  + cB*src[x+1] +
                       cC*srcp[x] + cD*srcp[x+1] + 32 ) >> 6;
        }
        dst  += i_dst_stride;

        /* the lower source row becomes the upper row of the next line */
        src   = srcp;
        srcp += i_src_stride;
    }
}
/* Generates mc_copy_w<W>: copies a block W pixels wide and i_height rows tall
 * from src to dst. These wrappers take the destination first and forward to
 * mc_copy(), which takes the source first. */
#define MC_COPY(W) \
static void mc_copy_w##W( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_height ) \
{ \
    mc_copy( src, i_src, dst, i_dst, W, i_height ); \
}
MC_COPY( 16 )
MC_COPY( 8 )
MC_COPY( 4 )
/* Copies h rows of w bytes each from src to dst. i_dst and i_src are the row
 * strides of the two planes. The planes must not overlap (rows are copied
 * with memcpy). */
static void plane_copy( uint8_t *dst, int i_dst,
                        uint8_t *src, int i_src, int w, int h )
{
    while( h-- > 0 )
    {
        memcpy( dst, src, w );
        dst += i_dst;
        src += i_src;
    }
}
/* No-op prefetch hook, installed by x264_mc_init() as the default
 * prefetch_fenc implementation. Ignores every argument. */
static void prefetch_fenc_null( uint8_t *pix_y, int stride_y,
                                uint8_t *pix_uv, int stride_uv, int mb_x )
{
    (void)pix_y;
    (void)stride_y;
    (void)pix_uv;
    (void)stride_uv;
    (void)mb_x;
}
/* No-op prefetch hook, installed by x264_mc_init() as the default
 * prefetch_ref implementation. Ignores every argument. */
static void prefetch_ref_null( uint8_t *pix, int stride, int parity )
{
    (void)pix;
    (void)stride;
    (void)parity;
}
/* Portable fallback for the memzero_aligned hook: sets the first n bytes at
 * dst to zero. This C version places no alignment requirement on dst or n. */
static void memzero_aligned( void * dst, int n )
{
    memset( dst, 0, n );
}
272 void x264_frame_init_lowres( x264_t
*h
, x264_frame_t
*frame
)
274 uint8_t *src
= frame
->plane
[0];
275 int i_stride
= frame
->i_stride
[0];
276 int i_height
= frame
->i_lines
[0];
277 int i_width
= frame
->i_width
[0];
280 // duplicate last row and column so that their interpolation doesn't have to be special-cased
281 for( y
=0; y
<i_height
; y
++ )
282 src
[i_width
+y
*i_stride
] = src
[i_width
-1+y
*i_stride
];
283 h
->mc
.memcpy_aligned( src
+i_stride
*i_height
, src
+i_stride
*(i_height
-1), i_width
);
284 h
->mc
.frame_init_lowres_core( src
, frame
->lowres
[0], frame
->lowres
[1], frame
->lowres
[2], frame
->lowres
[3],
285 i_stride
, frame
->i_stride_lowres
, frame
->i_width_lowres
, frame
->i_lines_lowres
);
286 x264_frame_expand_border_lowres( frame
);
288 memset( frame
->i_cost_est
, -1, sizeof(frame
->i_cost_est
) );
290 for( x
= 0; x
< h
->param
.i_bframe
+ 2; x
++ )
291 for( y
= 0; y
< h
->param
.i_bframe
+ 2; y
++ )
292 frame
->i_row_satds
[y
][x
][0] = -1;
294 for( y
= 0; y
<= !!h
->param
.i_bframe
; y
++ )
295 for( x
= 0; x
<= h
->param
.i_bframe
; x
++ )
296 frame
->lowres_mvs
[y
][x
][0][0] = 0x7FFF;
/* Downsamples src0 by 2 in each direction into four width x height planes.
 *
 * Each output pixel is a rounded average of a 2x2 source neighbourhood:
 *   dst0: neighbourhood at source position (2x,   2y)
 *   dsth: shifted one source pixel right  (2x+1, 2y)
 *   dstv: shifted one source pixel down   (2x,   2y+1)
 *   dstc: shifted right and down          (2x+1, 2y+1)
 * The source must be readable up to column 2*width and row 2*height.
 * All four destination planes share dst_stride. */
static void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                                    int src_stride, int dst_stride, int width, int height )
{
    int x, y;
    for( y=0; y<height; y++ )
    {
        uint8_t *src1 = src0+src_stride;
        uint8_t *src2 = src1+src_stride;
        for( x=0; x<width; x++ )
        {
            // slower than naive bilinear, but matches asm
#define FILTER(a,b,c,d) (((((a)+(b)+1)>>1)+(((c)+(d)+1)>>1)+1)>>1)
            dst0[x] = FILTER(src0[2*x  ], src1[2*x  ], src0[2*x+1], src1[2*x+1]);
            dsth[x] = FILTER(src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2]);
            dstv[x] = FILTER(src1[2*x  ], src2[2*x  ], src1[2*x+1], src2[2*x+1]);
            dstc[x] = FILTER(src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2]);
#undef FILTER
        }
        /* two source rows per output row */
        src0 += src_stride*2;
        dst0 += dst_stride;
        dsth += dst_stride;
        dstv += dst_stride;
        dstc += dst_stride;
    }
}
325 void x264_mc_init( int cpu
, x264_mc_functions_t
*pf
)
327 pf
->mc_luma
= mc_luma
;
328 pf
->get_ref
= get_ref
;
329 pf
->mc_chroma
= mc_chroma
;
331 pf
->avg
[PIXEL_16x16
]= pixel_avg_16x16
;
332 pf
->avg
[PIXEL_16x8
] = pixel_avg_16x8
;
333 pf
->avg
[PIXEL_8x16
] = pixel_avg_8x16
;
334 pf
->avg
[PIXEL_8x8
] = pixel_avg_8x8
;
335 pf
->avg
[PIXEL_8x4
] = pixel_avg_8x4
;
336 pf
->avg
[PIXEL_4x8
] = pixel_avg_4x8
;
337 pf
->avg
[PIXEL_4x4
] = pixel_avg_4x4
;
338 pf
->avg
[PIXEL_4x2
] = pixel_avg_4x2
;
339 pf
->avg
[PIXEL_2x4
] = pixel_avg_2x4
;
340 pf
->avg
[PIXEL_2x2
] = pixel_avg_2x2
;
342 pf
->copy_16x16_unaligned
= mc_copy_w16
;
343 pf
->copy
[PIXEL_16x16
] = mc_copy_w16
;
344 pf
->copy
[PIXEL_8x8
] = mc_copy_w8
;
345 pf
->copy
[PIXEL_4x4
] = mc_copy_w4
;
347 pf
->plane_copy
= plane_copy
;
348 pf
->hpel_filter
= hpel_filter
;
350 pf
->prefetch_fenc
= prefetch_fenc_null
;
351 pf
->prefetch_ref
= prefetch_ref_null
;
352 pf
->memcpy_aligned
= memcpy
;
353 pf
->memzero_aligned
= memzero_aligned
;
354 pf
->frame_init_lowres_core
= frame_init_lowres_core
;
357 x264_mc_init_mmx( cpu
, pf
);
360 if( cpu
&X264_CPU_ALTIVEC
)
361 x264_mc_altivec_init( pf
);
365 void x264_frame_filter( x264_t
*h
, x264_frame_t
*frame
, int mb_y
, int b_end
)
367 const int b_interlaced
= h
->sh
.b_mbaff
;
368 const int stride
= frame
->i_stride
[0] << b_interlaced
;
369 const int width
= frame
->i_width
[0];
370 int start
= (mb_y
*16 >> b_interlaced
) - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
371 int height
= ((b_end
? frame
->i_lines
[0] : mb_y
*16) >> b_interlaced
) + 8;
372 int offs
= start
*stride
- 8; // buffer = 3 for 6tap, aligned to 8 for simd
375 if( mb_y
& b_interlaced
)
378 for( y
=0; y
<=b_interlaced
; y
++, offs
+=frame
->i_stride
[0] )
381 frame
->filtered
[1] + offs
,
382 frame
->filtered
[2] + offs
,
383 frame
->filtered
[3] + offs
,
384 frame
->plane
[0] + offs
,
385 stride
, width
+ 16, height
- start
);
388 /* generate integral image:
389 * frame->integral contains 2 planes. in the upper plane, each element is
390 * the sum of an 8x8 pixel region with top-left corner on that point.
391 * in the lower plane, 4x4 sums (needed only with --partitions p4x4). */
393 if( frame
->integral
)
397 memset( frame
->integral
- PADV
* stride
- PADH
, 0, stride
* sizeof(uint16_t) );
402 for( y
= start
; y
< height
; y
++ )
404 uint8_t *ref
= frame
->plane
[0] + y
* stride
- PADH
;
405 uint16_t *line
= frame
->integral
+ (y
+1) * stride
- PADH
+ 1;
406 uint16_t v
= line
[0] = 0;
407 for( x
= 1; x
< stride
-1; x
++ )
408 line
[x
] = v
+= ref
[x
] + line
[x
-stride
] - line
[x
-stride
-1];
412 uint16_t *sum4
= line
+ stride
* (frame
->i_lines
[0] + PADV
*2);
413 for( x
= 1; x
< stride
-8; x
++, line
++, sum4
++ )
415 sum4
[0] = line
[4+4*stride
] - line
[4] - line
[4*stride
] + line
[0];
416 line
[0] += line
[8+8*stride
] - line
[8] - line
[8*stride
];