1 diff --git a/common/common.c b/common/common.c
\r
2 index 71a29b1..301b9ed 100644
\r
3 --- a/common/common.c
\r
4 +++ b/common/common.c
\r
5 @@ -117,6 +117,7 @@ void x264_param_default( x264_param_t *param )
\r
6 | X264_ANALYSE_PSUB16x16 | X264_ANALYSE_BSUB16x16;
\r
7 param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
\r
8 param->analyse.i_me_method = X264_ME_HEX;
\r
9 + param->analyse.f_psy_rd = 1.0;
\r
10 param->analyse.i_me_range = 16;
\r
11 param->analyse.i_subpel_refine = 5;
\r
12 param->analyse.b_chroma_me = 1;
\r
13 @@ -464,6 +465,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
\r
14 p->analyse.i_mv_range_thread = atoi(value);
\r
15 OPT2("subme", "subq")
\r
16 p->analyse.i_subpel_refine = atoi(value);
\r
18 + p->analyse.f_psy_rd = atof(value);
\r
20 p->analyse.b_bidir_me = atobool(value);
\r
22 @@ -856,6 +859,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
\r
23 s += sprintf( s, " analyse=%#x:%#x", p->analyse.intra, p->analyse.inter );
\r
24 s += sprintf( s, " me=%s", x264_motion_est_names[ p->analyse.i_me_method ] );
\r
25 s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine );
\r
26 + s += sprintf( s, " psy_rd=%f", p->analyse.f_psy_rd );
\r
27 s += sprintf( s, " brdo=%d", p->analyse.b_bframe_rdo );
\r
28 s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references );
\r
29 s += sprintf( s, " me_range=%d", p->analyse.i_me_range );
\r
30 diff --git a/common/common.h b/common/common.h
\r
31 index e2792cc..fbd88fd 100644
\r
32 --- a/common/common.h
\r
33 +++ b/common/common.h
\r
34 @@ -454,6 +454,12 @@ struct x264_t
\r
35 DECLARE_ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
\r
36 DECLARE_ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
\r
38 + /* Cached SATD and SA8D scores of the source (fenc) block for psy RD */
\r
39 + int fenc_satd[4][4];
\r
40 + int fenc_satd_sum;
\r
41 + int fenc_sa8d[2][2];
\r
42 + int fenc_sa8d_sum;
\r
44 /* pointer over mb of the frame to be compressed */
\r
47 diff --git a/encoder/analyse.c b/encoder/analyse.c
\r
48 index 270b90a..25346b4 100644
\r
49 --- a/encoder/analyse.c
\r
50 +++ b/encoder/analyse.c
\r
51 @@ -467,6 +467,34 @@ static void predict_4x4_mode_available( unsigned int i_neighbour,
\r
55 +/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
\r
56 +static inline void x264_mb_cache_fenc_satd( x264_t *h )
\r
58 + DECLARE_ALIGNED_16(uint8_t zero[16]) = {0};
\r
60 + int x, y, satd_sum = 0, sa8d_sum = 0;
\r
61 + if( !h->param.analyse.i_psy_rd)
\r
63 + for( y = 0; y < 4; y++ )
\r
64 + for( x = 0; x < 4; x++ )
\r
66 + fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
\r
67 + h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
\r
68 + - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
\r
69 + satd_sum += h->mb.pic.fenc_satd[y][x];
\r
71 + for( y = 0; y < 2; y++ )
\r
72 + for( x = 0; x < 2; x++ )
\r
74 + fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
\r
75 + h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
\r
76 + - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
\r
77 + sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
\r
79 + h->mb.pic.fenc_satd_sum = satd_sum;
\r
80 + h->mb.pic.fenc_sa8d_sum = sa8d_sum;
\r
83 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
\r
86 @@ -1016,12 +1044,15 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
\r
87 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
\r
89 h->mb.i_type = P_L0;
\r
90 - if( a->b_mbrd && a->l0.me16x16.i_ref == 0
\r
91 - && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
\r
94 - h->mb.i_partition = D_16x16;
\r
95 - x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
\r
96 - a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
\r
97 + x264_mb_cache_fenc_satd( h );
\r
98 + if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
\r
100 + h->mb.i_partition = D_16x16;
\r
101 + x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
\r
102 + a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
\r
107 @@ -1906,7 +1937,7 @@ static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
\r
109 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
\r
111 - int thresh = i_satd_inter * 17/16;
\r
112 + int thresh = i_satd_inter * (17 + (!!h->param.analyse.i_psy_rd))/16;
\r
114 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
\r
116 @@ -2067,7 +2098,10 @@ void x264_macroblock_analyse( x264_t *h )
\r
118 x264_mb_analyse_intra( h, &analysis, COST_MAX );
\r
119 if( analysis.b_mbrd )
\r
121 + x264_mb_cache_fenc_satd( h );
\r
122 x264_intra_rd( h, &analysis, COST_MAX );
\r
125 i_cost = analysis.i_satd_i16x16;
\r
126 h->mb.i_type = I_16x16;
\r
127 @@ -2342,6 +2376,9 @@ void x264_macroblock_analyse( x264_t *h )
\r
129 int i_bskip_cost = COST_MAX;
\r
132 + if( analysis.b_mbrd )
\r
133 + x264_mb_cache_fenc_satd( h );
\r
135 h->mb.i_type = B_SKIP;
\r
136 if( h->mb.b_direct_auto_write )
\r
137 diff --git a/encoder/encoder.c b/encoder/encoder.c
\r
138 index 2c2fe8c..ff5febe 100644
\r
139 --- a/encoder/encoder.c
\r
140 +++ b/encoder/encoder.c
\r
141 @@ -411,5 +411,6 @@ static int x264_validate_parameters( x264_t *h )
\r
142 h->param.analyse.b_fast_pskip = 0;
\r
143 h->param.analyse.i_noise_reduction = 0;
\r
144 + h->param.analyse.f_psy_rd = 0;
\r
146 if( h->param.rc.i_rc_method == X264_RC_CQP )
\r
148 @@ -486,6 +487,16 @@ static int x264_validate_parameters( x264_t *h )
\r
149 if( !h->param.b_cabac )
\r
150 h->param.analyse.i_trellis = 0;
\r
151 h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 );
\r
152 + h->param.analyse.f_psy_rd = x264_clip3f( h->param.analyse.f_psy_rd, 0, 10 );
\r
153 + if( h->param.analyse.i_subpel_refine < 6)
\r
154 + h->param.analyse.f_psy_rd = 0;
\r
155 + if( h->param.analyse.f_psy_rd && h->param.analyse.i_trellis == 1 )
\r
157 + x264_log( h, X264_LOG_WARNING, "psy RD is not compatible with trellis=1; use 0 or 2.\n" );
\r
158 + h->param.analyse.i_trellis = 0;
\r
160 + if( h->param.analyse.f_psy_rd )
\r
161 + h->param.analyse.i_psy_rd = FIX8( h->param.analyse.f_psy_rd );
\r
162 h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 );
\r
163 if( h->param.rc.f_aq_strength <= 0 )
\r
164 h->param.rc.i_aq_mode = 0;
\r
165 diff --git a/encoder/rdo.c b/encoder/rdo.c
\r
166 index 76bf57b..7da862a 100644
\r
167 --- a/encoder/rdo.c
\r
168 +++ b/encoder/rdo.c
\r
169 @@ -50,21 +50,82 @@ static uint16_t cabac_prefix_size[15][128];
\r
171 #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
\r
172 sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
\r
174 -static int ssd_mb( x264_t *h )
\r
176 +#define ADD_ABS_SATD(satdtype, pixel)\
\r
177 + satd += abs((h->pixf.satdtype[pixel]( zero, 0, fdec, FDEC_STRIDE ) - dc_coef)\
\r
178 + - sum_##satdtype( h, pixel, x, y ));
\r
180 +/* Sum the cached SATDs to avoid repeating them. */
\r
181 +static inline int sum_satd( x264_t *h, int pixel, int x, int y )
\r
184 + int min_x = x>>2;
\r
185 + int min_y = y>>2;
\r
186 + int max_x = (x>>2) + (x264_pixel_size[pixel].w>>2);
\r
187 + int max_y = (y>>2) + (x264_pixel_size[pixel].h>>2);
\r
188 + if( pixel == PIXEL_16x16 )
\r
189 + return h->mb.pic.fenc_satd_sum;
\r
190 + for( y = min_y; y < max_y; y++ )
\r
191 + for( x = min_x; x < max_x; x++ )
\r
192 + satd += h->mb.pic.fenc_satd[y][x];
\r
196 +static inline int sum_sa8d( x264_t *h, int pixel, int x, int y )
\r
199 + int min_x = x>>3;
\r
200 + int min_y = y>>3;
\r
201 + int max_x = (x>>3) + (x264_pixel_size[pixel].w>>3);
\r
202 + int max_y = (y>>3) + (x264_pixel_size[pixel].h>>3);
\r
203 + if( pixel == PIXEL_16x16 )
\r
204 + return h->mb.pic.fenc_sa8d_sum;
\r
205 + for( y = min_y; y < max_y; y++ )
\r
206 + for( x = min_x; x < max_x; x++ )
\r
207 + sa8d += h->mb.pic.fenc_sa8d[y][x];
\r
211 +/* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */
\r
212 +/* SATD and SA8D are used to measure block complexity. */
\r
213 +/* Blocks with a complexity most similar to that of the source are scored best. */
\r
214 +/* Both the SATD and the SA8D score differences are used to avoid bias from the DCT size. Using SATD */
\r
215 +/* only, for example, results in overusage of 8x8dct, while the opposite occurs when using SA8D. */
\r
216 +/* This is because frequencies stored in an 8x8dct sum up to a larger value when viewed through a 4x4 */
\r
217 +/* transform and vice versa with a 4x4dct and an 8x8 transform. */
\r
218 +/* SSD is still used as the primary RD metric; this value is merely added to it for psy purposes. */
\r
220 +/* FIXME: Is there a better metric than averaged SATD/SA8D difference for complexity difference? */
\r
221 +/* Hadamard transform is recursive, so a SATD+SA8D can be done faster by taking advantage of this fact. */
\r
222 +/* This optimization can also be used in non-RD transform decision. */
\r
224 +static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
\r
226 - return h->pixf.ssd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
\r
227 - h->mb.pic.p_fdec[0], FDEC_STRIDE )
\r
228 - + h->pixf.ssd[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE,
\r
229 - h->mb.pic.p_fdec[1], FDEC_STRIDE )
\r
230 - + h->pixf.ssd[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE,
\r
231 - h->mb.pic.p_fdec[2], FDEC_STRIDE );
\r
232 + DECLARE_ALIGNED_16(uint8_t zero[16]) = {0};
\r
234 + uint8_t *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
\r
235 + uint8_t *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
\r
236 + if( p == 0 && h->param.analyse.i_psy_rd )
\r
238 + int dc_coef = h->pixf.sad[size](zero, 0, fdec, FDEC_STRIDE) >> 1;
\r
239 + ADD_ABS_SATD(satd, size);
\r
240 + /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
\r
241 + if(size <= PIXEL_8x8)
\r
244 + ADD_ABS_SATD(sa8d, size);
\r
247 + satd = (satd * h->param.analyse.i_psy_rd * x264_lambda_tab[h->mb.i_qp] + 128) >> 8;
\r
249 + return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd;
\r
252 -static int ssd_plane( x264_t *h, int size, int p, int x, int y )
\r
253 +static inline int ssd_mb( x264_t *h )
\r
255 - return h->pixf.ssd[size]( h->mb.pic.p_fenc[p] + x+y*FENC_STRIDE, FENC_STRIDE,
\r
256 - h->mb.pic.p_fdec[p] + x+y*FDEC_STRIDE, FDEC_STRIDE );
\r
257 + return ssd_plane(h, PIXEL_16x16, 0, 0, 0)
\r
258 + + ssd_plane(h, PIXEL_8x8, 1, 0, 0)
\r
259 + + ssd_plane(h, PIXEL_8x8, 2, 0, 0);
\r
262 static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
\r
263 diff --git a/x264.c b/x264.c
\r
264 index 14466e5..96326cd 100644
\r
267 @@ -243,6 +243,9 @@ static void Help( x264_param_t *defaults, int b_longhelp )
\r
268 H0( " -m, --subme <integer> Subpixel motion estimation and partition\n"
\r
269 " decision quality: 1=fast, 7=best. [%d]\n", defaults->analyse.i_subpel_refine );
\r
270 H0( " --b-rdo RD based mode decision for B-frames. Requires subme 6.\n" );
\r
271 + H0( " --psy-rd Strength of mode decision psychovisual optimization [\"%f\"]\n"
\r
272 + " Does nothing at subme < 6.",
\r
273 + defaults->analyse.f_psy_rd );
\r
274 H0( " --mixed-refs Decide references on a per partition basis\n" );
\r
275 H1( " --no-chroma-me Ignore chroma in motion estimation\n" );
\r
276 H1( " --bime Jointly optimize both MVs in B-frames\n" );
\r
277 @@ -411,6 +414,7 @@ static int Parse( int argc, char **argv,
\r
278 { "mvrange", required_argument, NULL, 0 },
\r
279 { "mvrange-thread", required_argument, NULL, 0 },
\r
280 { "subme", required_argument, NULL, 'm' },
\r
281 + { "psy-rd", required_argument, NULL, 0 },
\r
282 { "b-rdo", no_argument, NULL, 0 },
\r
283 { "mixed-refs", no_argument, NULL, 0 },
\r
284 { "no-chroma-me", no_argument, NULL, 0 },
\r
285 diff --git a/x264.h b/x264.h
\r
286 index 3b678dc..02266c1 100644
\r
289 @@ -239,6 +239,8 @@ typedef struct x264_param_t
\r
290 int b_fast_pskip; /* early SKIP detection on P-frames */
\r
291 int b_dct_decimate; /* transform coefficient thresholding on P-frames */
\r
292 int i_noise_reduction; /* adaptive pseudo-deadzone */
\r
293 + float f_psy_rd; /* Psy RD strength */
\r
294 + int i_psy_rd; /* Psy RD strength--fixed point value*/
\r
296 /* the deadzone size that will be used in luma quantization */
\r
297 int i_luma_deadzone[2]; /* {inter, intra} */
\r