Copy from Berkano Overlay
[otih-overlay.git] / media-libs / x264 / files / x264-psyrdo-0.5.diff
blobd743fffce908eee49434bfbf58bf6361ccdac0df
1 diff --git a/common/common.c b/common/common.c
2 index 71a29b1..301b9ed 100644
3 --- a/common/common.c
4 +++ b/common/common.c
5 @@ -117,6 +117,7 @@ void x264_param_default( x264_param_t *param )
6 | X264_ANALYSE_PSUB16x16 | X264_ANALYSE_BSUB16x16;
7 param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
8 param->analyse.i_me_method = X264_ME_HEX;
9 + param->analyse.f_psy_rd = 1.0;
10 param->analyse.i_me_range = 16;
11 param->analyse.i_subpel_refine = 5;
12 param->analyse.b_chroma_me = 1;
13 @@ -464,6 +465,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
14 p->analyse.i_mv_range_thread = atoi(value);
15 OPT2("subme", "subq")
16 p->analyse.i_subpel_refine = atoi(value);
17 + OPT("psy-rd")
18 + p->analyse.f_psy_rd = atof(value);
19 OPT("bime")
20 p->analyse.b_bidir_me = atobool(value);
21 OPT("chroma-me")
22 @@ -856,6 +859,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
23 s += sprintf( s, " analyse=%#x:%#x", p->analyse.intra, p->analyse.inter );
24 s += sprintf( s, " me=%s", x264_motion_est_names[ p->analyse.i_me_method ] );
25 s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine );
26 + s += sprintf( s, " psy_rd=%f", p->analyse.f_psy_rd );
27 s += sprintf( s, " brdo=%d", p->analyse.b_bframe_rdo );
28 s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references );
29 s += sprintf( s, " me_range=%d", p->analyse.i_me_range );
30 diff --git a/common/common.h b/common/common.h
31 index e2792cc..fbd88fd 100644
32 --- a/common/common.h
33 +++ b/common/common.h
34 @@ -454,6 +454,12 @@ struct x264_t
35 DECLARE_ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
36 DECLARE_ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
38 + /* SATD scores for psy RD */
39 + int fenc_satd[4][4];
40 + int fenc_satd_sum;
41 + int fenc_sa8d[2][2];
42 + int fenc_sa8d_sum;
44 /* pointer over mb of the frame to be compressed */
45 uint8_t *p_fenc[3];
47 diff --git a/encoder/analyse.c b/encoder/analyse.c
48 index 270b90a..25346b4 100644
49 --- a/encoder/analyse.c
50 +++ b/encoder/analyse.c
51 @@ -467,6 +467,34 @@ static void predict_4x4_mode_available( unsigned int i_neighbour,
55 +/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
56 +static inline void x264_mb_cache_fenc_satd( x264_t *h )
58 + DECLARE_ALIGNED_16(uint8_t zero[16]) = {0};
59 + uint8_t *fenc;
60 + int x, y, satd_sum = 0, sa8d_sum = 0;
61 + if( !h->param.analyse.i_psy_rd)
62 + return;
63 + for( y = 0; y < 4; y++ )
64 + for( x = 0; x < 4; x++ )
65 + {
66 + fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
67 + h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
68 + - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
69 + satd_sum += h->mb.pic.fenc_satd[y][x];
70 + }
71 + for( y = 0; y < 2; y++ )
72 + for( x = 0; x < 2; x++ )
73 + {
74 + fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
75 + h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
76 + - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
77 + sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
78 + }
79 + h->mb.pic.fenc_satd_sum = satd_sum;
80 + h->mb.pic.fenc_sa8d_sum = sa8d_sum;
83 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
85 int i;
86 @@ -1016,12 +1044,15 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
87 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
89 h->mb.i_type = P_L0;
90 - if( a->b_mbrd && a->l0.me16x16.i_ref == 0
91 - && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
92 + if( a->b_mbrd )
94 - h->mb.i_partition = D_16x16;
95 - x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
96 - a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
97 + x264_mb_cache_fenc_satd( h );
98 + if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
99 + {
100 + h->mb.i_partition = D_16x16;
101 + x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
102 + a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
107 @@ -1906,7 +1937,7 @@ static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
109 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
111 - int thresh = i_satd_inter * 17/16;
112 + int thresh = i_satd_inter * (17 + (!!h->param.analyse.i_psy_rd))/16;
114 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
116 @@ -2067,7 +2098,10 @@ void x264_macroblock_analyse( x264_t *h )
118 x264_mb_analyse_intra( h, &analysis, COST_MAX );
119 if( analysis.b_mbrd )
121 + x264_mb_cache_fenc_satd( h );
122 x264_intra_rd( h, &analysis, COST_MAX );
125 i_cost = analysis.i_satd_i16x16;
126 h->mb.i_type = I_16x16;
127 @@ -2342,6 +2376,9 @@ void x264_macroblock_analyse( x264_t *h )
129 int i_bskip_cost = COST_MAX;
130 int b_skip = 0;
132 + if( analysis.b_mbrd )
133 + x264_mb_cache_fenc_satd( h );
135 h->mb.i_type = B_SKIP;
136 if( h->mb.b_direct_auto_write )
137 diff --git a/encoder/encoder.c b/encoder/encoder.c
138 index 2c2fe8c..ff5febe 100644
139 --- a/encoder/encoder.c
140 +++ b/encoder/encoder.c
141 @@ -411,6 +411,7 @@ static int x264_validate_parameters( x264_t *h )
142 h->param.analyse.b_fast_pskip = 0;
143 h->param.analyse.i_noise_reduction = 0;
144 h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 6 );
145 + h->param.analyse.f_psy_rd = 0;
147 if( h->param.rc.i_rc_method == X264_RC_CQP )
149 @@ -486,6 +487,16 @@ static int x264_validate_parameters( x264_t *h )
150 if( !h->param.b_cabac )
151 h->param.analyse.i_trellis = 0;
152 h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 );
153 + h->param.analyse.f_psy_rd = x264_clip3f( h->param.analyse.f_psy_rd, 0, 10 );
154 + if( h->param.analyse.i_subpel_refine < 6)
155 + h->param.analyse.f_psy_rd = 0;
156 + if( h->param.analyse.f_psy_rd && h->param.analyse.i_trellis == 1 )
158 + x264_log( h, X264_LOG_WARNING, "psy RD is not compatible with trellis=1; use 0 or 2.\n" );
159 + h->param.analyse.i_trellis = 0;
161 + if( h->param.analyse.f_psy_rd )
162 + h->param.analyse.i_psy_rd = FIX8( h->param.analyse.f_psy_rd );
163 h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 );
164 if( h->param.rc.f_aq_strength <= 0 )
165 h->param.rc.i_aq_mode = 0;
166 diff --git a/encoder/rdo.c b/encoder/rdo.c
167 index 76bf57b..7da862a 100644
168 --- a/encoder/rdo.c
169 +++ b/encoder/rdo.c
170 @@ -50,21 +50,82 @@ static uint16_t cabac_prefix_size[15][128];
172 #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
173 sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
175 -static int ssd_mb( x264_t *h )
177 +#define ADD_ABS_SATD(satdtype, pixel)\
178 + satd += abs((h->pixf.satdtype[pixel]( zero, 0, fdec, FDEC_STRIDE ) - dc_coef)\
179 + - sum_##satdtype( h, pixel, x, y ));
181 +/* Sum the cached SATDs to avoid repeating them. */
182 +static inline int sum_satd( x264_t *h, int pixel, int x, int y )
184 + int satd = 0;
185 + int min_x = x>>2;
186 + int min_y = y>>2;
187 + int max_x = (x>>2) + (x264_pixel_size[pixel].w>>2);
188 + int max_y = (y>>2) + (x264_pixel_size[pixel].h>>2);
189 + if( pixel == PIXEL_16x16 )
190 + return h->mb.pic.fenc_satd_sum;
191 + for( y = min_y; y < max_y; y++ )
192 + for( x = min_x; x < max_x; x++ )
193 + satd += h->mb.pic.fenc_satd[y][x];
194 + return satd;
197 +static inline int sum_sa8d( x264_t *h, int pixel, int x, int y )
199 + int sa8d = 0;
200 + int min_x = x>>3;
201 + int min_y = y>>3;
202 + int max_x = (x>>3) + (x264_pixel_size[pixel].w>>3);
203 + int max_y = (y>>3) + (x264_pixel_size[pixel].h>>3);
204 + if( pixel == PIXEL_16x16 )
205 + return h->mb.pic.fenc_sa8d_sum;
206 + for( y = min_y; y < max_y; y++ )
207 + for( x = min_x; x < max_x; x++ )
208 + sa8d += h->mb.pic.fenc_sa8d[y][x];
209 + return sa8d;
212 +/* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */
213 +/* SATD and SA8D are used to measure block complexity. */
214 +/* Blocks with a complexity most similar to that of the source are scored best. */
215 +/* The SATD and SA8D differences are both used, to avoid bias from the DCT size. Using SATD */
216 +/* only, for example, results in overusage of 8x8dct, while the opposite occurs when using SA8D. */
217 +/* This is because frequencies stored in an 8x8dct sum up to a larger value when viewed through a 4x4 */
218 +/* transform and vice versa with a 4x4dct and an 8x8 transform. */
219 +/* SSD is still used as the primary RD metric; this value is merely added to it for psy purposes. */
221 +/* FIXME: Is there a better metric than averaged SATD/SA8D difference for complexity difference? */
222 +/* Hadamard transform is recursive, so a SATD+SA8D can be done faster by taking advantage of this fact. */
223 +/* This optimization can also be used in non-RD transform decision. */
225 +static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
227 - return h->pixf.ssd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
228 - h->mb.pic.p_fdec[0], FDEC_STRIDE )
229 - + h->pixf.ssd[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE,
230 - h->mb.pic.p_fdec[1], FDEC_STRIDE )
231 - + h->pixf.ssd[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE,
232 - h->mb.pic.p_fdec[2], FDEC_STRIDE );
233 + DECLARE_ALIGNED_16(uint8_t zero[16]) = {0};
234 + int satd = 0;
235 + uint8_t *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
236 + uint8_t *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
237 + if( p == 0 && h->param.analyse.i_psy_rd )
239 + int dc_coef = h->pixf.sad[size](zero, 0, fdec, FDEC_STRIDE) >> 1;
240 + ADD_ABS_SATD(satd, size);
241 + /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
242 + if(size <= PIXEL_8x8)
244 + dc_coef >>= 1;
245 + ADD_ABS_SATD(sa8d, size);
246 + satd >>= 1;
248 + satd = (satd * h->param.analyse.i_psy_rd * x264_lambda_tab[h->mb.i_qp] + 128) >> 8;
250 + return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd;
253 -static int ssd_plane( x264_t *h, int size, int p, int x, int y )
254 +static inline int ssd_mb( x264_t *h )
256 - return h->pixf.ssd[size]( h->mb.pic.p_fenc[p] + x+y*FENC_STRIDE, FENC_STRIDE,
257 - h->mb.pic.p_fdec[p] + x+y*FDEC_STRIDE, FDEC_STRIDE );
258 + return ssd_plane(h, PIXEL_16x16, 0, 0, 0)
259 + + ssd_plane(h, PIXEL_8x8, 1, 0, 0)
260 + + ssd_plane(h, PIXEL_8x8, 2, 0, 0);
263 static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
264 diff --git a/x264.c b/x264.c
265 index 14466e5..96326cd 100644
266 --- a/x264.c
267 +++ b/x264.c
268 @@ -243,6 +243,9 @@ static void Help( x264_param_t *defaults, int b_longhelp )
269 H0( " -m, --subme <integer> Subpixel motion estimation and partition\n"
270 " decision quality: 1=fast, 7=best. [%d]\n", defaults->analyse.i_subpel_refine );
271 H0( " --b-rdo RD based mode decision for B-frames. Requires subme 6.\n" );
272 + H0( " --psy-rd Strength of mode decision psychovisual optimization [\"%f\"]\n"
273 + " Does nothing at subme < 6.",
274 + defaults->analyse.f_psy_rd );
275 H0( " --mixed-refs Decide references on a per partition basis\n" );
276 H1( " --no-chroma-me Ignore chroma in motion estimation\n" );
277 H1( " --bime Jointly optimize both MVs in B-frames\n" );
278 @@ -411,6 +414,7 @@ static int Parse( int argc, char **argv,
279 { "mvrange", required_argument, NULL, 0 },
280 { "mvrange-thread", required_argument, NULL, 0 },
281 { "subme", required_argument, NULL, 'm' },
282 + { "psy-rd", required_argument, NULL, 0 },
283 { "b-rdo", no_argument, NULL, 0 },
284 { "mixed-refs", no_argument, NULL, 0 },
285 { "no-chroma-me", no_argument, NULL, 0 },
286 diff --git a/x264.h b/x264.h
287 index 3b678dc..02266c1 100644
288 --- a/x264.h
289 +++ b/x264.h
290 @@ -239,6 +239,8 @@ typedef struct x264_param_t
291 int b_fast_pskip; /* early SKIP detection on P-frames */
292 int b_dct_decimate; /* transform coefficient thresholding on P-frames */
293 int i_noise_reduction; /* adaptive pseudo-deadzone */
294 + float f_psy_rd; /* Psy RD strength */
295 + int i_psy_rd; /* Psy RD strength--fixed point value*/
297 /* the deadzone size that will be used in luma quantization */
298 int i_luma_deadzone[2]; /* {inter, intra} */