media-libs/x264/files/.svn/text-base/x264-psyrd-0.6-psytrellis-0.1.diff.svn-base

   1 diff --git a/common/common.c b/common/common.c\r
   2 index 71a29b1..21155eb 100644\r
   3 --- a/common/common.c\r
   4 +++ b/common/common.c\r
   5 @@ -117,6 +117,7 @@ void    x264_param_default( x264_param_t *param )\r
   6                           | X264_ANALYSE_PSUB16x16 | X264_ANALYSE_BSUB16x16;\r
   7      param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;\r
   8      param->analyse.i_me_method = X264_ME_HEX;\r
   9 +    param->analyse.f_psy_rd = 1.0;\r
  10      param->analyse.i_me_range = 16;\r
  11      param->analyse.i_subpel_refine = 5;\r
  12      param->analyse.b_chroma_me = 1;\r
  13 @@ -464,6 +465,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )\r
  14          p->analyse.i_mv_range_thread = atoi(value);\r
  15      OPT2("subme", "subq")\r
  16          p->analyse.i_subpel_refine = atoi(value);\r
  17 +    OPT("psy-rd")\r
  18 +        p->analyse.f_psy_rd = atof(value);\r
  19      OPT("bime")\r
  20          p->analyse.b_bidir_me = atobool(value);\r
  21      OPT("chroma-me")\r
  22 @@ -856,6 +859,7 @@ char *x264_param2string( x264_param_t *p, int b_res )\r
  23      s += sprintf( s, " analyse=%#x:%#x", p->analyse.intra, p->analyse.inter );\r
  24      s += sprintf( s, " me=%s", x264_motion_est_names[ p->analyse.i_me_method ] );\r
  25      s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine );\r
  26 +    s += sprintf( s, " psy_rd=%.1f", p->analyse.f_psy_rd );\r
  27      s += sprintf( s, " brdo=%d", p->analyse.b_bframe_rdo );\r
  28      s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references );\r
  29      s += sprintf( s, " me_range=%d", p->analyse.i_me_range );\r
  30 diff --git a/common/common.h b/common/common.h\r
  31 index 80648a8..78cdb05 100644\r
  32 --- a/common/common.h\r
  33 +++ b/common/common.h\r
  34 @@ -342,8 +342,6 @@ struct x264_t\r
  35      x264_frame_t    *fref1[16+3];     /* ref list 1 */\r
  36      int             b_ref_reorder[2];\r
  37  \r
  38 -\r
  39 -\r
  40      /* Current MB DCT coeffs */\r
  41      struct\r
  42      {\r
  43 @@ -454,6 +452,16 @@ struct x264_t\r
  44              DECLARE_ALIGNED_16( int16_t i8x8_dct_buf[3][64] );\r
  45              DECLARE_ALIGNED_16( int16_t i4x4_dct_buf[15][16] );\r
  46  \r
  47 +            /* Psy trellis DCT data */\r
  48 +            DECLARE_ALIGNED_16( int16_t fenc_dct8[4][64] );\r
  49 +            DECLARE_ALIGNED_16( int16_t fenc_dct4[16][16] );\r
  50 +\r
  51 +            /* SATD scores for psy RD */\r
  52 +            int fenc_satd[4][4];\r
  53 +            int fenc_satd_sum;\r
  54 +            int fenc_sa8d[2][2];\r
  55 +            int fenc_sa8d_sum;\r
  56 +\r
  57              /* pointer over mb of the frame to be compressed */\r
  58              uint8_t *p_fenc[3];\r
  59  \r
  60 diff --git a/common/dct.h b/common/dct.h\r
  61 index 1078023..daa96f4 100644\r
  62 --- a/common/dct.h\r
  63 +++ b/common/dct.h\r
  64 @@ -41,6 +41,17 @@ static const uint16_t x264_dct8_weight_tab[64] = {\r
  65  };\r
  66  #undef W\r
  67  \r
  68 +#define W(i) (i==0 ? FIX8(1.76777) :\\r
  69 +              i==1 ? FIX8(1.11803) :\\r
  70 +              i==2 ? FIX8(0.70711) :0)\r
  71 +static const uint16_t x264_dct4_weight_tab[16] = {\r
  72 +    W(0), W(1), W(0), W(1),\r
  73 +    W(1), W(2), W(1), W(2),\r
  74 +    W(0), W(1), W(0), W(1),\r
  75 +    W(1), W(2), W(1), W(2)\r
  76 +};\r
  77 +#undef W\r
  78 +\r
  79  /* inverse squared */\r
  80  #define W(i) (i==0 ? FIX8(3.125) :\\r
  81                i==1 ? FIX8(1.25) :\\r
  82 diff --git a/encoder/analyse.c b/encoder/analyse.c\r
  83 index 270b90a..59cc89b 100644\r
  84 --- a/encoder/analyse.c\r
  85 +++ b/encoder/analyse.c\r
  86 @@ -467,6 +467,34 @@ static void predict_4x4_mode_available( unsigned int i_neighbour,\r
  87      }\r
  88  }\r
  89  \r
  90 +/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients; results go to h->mb.pic.fenc_satd/fenc_sa8d and their sums */\r
  91 +static inline void x264_mb_cache_fenc_satd( x264_t *h )\r
  92 +{\r
  93 +    DECLARE_ALIGNED_16(uint8_t zero[16]) = {0}; /* all-zero row, passed with stride 0 in the calls below */\r
  94 +    uint8_t *fenc;\r
  95 +    int x, y, satd_sum = 0, sa8d_sum = 0;\r
  96 +    if( !h->param.analyse.i_psy_rd ) /* psy RD strength is zero: leave the cache untouched */\r
  97 +        return;\r
  98 +    for( y = 0; y < 4; y++ ) /* 4x4 grid of 4x4 blocks in plane 0 of fenc */\r
  99 +        for( x = 0; x < 4; x++ )\r
 100 +        {\r
 101 +            fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;\r
 102 +            h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )\r
 103 +                                      - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1); /* half the SAD against zero stands in for the DC term being removed */\r
 104 +            satd_sum += h->mb.pic.fenc_satd[y][x];\r
 105 +        }\r
 106 +    for( y = 0; y < 2; y++ ) /* 2x2 grid of 8x8 blocks in the same plane */\r
 107 +        for( x = 0; x < 2; x++ )\r
 108 +        {\r
 109 +            fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;\r
 110 +            h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )\r
 111 +                                      - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2); /* same DC removal, here with a quarter of the SAD */\r
 112 +            sa8d_sum += h->mb.pic.fenc_sa8d[y][x];\r
 113 +        }\r
 114 +    h->mb.pic.fenc_satd_sum = satd_sum; /* total of the 16 per-4x4 scores */\r
 115 +    h->mb.pic.fenc_sa8d_sum = sa8d_sum; /* total of the 4 per-8x8 scores */\r
 116 +}\r
 117 +\r
 118  static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )\r
 119  {\r
 120      int i;\r
 121 @@ -1016,12 +1044,15 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )\r
 122      assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );\r
 123  \r
 124      h->mb.i_type = P_L0;\r
 125 -    if( a->b_mbrd && a->l0.me16x16.i_ref == 0\r
 126 -        && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )\r
 127 +    if( a->b_mbrd )\r
 128      {\r
 129 -        h->mb.i_partition = D_16x16;\r
 130 -        x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );\r
 131 -        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );\r
 132 +        x264_mb_cache_fenc_satd( h );\r
 133 +        if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )\r
 134 +        {\r
 135 +            h->mb.i_partition = D_16x16;\r
 136 +            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );\r
 137 +            a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );\r
 138 +        }\r
 139      }\r
 140  }\r
 141  \r
 142 @@ -1906,7 +1937,7 @@ static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )\r
 143  \r
 144  static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )\r
 145  {\r
 146 -    int thresh = i_satd_inter * 17/16;\r
 147 +    int thresh = i_satd_inter * (17 + (!!h->param.analyse.i_psy_rd))/16;\r
 148  \r
 149      if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )\r
 150      {\r
 151 @@ -2046,6 +2077,21 @@ static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *\r
 152      }\r
 153  }\r
 154  \r
 155 +void x264_psy_trellis_init( x264_t *h ) /* caches zigzag-scanned 8x8 and 4x4 DCTs of fenc plane 0 in h->mb.pic.fenc_dct8/fenc_dct4 */\r
 156 +{\r
 157 +    DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );\r
 158 +    DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );\r
 159 +    DECLARE_ALIGNED_16( uint8_t zero[16*FDEC_STRIDE] ) = {0}; /* all-zero block handed to the sub16x16 DCTs as their last argument */\r
 160 +    int i;\r
 161 +    \r
 162 +    /* For psy trellis: do DCT on input data */\r
 163 +    h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], zero ); /* presumably source minus zero, i.e. the DCT of the source pixels themselves -- confirm against dctf */\r
 164 +    for( i = 0; i < 4; i++ ) /* four 8x8 blocks */\r
 165 +        h->zigzagf.scan_8x8( h->mb.pic.fenc_dct8[i], dct8x8[i] );\r
 166 +    h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], zero );\r
 167 +    for( i = 0; i < 16; i++ ) /* sixteen 4x4 blocks */\r
 168 +        h->zigzagf.scan_4x4( h->mb.pic.fenc_dct4[i], dct4x4[i] );\r
 169 +}\r
 170  \r
 171  /*****************************************************************************\r
 172   * x264_macroblock_analyse:\r
 173 @@ -2062,12 +2108,18 @@ void x264_macroblock_analyse( x264_t *h )\r
 174  \r
 175      x264_mb_analyse_init( h, &analysis, h->mb.i_qp );\r
 176  \r
 177 +    if( h->param.analyse.i_psy_rd && h->param.analyse.i_trellis )\r
 178 +        x264_psy_trellis_init( h );\r
 179 +\r
 180      /*--------------------------- Do the analysis ---------------------------*/\r
 181      if( h->sh.i_type == SLICE_TYPE_I )\r
 182      {\r
 183          x264_mb_analyse_intra( h, &analysis, COST_MAX );\r
 184          if( analysis.b_mbrd )\r
 185 +        {\r
 186 +            x264_mb_cache_fenc_satd( h );\r
 187              x264_intra_rd( h, &analysis, COST_MAX );\r
 188 +        }\r
 189  \r
 190          i_cost = analysis.i_satd_i16x16;\r
 191          h->mb.i_type = I_16x16;\r
 192 @@ -2342,6 +2394,9 @@ void x264_macroblock_analyse( x264_t *h )\r
 193      {\r
 194          int i_bskip_cost = COST_MAX;\r
 195          int b_skip = 0;\r
 196 +        \r
 197 +        if( analysis.b_mbrd )\r
 198 +            x264_mb_cache_fenc_satd( h );\r
 199  \r
 200          h->mb.i_type = B_SKIP;\r
 201          if( h->mb.b_direct_auto_write )\r
 202 diff --git a/encoder/encoder.c b/encoder/encoder.c\r
 203 index 8f1ebac..b1d5f7b 100644\r
 204 --- a/encoder/encoder.c\r
 205 +++ b/encoder/encoder.c\r
 206 @@ -411,5 +411,6 @@ static int x264_validate_parameters( x264_t *h )\r
 207          h->param.analyse.b_fast_pskip = 0;\r
 208          h->param.analyse.i_noise_reduction = 0;\r
 209 +        h->param.analyse.f_psy_rd = 0;\r
 210      }\r
 211      if( h->param.rc.i_rc_method == X264_RC_CQP )\r
 212      {\r
 213 @@ -482,10 +483,23 @@ static int x264_validate_parameters( x264_t *h )\r
 214          h->param.analyse.inter &= ~X264_ANALYSE_I8x8;\r
 215          h->param.analyse.intra &= ~X264_ANALYSE_I8x8;\r
 216      }\r
 217 -    h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);\r
 218      if( !h->param.b_cabac )\r
 219          h->param.analyse.i_trellis = 0;\r
 220      h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 );\r
 221 +    h->param.analyse.f_psy_rd = x264_clip3f( h->param.analyse.f_psy_rd, 0, 10 );\r
 222 +    if( h->param.analyse.i_subpel_refine < 6)\r
 223 +        h->param.analyse.f_psy_rd = 0;\r
 224 +    if( h->param.analyse.f_psy_rd )\r
 225 +    {\r
 226 +        h->param.analyse.i_psy_rd = FIX8( h->param.analyse.f_psy_rd );\r
 227 +        /* Psy RDO increases overall quantizers to improve the quality of luma--this indirectly hurts chroma quality */\r
 228 +        /* so we lower the chroma QP offset to compensate */\r
 229 +        h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_rd < 0.25 ? 1 : 2;\r
 230 +        /* Psy trellis has a similar effect. */\r
 231 +        if( h->param.analyse.i_trellis )\r
 232 +            h->param.analyse.i_chroma_qp_offset -= 2;\r
 233 +    }\r
 234 +    h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);\r
 235      h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 );\r
 236      if( h->param.rc.f_aq_strength <= 0 )\r
 237          h->param.rc.i_aq_mode = 0;\r
 238 diff --git a/encoder/macroblock.c b/encoder/macroblock.c\r
 239 index 788a8ea..b429c16 100644\r
 240 --- a/encoder/macroblock.c\r
 241 +++ b/encoder/macroblock.c\r
 242 @@ -94,7 +94,7 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )\r
 243      h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );\r
 244  \r
 245      if( h->mb.b_trellis )\r
 246 -        x264_quant_4x4_trellis( h, dct4x4, CQM_4IY, i_qscale, DCT_LUMA_4x4, 1 );\r
 247 +        x264_quant_4x4_trellis( h, dct4x4, CQM_4IY, i_qscale, DCT_LUMA_4x4, 1, idx );\r
 248      else\r
 249          h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );\r
 250  \r
 251 @@ -121,7 +121,7 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )\r
 252      h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );\r
 253  \r
 254      if( h->mb.b_trellis )\r
 255 -        x264_quant_8x8_trellis( h, dct8x8, CQM_8IY, i_qscale, 1 );\r
 256 +        x264_quant_8x8_trellis( h, dct8x8, CQM_8IY, i_qscale, 1, idx );\r
 257      else \r
 258          h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8IY][i_qscale], h->quant8_bias[CQM_8IY][i_qscale] );\r
 259  \r
 260 @@ -163,7 +163,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )\r
 261  \r
 262          /* quant/scan/dequant */\r
 263          if( h->mb.b_trellis )\r
 264 -            x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 );\r
 265 +            x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1, i );\r
 266          else\r
 267              h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );\r
 268  \r
 269 @@ -447,7 +447,7 @@ void x264_macroblock_encode( x264_t *h )\r
 270                  if( h->mb.b_noise_reduction )\r
 271                      h->quantf.denoise_dct_core( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );\r
 272                  if( h->mb.b_trellis )\r
 273 -                    x264_quant_8x8_trellis( h, dct8x8[idx], CQM_8PY, i_qp, 0 );\r
 274 +                    x264_quant_8x8_trellis( h, dct8x8[idx], CQM_8PY, i_qp, 0, idx );\r
 275                  else\r
 276                      h->quantf.quant_8x8( dct8x8[idx], h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] );\r
 277  \r
 278 @@ -495,7 +495,7 @@ void x264_macroblock_encode( x264_t *h )\r
 279                      if( h->mb.b_noise_reduction )\r
 280                          h->quantf.denoise_dct_core( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );\r
 281                      if( h->mb.b_trellis )\r
 282 -                        x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, DCT_LUMA_4x4, 0 );\r
 283 +                        x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, DCT_LUMA_4x4, 0, idx );\r
 284                      else\r
 285                          h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );\r
 286  \r
 287 diff --git a/encoder/macroblock.h b/encoder/macroblock.h\r
 288 index 49d13a2..eb97d44 100644\r
 289 --- a/encoder/macroblock.h\r
 290 +++ b/encoder/macroblock.h\r
 291 @@ -50,9 +50,9 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale );\r
 292  void x264_cabac_mb_skip( x264_t *h, int b_skip );\r
 293  \r
 294  void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,\r
 295 -                             int i_qp, int i_ctxBlockCat, int b_intra );\r
 296 +                             int i_qp, int i_ctxBlockCat, int b_intra, int idx );\r
 297  void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,\r
 298 -                             int i_qp, int b_intra );\r
 299 +                             int i_qp, int b_intra, int idx );\r
 300  \r
 301  void x264_noise_reduction_update( x264_t *h );\r
 302  \r
 303 diff --git a/encoder/rdo.c b/encoder/rdo.c\r
 304 index 76bf57b..5b23e18 100644\r
 305 --- a/encoder/rdo.c\r
 306 +++ b/encoder/rdo.c\r
 307 @@ -50,21 +50,82 @@ static uint16_t cabac_prefix_size[15][128];\r
 308  \r
 309  #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \\r
 310          sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )\r
 311 -    \r
 312 -static int ssd_mb( x264_t *h )\r
 313 +\r
 314 +#define ADD_ABS_SATD(satdtype, pixel)\\r
 315 +    satd += abs((h->pixf.satdtype[pixel]( zero, 0, fdec, FDEC_STRIDE ) - dc_coef)\\r
 316 +          - sum_##satdtype( h, pixel, x, y ));\r
 317 +\r
 318 +/* Sum the cached SATDs to avoid repeating them.  x/y are pixel offsets; pixel indexes x264_pixel_size for the width/height covered. */\r
 319 +static inline int sum_satd( x264_t *h, int pixel, int x, int y )\r
 320 +{\r
 321 +    int satd = 0;\r
 322 +    int min_x = x>>2; /* pixel offset -> index into the 4x4-block cache */\r
 323 +    int min_y = y>>2;\r
 324 +    int max_x = (x>>2) + (x264_pixel_size[pixel].w>>2); /* one past the last cache column covered */\r
 325 +    int max_y = (y>>2) + (x264_pixel_size[pixel].h>>2); /* one past the last cache row covered */\r
 326 +    if( pixel == PIXEL_16x16 ) /* full 16x16: use the total cached in fenc_satd_sum */\r
 327 +        return h->mb.pic.fenc_satd_sum;\r
 328 +    for( y = min_y; y < max_y; y++ ) /* x and y are reused as cache indices from here on */\r
 329 +        for( x = min_x; x < max_x; x++ )\r
 330 +            satd += h->mb.pic.fenc_satd[y][x];\r
 331 +    return satd;\r
 332 +}\r
 333 +\r
 334 +static inline int sum_sa8d( x264_t *h, int pixel, int x, int y ) /* sums the cached per-8x8 SA8D scores covered by the given size at pixel offset x/y */\r
 335 +{\r
 336 +    int sa8d = 0;\r
 337 +    int min_x = x>>3; /* pixel offset -> index into the 8x8-block cache */\r
 338 +    int min_y = y>>3;\r
 339 +    int max_x = (x>>3) + (x264_pixel_size[pixel].w>>3); /* one past the last cache column covered */\r
 340 +    int max_y = (y>>3) + (x264_pixel_size[pixel].h>>3); /* one past the last cache row covered */\r
 341 +    if( pixel == PIXEL_16x16 ) /* full 16x16: use the total cached in fenc_sa8d_sum */\r
 342 +        return h->mb.pic.fenc_sa8d_sum;\r
 343 +    for( y = min_y; y < max_y; y++ ) /* x and y are reused as cache indices from here on */\r
 344 +        for( x = min_x; x < max_x; x++ )\r
 345 +            sa8d += h->mb.pic.fenc_sa8d[y][x];\r
 346 +    return sa8d;\r
 347 +}\r
 348 +\r
 349 +/* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */\r
 350 +/* SATD and SA8D are used to measure block complexity. */\r
 351 +/* Blocks with a complexity most similar to that of the source are scored best. */\r
 352 +/* Both the SATD difference and the SA8D difference are used, to avoid bias from the DCT size.  Using SATD */\r
 353 +/* only, for example, results in overuse of 8x8dct, while the opposite occurs when using SA8D. */\r
 354 +/* This is because frequencies stored in an 8x8dct sum up to a larger value when viewed through a 4x4 */\r
 355 +/* transform and vice versa with a 4x4dct and an 8x8 transform. */\r
 356 +/* SSD is still used as the primary RD metric; this value is merely added to it for psy purposes. */\r
 357 +\r
 358 +/* FIXME:  Is there a better metric than averaged SATD/SA8D difference for complexity difference? */\r
 359 +/* Hadamard transform is recursive, so a SATD+SA8D can be done faster by taking advantage of this fact. */\r
 360 +/* This optimization can also be used in non-RD transform decision. */\r
 361 +\r
 362 +static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )\r
 363  {\r
 364 -    return h->pixf.ssd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,\r
 365 -                                     h->mb.pic.p_fdec[0], FDEC_STRIDE )\r
 366 -         + h->pixf.ssd[PIXEL_8x8](   h->mb.pic.p_fenc[1], FENC_STRIDE,\r
 367 -                                     h->mb.pic.p_fdec[1], FDEC_STRIDE )\r
 368 -         + h->pixf.ssd[PIXEL_8x8](   h->mb.pic.p_fenc[2], FENC_STRIDE,\r
 369 -                                     h->mb.pic.p_fdec[2], FDEC_STRIDE );\r
 370 +    DECLARE_ALIGNED_16(uint8_t zero[16]) = {0};\r
 371 +    int satd = 0;\r
 372 +    uint8_t *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;\r
 373 +    uint8_t *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;\r
 374 +    if( p == 0 && h->param.analyse.i_psy_rd )\r
 375 +    {\r
 376 +        int dc_coef = h->pixf.sad[size](zero, 0, fdec, FDEC_STRIDE) >> 1;\r
 377 +        ADD_ABS_SATD(satd, size);\r
 378 +        /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */\r
 379 +        if(size <= PIXEL_8x8)\r
 380 +        {\r
 381 +            dc_coef >>= 1;\r
 382 +            ADD_ABS_SATD(sa8d, size);\r
 383 +            satd >>= 1;\r
 384 +        }\r
 385 +        satd = (satd * h->param.analyse.i_psy_rd * x264_lambda_tab[h->mb.i_qp] + 128) >> 8;\r
 386 +    }\r
 387 +    return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd;\r
 388  }\r
 389  \r
 390 -static int ssd_plane( x264_t *h, int size, int p, int x, int y )\r
 391 +static inline int ssd_mb( x264_t *h )\r
 392  {\r
 393 -    return h->pixf.ssd[size]( h->mb.pic.p_fenc[p] + x+y*FENC_STRIDE, FENC_STRIDE,\r
 394 -                              h->mb.pic.p_fdec[p] + x+y*FDEC_STRIDE, FDEC_STRIDE );\r
 395 +    return ssd_plane(h, PIXEL_16x16, 0, 0, 0)\r
 396 +         + ssd_plane(h, PIXEL_8x8,   1, 0, 0)\r
 397 +         + ssd_plane(h, PIXEL_8x8,   2, 0, 0);\r
 398  }\r
 399  \r
 400  static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )\r
 401 @@ -267,7 +328,7 @@ static const int lambda2_tab[2][52] = {\r
 402  };\r
 403  \r
 404  typedef struct {\r
 405 -    uint64_t score;\r
 406 +    int64_t score;\r
 407      int level_idx; // index into level_tree[]\r
 408      uint8_t cabac_state[10]; //just the contexts relevant to coding abs_level_m1\r
 409  } trellis_node_t;\r
 410 @@ -296,7 +357,7 @@ typedef struct {\r
 411  static inline void quant_trellis_cabac( x264_t *h, int16_t *dct,\r
 412                                   const uint16_t *quant_mf, const int *unquant_mf,\r
 413                                   const int *coef_weight, const uint8_t *zigzag,\r
 414 -                                 int i_ctxBlockCat, int i_lambda2, int b_ac, int i_coefs )\r
 415 +                                 int i_ctxBlockCat, int i_lambda2, int b_ac, int i_coefs, int idx )\r
 416  {\r
 417      int abs_coefs[64], signs[64];\r
 418      trellis_node_t nodes[2][8];\r
 419 @@ -428,8 +489,19 @@ static inline void quant_trellis_cabac( x264_t *h, int16_t *dct,\r
 420          // that are better left coded, especially at QP > 40.\r
 421          for( abs_level = q; abs_level >= q-1; abs_level-- )\r
 422          {\r
 423 -            int d = i_coef - ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);\r
 424 -            uint64_t ssd = (int64_t)d*d * coef_weight[i];\r
 425 +            int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);\r
 426 +            int d = i_coef - unquant_abs_level;\r
 427 +            int64_t ssd;\r
 428 +            if( h->param.analyse.i_psy_rd && i != 0 )\r
 429 +            {\r
 430 +                int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][i] : h->mb.pic.fenc_dct4[idx][i];\r
 431 +                int predicted_coef = abs(orig_coef - i_coef * signs[i] );\r
 432 +                int psy_value = h->param.analyse.i_psy_rd * abs(predicted_coef + unquant_abs_level * signs[i]);\r
 433 +                int psy_weight = (i_coefs == 64) ? x264_dct8_weight_tab[zigzag[i]] : x264_dct4_weight_tab[zigzag[i]];\r
 434 +                ssd = (int64_t)d*d * coef_weight[i] - psy_weight * (psy_value>>2);\r
 435 +            }\r
 436 +            else\r
 437 +                ssd = (int64_t)d*d * coef_weight[i];\r
 438  \r
 439              for( j = 0; j < 8; j++ )\r
 440              {\r
 441 @@ -493,24 +565,24 @@ static inline void quant_trellis_cabac( x264_t *h, int16_t *dct,\r
 442  \r
 443  \r
 444  void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,\r
 445 -                             int i_qp, int i_ctxBlockCat, int b_intra )\r
 446 +                             int i_qp, int i_ctxBlockCat, int b_intra, int idx )\r
 447  {\r
 448      int b_ac = (i_ctxBlockCat == DCT_LUMA_AC);\r
 449      quant_trellis_cabac( h, (int16_t*)dct,\r
 450          h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],\r
 451          x264_dct4_weight2_zigzag[h->mb.b_interlaced],\r
 452          x264_zigzag_scan4[h->mb.b_interlaced],\r
 453 -        i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 16 );\r
 454 +        i_ctxBlockCat, lambda2_tab[b_intra][h->mb.i_qp], b_ac, 16, idx );\r
 455  }\r
 456  \r
 457  \r
 458  void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,\r
 459 -                             int i_qp, int b_intra )\r
 460 +                             int i_qp, int b_intra, int idx )\r
 461  {\r
 462      quant_trellis_cabac( h, (int16_t*)dct,\r
 463          h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],\r
 464          x264_dct8_weight2_zigzag[h->mb.b_interlaced],\r
 465          x264_zigzag_scan8[h->mb.b_interlaced],\r
 466 -        DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 64 );\r
 467 +        DCT_LUMA_8x8, lambda2_tab[b_intra][h->mb.i_qp], 0, 64, idx );\r
 468  }\r
 469  \r
 470 diff --git a/x264.c b/x264.c\r
 471 index 14466e5..504ca12 100644\r
 472 --- a/x264.c\r
 473 +++ b/x264.c\r
 474 @@ -243,6 +243,9 @@ static void Help( x264_param_t *defaults, int b_longhelp )\r
 475      H0( "  -m, --subme <integer>       Subpixel motion estimation and partition\n"\r
 476          "                                  decision quality: 1=fast, 7=best. [%d]\n", defaults->analyse.i_subpel_refine );\r
 477      H0( "      --b-rdo                 RD based mode decision for B-frames. Requires subme 6.\n" );\r
 478 +    H0( "      --psy-rd                Strength of mode decision psychovisual optimization [\"%.1f\"]\n"\r
 479 +        "                              Does nothing at subme < 6.\n",\r
 480 +                                       defaults->analyse.f_psy_rd );\r
 481      H0( "      --mixed-refs            Decide references on a per partition basis\n" );\r
 482      H1( "      --no-chroma-me          Ignore chroma in motion estimation\n" );\r
 483      H1( "      --bime                  Jointly optimize both MVs in B-frames\n" );\r
 484 @@ -411,6 +414,7 @@ static int  Parse( int argc, char **argv,\r
 485              { "mvrange", required_argument, NULL, 0 },\r
 486              { "mvrange-thread", required_argument, NULL, 0 },\r
 487              { "subme",   required_argument, NULL, 'm' },\r
 488 +            { "psy-rd",   required_argument, NULL, 0 },\r
 489              { "b-rdo",   no_argument,       NULL, 0 },\r
 490              { "mixed-refs", no_argument,    NULL, 0 },\r
 491              { "no-chroma-me", no_argument,  NULL, 0 },\r
 492 diff --git a/x264.h b/x264.h\r
 493 index 3b678dc..02266c1 100644\r
 494 --- a/x264.h\r
 495 +++ b/x264.h\r
 496 @@ -239,6 +239,8 @@ typedef struct x264_param_t\r
 497          int          b_fast_pskip; /* early SKIP detection on P-frames */\r
 498          int          b_dct_decimate; /* transform coefficient thresholding on P-frames */\r
 499          int          i_noise_reduction; /* adaptive pseudo-deadzone */\r
 500 +        float        f_psy_rd; /* Psy RD strength */\r
 501 +        int          i_psy_rd; /* Psy RD strength--fixed point value*/\r
 502  \r
 503          /* the deadzone size that will be used in luma quantization */\r
 504          int          i_luma_deadzone[2]; /* {inter, intra} */\r